acryl-datahub 0.15.0.3__py3-none-any.whl → 0.15.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- acryl_datahub-0.15.0.4.dist-info/LICENSE +202 -0
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4.dist-info}/METADATA +2417 -2414
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4.dist-info}/RECORD +36 -33
- datahub/__init__.py +1 -1
- datahub/cli/container_cli.py +108 -0
- datahub/emitter/enum_helpers.py +4 -2
- datahub/emitter/mce_builder.py +4 -0
- datahub/emitter/mcp_builder.py +19 -0
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/decorators.py +2 -0
- datahub/ingestion/api/registry.py +3 -1
- datahub/ingestion/api/sink.py +12 -0
- datahub/ingestion/api/source.py +5 -2
- datahub/ingestion/source/aws/glue.py +11 -5
- datahub/ingestion/source/aws/s3_util.py +1 -24
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -2
- datahub/ingestion/source/dbt/dbt_common.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +4 -4
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +24 -18
- datahub/ingestion/source/s3/source.py +6 -2
- datahub/ingestion/source/slack/slack.py +6 -0
- datahub/ingestion/source/sql/hive_metastore.py +3 -3
- datahub/ingestion/source/sql/mssql/job_models.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +26 -11
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +23 -10
- datahub/metadata/_schema_classes.py +401 -401
- datahub/metadata/_urns/urn_defs.py +1857 -1408
- datahub/metadata/schema.avsc +16624 -16266
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
- datahub/utilities/groupby.py +17 -0
- datahub/utilities/urns/_urn_base.py +6 -2
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4.dist-info}/top_level.txt +0 -0
@@ -71,13 +71,13 @@ class Workspace:
     id: str
     name: str
     type: str  # This is used as a subtype of the Container entity.
-    dashboards:
-    reports:
-    datasets: Dict[str, "PowerBIDataset"]
-    report_endorsements: Dict[str, List[str]]
-    dashboard_endorsements: Dict[str, List[str]]
+    dashboards: Dict[str, "Dashboard"]  # key = dashboard id
+    reports: Dict[str, "Report"]  # key = report id
+    datasets: Dict[str, "PowerBIDataset"]  # key = dataset id
+    report_endorsements: Dict[str, List[str]]  # key = report id
+    dashboard_endorsements: Dict[str, List[str]]  # key = dashboard id
     scan_result: dict
-    independent_datasets:
+    independent_datasets: Dict[str, "PowerBIDataset"]  # key = dataset id
     app: Optional["App"]

     def get_urn_part(self, workspace_id_as_urn_part: Optional[bool] = False) -> str:
@@ -193,15 +193,18 @@ class PowerBiAPI:
     def get_report_users(self, workspace_id: str, report_id: str) -> List[User]:
         return self._get_entity_users(workspace_id, Constant.REPORTS, report_id)

-    def get_reports(self, workspace: Workspace) ->
+    def get_reports(self, workspace: Workspace) -> Dict[str, Report]:
         """
         Fetch the report from PowerBi for the given Workspace
         """
-        reports:
+        reports: Dict[str, Report] = {}
         try:
-            reports =
+            reports = {
+                report.id: report
+                for report in self._get_resolver().get_reports(workspace)
+            }
             # Fill Report dataset
-            for report in reports:
+            for report in reports.values():
                 if report.dataset_id:
                     report.dataset = self.dataset_registry.get(report.dataset_id)
                     if report.dataset is None:
@@ -222,7 +225,7 @@ class PowerBiAPI:
                 )
                 return

-            for report in reports:
+            for report in reports.values():
                 report.users = self.get_report_users(
                     workspace_id=workspace.id, report_id=report.id
                 )
@@ -234,7 +237,7 @@ class PowerBiAPI:
                 )
                 return

-            for report in reports:
+            for report in reports.values():
                 report.tags = workspace.report_endorsements.get(report.id, [])

         fill_ownership()
@@ -270,12 +273,12 @@ class PowerBiAPI:
                 name=workspace[Constant.NAME],
                 type=workspace[Constant.TYPE],
                 datasets={},
-                dashboards=
-                reports=
+                dashboards={},
+                reports={},
                 report_endorsements={},
                 dashboard_endorsements={},
                 scan_result={},
-                independent_datasets=
+                independent_datasets={},
                 app=None,  # It will be populated in _fill_metadata_from_scan_result method
             )
             for workspace in groups
@@ -561,12 +564,12 @@ class PowerBiAPI:
            name=workspace_metadata[Constant.NAME],
            type=workspace_metadata[Constant.TYPE],
            datasets={},
-           dashboards=
-           reports=
+           dashboards={},
+           reports={},
            report_endorsements={},
            dashboard_endorsements={},
            scan_result={},
-           independent_datasets=
+           independent_datasets={},
            app=None,  # It is getting set from scan-result
        )
        cur_workspace.scan_result = workspace_metadata
@@ -597,25 +600,28 @@ class PowerBiAPI:
     def _fill_independent_datasets(self, workspace: Workspace) -> None:
         reachable_datasets: List[str] = []
         # Find out reachable datasets
-        for dashboard in workspace.dashboards:
+        for dashboard in workspace.dashboards.values():
             for tile in dashboard.tiles:
                 if tile.dataset is not None:
                     reachable_datasets.append(tile.dataset.id)

-        for report in workspace.reports:
+        for report in workspace.reports.values():
             if report.dataset is not None:
                 reachable_datasets.append(report.dataset.id)

         # Set datasets not present in reachable_datasets
         for dataset in workspace.datasets.values():
             if dataset.id not in reachable_datasets:
-                workspace.independent_datasets.
+                workspace.independent_datasets[dataset.id] = dataset

     def _fill_regular_metadata_detail(self, workspace: Workspace) -> None:
         def fill_dashboards() -> None:
-            workspace.dashboards =
+            workspace.dashboards = {
+                dashboard.id: dashboard
+                for dashboard in self._get_resolver().get_dashboards(workspace)
+            }
             # set tiles of Dashboard
-            for dashboard in workspace.dashboards:
+            for dashboard in workspace.dashboards.values():
                 dashboard.tiles = self._get_resolver().get_tiles(
                     workspace, dashboard=dashboard
                 )
@@ -644,7 +650,7 @@ class PowerBiAPI:
                     "Skipping tag retrieval for dashboard as extract_endorsements_to_tags is set to false"
                 )
                 return
-            for dashboard in workspace.dashboards:
+            for dashboard in workspace.dashboards.values():
                 dashboard.tags = workspace.dashboard_endorsements.get(dashboard.id, [])

         if self.__config.extract_dashboards:
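
Note: the PowerBI hunks above move the Workspace collections (dashboards, reports, independent_datasets) from sequences to dicts keyed by the entity id, which is why every loop switches to `.values()`. A minimal, hypothetical sketch of the pattern (stand-in classes, not the package's own code) showing what keying by id buys:

# Hypothetical illustration of the "collection keyed by id" pattern; Report and
# Workspace here are simplified stand-ins, not datahub's classes.
from dataclasses import dataclass, field
from typing import Dict


@dataclass
class Report:
    id: str
    name: str


@dataclass
class Workspace:
    reports: Dict[str, Report] = field(default_factory=dict)  # key = report id


ws = Workspace()
report = Report(id="r-1", name="Sales")
ws.reports[report.id] = report

# Lookups by id become O(1) dict access instead of a linear scan over a list...
assert ws.reports.get("r-1") is report
# ...and iteration goes through .values(), mirroring the updated loops above.
for r in ws.reports.values():
    print(r.name)
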
@@ -40,7 +40,6 @@ from datahub.ingestion.source.aws.s3_util import (
|
|
|
40
40
|
get_bucket_name,
|
|
41
41
|
get_bucket_relative_path,
|
|
42
42
|
get_key_prefix,
|
|
43
|
-
group_s3_objects_by_dirname,
|
|
44
43
|
strip_s3_prefix,
|
|
45
44
|
)
|
|
46
45
|
from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
|
|
@@ -73,6 +72,7 @@ from datahub.metadata.schema_classes import (
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
+from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer

 if TYPE_CHECKING:
@@ -868,7 +868,11 @@ class S3Source(StatefulIngestionSourceBase):
         """
         partitions: List[Folder] = []
         s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-
+        grouped_s3_objects_by_dirname = groupby_unsorted(
+            s3_objects,
+            key=lambda obj: obj.key.rsplit("/", 1)[0],
+        )
+        for key, group in grouped_s3_objects_by_dirname:
             file_size = 0
             creation_time = None
             modification_time = None
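
Note: the s3, hive_metastore, and teradata hunks in this release replace `itertools.groupby` (which only groups adjacent items and therefore assumes pre-sorted input) with the new `datahub.utilities.groupby.groupby_unsorted` helper added as datahub/utilities/groupby.py above. A minimal sketch of what such a helper could look like, assuming dict-based accumulation; this is an inference from the call sites, not the actual 17-line implementation:

# Hypothetical sketch of a groupby that does not require sorted input;
# not the real datahub/utilities/groupby.py code.
from collections import defaultdict
from typing import Callable, Dict, Iterable, Iterator, List, Tuple, TypeVar

T = TypeVar("T")
K = TypeVar("K")


def groupby_unsorted(
    iterable: Iterable[T], key: Callable[[T], K]
) -> Iterator[Tuple[K, List[T]]]:
    """Group items by key(item), preserving first-seen key order."""
    groups: Dict[K, List[T]] = defaultdict(list)
    for item in iterable:
        groups[key(item)].append(item)
    yield from groups.items()


# Unlike itertools.groupby, interleaved keys still end up in a single group each:
pairs = list(groupby_unsorted(["a1", "b1", "a2"], key=lambda s: s[0]))
assert pairs == [("a", ["a1", "a2"]), ("b", ["b1"])]
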
@@ -5,6 +5,8 @@ from typing import Iterable, List, Optional, Tuple

 from pydantic import Field, SecretStr
 from slack_sdk import WebClient
+from tenacity import retry, wait_exponential
+from tenacity.before_sleep import before_sleep_log

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigModel
@@ -294,6 +296,10 @@ class SlackSource(Source):
                 return
             raise e

+    @retry(
+        wait=wait_exponential(multiplier=2, min=4, max=60),
+        before_sleep=before_sleep_log(logger, logging.ERROR, True),
+    )
     def get_user_to_be_updated(self) -> Iterable[CorpUser]:
         graphql_query = textwrap.dedent(
             """
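
Note: the slack.py change wraps get_user_to_be_updated in a tenacity retry decorator with exponential backoff (waits bounded to the 4-60 second window) and error-level logging before each sleep. A small standalone sketch of the same pattern; flaky_call is a hypothetical stand-in, and since no stop= argument is given, tenacity keeps retrying until the call succeeds:

# Standalone sketch of the retry pattern added above; flaky_call is hypothetical.
import logging

from tenacity import retry, wait_exponential
from tenacity.before_sleep import before_sleep_log

logger = logging.getLogger(__name__)

attempts = 0


@retry(
    # Exponential backoff: waits grow roughly geometrically from the multiplier,
    # clamped to the [min, max] = [4, 60] second window.
    wait=wait_exponential(multiplier=2, min=4, max=60),
    # Log at ERROR level (with traceback) before sleeping between retries.
    before_sleep=before_sleep_log(logger, logging.ERROR, True),
)
def flaky_call() -> str:
    global attempts
    attempts += 1
    if attempts < 3:
        raise ConnectionError("transient failure")
    return "ok"


print(flaky_call())  # succeeds on the third attempt, after two backoff sleeps
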
@@ -2,7 +2,6 @@ import base64
 import json
 import logging
 from collections import namedtuple
-from itertools import groupby
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 from pydantic.dataclasses import dataclass
@@ -58,6 +57,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
     ViewPropertiesClass,
 )
+from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.str_enum import StrEnum

@@ -490,7 +490,7 @@ class HiveMetastoreSource(SQLAlchemySource):

         iter_res = self._alchemy_client.execute_query(statement)

-        for key, group in
+        for key, group in groupby_unsorted(iter_res, self._get_table_key):
             schema_name = (
                 f"{db_name}.{key.schema}"
                 if self.config.include_catalog_name_in_ids
@@ -647,7 +647,7 @@ class HiveMetastoreSource(SQLAlchemySource):
         )

         iter_res = self._alchemy_client.execute_query(statement)
-        for key, group in
+        for key, group in groupby_unsorted(iter_res, self._get_table_key):
             db_name = self.get_db_name(inspector)

             schema_name = (
@@ -156,7 +156,7 @@ class MSSQLDataJob:
     entity: Union[StoredProcedure, JobStep]
     type: str = "dataJob"
     source: str = "mssql"
-    external_url: str =
+    external_url: Optional[str] = None
     description: Optional[str] = None
     status: Optional[str] = None
     incoming: List[str] = field(default_factory=list)
@@ -228,7 +228,7 @@ class MSSQLDataFlow:
     entity: Union[MSSQLJob, MSSQLProceduresContainer]
     type: str = "dataFlow"
    source: str = "mssql"
-    external_url: str =
+    external_url: Optional[str] = None
     flow_properties: Dict[str, str] = field(default_factory=dict)

     def add_property(
@@ -11,6 +11,7 @@ from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.exc import ProgrammingError, ResourceClosedError

+import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -49,21 +50,15 @@ from datahub.ingestion.source.sql.sql_config import (
     make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.metadata.schema_classes import (
-    BooleanTypeClass,
-    NumberTypeClass,
-    StringTypeClass,
-    UnionTypeClass,
-)
 from datahub.utilities.file_backed_collections import FileBackedList

 logger: logging.Logger = logging.getLogger(__name__)

-register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.MONEY, NumberTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, NumberTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, StringTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.BIT, models.BooleanTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.MONEY, models.NumberTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)


 class SQLServerConfig(BasicSQLAlchemyConfig):
@@ -651,6 +646,26 @@ class SQLServerSource(SQLAlchemySource):
             entityUrn=data_job.urn,
             aspect=data_job.as_datajob_input_output_aspect,
         ).as_workunit()
+
+        if (
+            self.config.include_stored_procedures_code
+            and isinstance(data_job.entity, StoredProcedure)
+            and data_job.entity.code is not None
+        ):
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=models.DataTransformLogicClass(
+                    transforms=[
+                        models.DataTransformClass(
+                            queryStatement=models.QueryStatementClass(
+                                value=data_job.entity.code,
+                                language=models.QueryLanguageClass.SQL,
+                            ),
+                        )
+                    ]
+                ),
+            ).as_workunit()
+
         # TODO: Add SubType when it appear

     def construct_flow_workunits(
@@ -3,7 +3,6 @@ from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime
 from functools import lru_cache
-from itertools import groupby
 from typing import (
     Any,
     Dict,
@@ -59,6 +58,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.sql_parsing.schema_resolver import SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
+from datahub.utilities.groupby import groupby_unsorted

 logger: logging.Logger = logging.getLogger(__name__)

@@ -286,7 +286,7 @@ def optimized_get_foreign_keys(self, connection, table_name, schema=None, **kw):

     # TODO: Check if there's a better way
     fk_dicts = list()
-    for constraint_info, constraint_cols in
+    for constraint_info, constraint_cols in groupby_unsorted(res, grouper):
         fk_dict = {
             "name": str(constraint_info["name"]),
             "constrained_columns": list(),
@@ -1147,23 +1147,36 @@ class TableauSiteSource:
            )
            # Set parent project name
            for _project_id, project in all_project_map.items():
-               if
-
-
-
+               if project.parent_id is None:
+                   continue
+
+               if project.parent_id in all_project_map:
                    project.parent_name = all_project_map[project.parent_id].name
+               else:
+                   self.report.warning(
+                       title="Incomplete project hierarchy",
+                       message="Project details missing. Child projects will be ingested without reference to their parent project. We generally need Site Administrator Explorer permissions to extract the complete project hierarchy.",
+                       context=f"Missing {project.parent_id}, referenced by {project.id} {project.project_name}",
+                   )
+                   project.parent_id = None
+
+           # Post-condition
+           assert all(
+               [
+                   ((project.parent_id is None) == (project.parent_name is None))
+                   and (
+                       project.parent_id is None
+                       or project.parent_id in all_project_map
+                   )
+                   for project in all_project_map.values()
+               ]
+           ), "Parent project id and name should be consistent"

        def set_project_path():
            def form_path(project_id: str) -> List[str]:
                cur_proj = all_project_map[project_id]
                ancestors = [cur_proj.name]
                while cur_proj.parent_id is not None:
-                   if cur_proj.parent_id not in all_project_map:
-                       self.report.warning(
-                           "project-issue",
-                           f"Parent project {cur_proj.parent_id} not found. We need Site Administrator Explorer permissions.",
-                       )
-                       break
                    cur_proj = all_project_map[cur_proj.parent_id]
                    ancestors = [cur_proj.name, *ancestors]
                return ancestors