acryl-datahub 1.2.0.3rc2__py3-none-any.whl → 1.2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/METADATA +2665 -2664
- {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/RECORD +68 -67
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +3 -3
- datahub/api/entities/external/external_tag.py +6 -4
- datahub/api/entities/external/lake_formation_external_entites.py +50 -49
- datahub/api/entities/external/restricted_text.py +105 -180
- datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/quickstart_versioning.py +1 -1
- datahub/cli/specific/assertions_cli.py +37 -2
- datahub/cli/specific/datacontract_cli.py +54 -4
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
- datahub/ingestion/api/report.py +21 -2
- datahub/ingestion/api/source.py +81 -7
- datahub/ingestion/autogenerated/capability_summary.json +47 -19
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +9 -0
- datahub/ingestion/source/aws/glue.py +18 -2
- datahub/ingestion/source/aws/tag_entities.py +4 -4
- datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
- datahub/ingestion/source/delta_lake/source.py +8 -1
- datahub/ingestion/source/dremio/dremio_source.py +19 -2
- datahub/ingestion/source/fivetran/fivetran.py +9 -3
- datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
- datahub/ingestion/source/ge_data_profiler.py +8 -0
- datahub/ingestion/source/grafana/models.py +6 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +4 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
- datahub/ingestion/source/powerbi/powerbi.py +4 -1
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/salesforce.py +8 -0
- datahub/ingestion/source/slack/slack.py +7 -14
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
- datahub/ingestion/source/sql/hive_metastore.py +8 -0
- datahub/ingestion/source/sql/teradata.py +8 -1
- datahub/ingestion/source/sql/trino.py +9 -0
- datahub/ingestion/source/tableau/tableau.py +1 -1
- datahub/ingestion/source/unity/config.py +36 -1
- datahub/ingestion/source/unity/proxy.py +332 -46
- datahub/ingestion/source/unity/proxy_types.py +12 -2
- datahub/ingestion/source/unity/source.py +91 -34
- datahub/ingestion/source/unity/tag_entities.py +5 -5
- datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/metadata/_internal_schema_classes.py +513 -513
- datahub/metadata/_urns/urn_defs.py +1684 -1684
- datahub/metadata/schema.avsc +16745 -16348
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/sdk/entity_client.py +22 -7
- datahub/sdk/search_client.py +3 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataset.py +37 -59
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/server_config_util.py +2 -1
- {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/top_level.txt +0 -0
|
@@ -7,12 +7,14 @@ from urllib.parse import urljoin
|
|
|
7
7
|
from datahub.api.entities.external.external_entities import PlatformResourceRepository
|
|
8
8
|
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
9
9
|
from datahub.emitter.mce_builder import (
|
|
10
|
+
UNKNOWN_USER,
|
|
10
11
|
make_data_platform_urn,
|
|
11
12
|
make_dataplatform_instance_urn,
|
|
12
13
|
make_dataset_urn_with_platform_instance,
|
|
13
14
|
make_domain_urn,
|
|
14
15
|
make_group_urn,
|
|
15
16
|
make_schema_field_urn,
|
|
17
|
+
make_ts_millis,
|
|
16
18
|
make_user_urn,
|
|
17
19
|
)
|
|
18
20
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
@@ -111,6 +113,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
|
111
113
|
ViewProperties,
|
|
112
114
|
)
|
|
113
115
|
from datahub.metadata.schema_classes import (
|
|
116
|
+
AuditStampClass,
|
|
114
117
|
BrowsePathsClass,
|
|
115
118
|
DataPlatformInstanceClass,
|
|
116
119
|
DatasetLineageTypeClass,
|
|
@@ -203,6 +206,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
203
206
|
config.warehouse_id,
|
|
204
207
|
report=self.report,
|
|
205
208
|
hive_metastore_proxy=self.hive_metastore_proxy,
|
|
209
|
+
lineage_data_source=config.lineage_data_source,
|
|
206
210
|
)
|
|
207
211
|
|
|
208
212
|
self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
|
|
@@ -410,12 +414,12 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
410
414
|
self.config.workspace_url, f"#notebook/{notebook.id}"
|
|
411
415
|
),
|
|
412
416
|
created=(
|
|
413
|
-
TimeStampClass(
|
|
417
|
+
TimeStampClass(make_ts_millis(notebook.created_at))
|
|
414
418
|
if notebook.created_at
|
|
415
419
|
else None
|
|
416
420
|
),
|
|
417
421
|
lastModified=(
|
|
418
|
-
TimeStampClass(
|
|
422
|
+
TimeStampClass(make_ts_millis(notebook.modified_at))
|
|
419
423
|
if notebook.modified_at
|
|
420
424
|
else None
|
|
421
425
|
),
|
|
@@ -434,17 +438,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
434
438
|
if not notebook.upstreams:
|
|
435
439
|
return None
|
|
436
440
|
|
|
441
|
+
upstreams = []
|
|
442
|
+
for upstream_ref in notebook.upstreams:
|
|
443
|
+
timestamp = make_ts_millis(upstream_ref.last_updated)
|
|
444
|
+
upstreams.append(
|
|
445
|
+
self._create_upstream_class(
|
|
446
|
+
self.gen_dataset_urn(upstream_ref),
|
|
447
|
+
DatasetLineageTypeClass.COPY,
|
|
448
|
+
timestamp,
|
|
449
|
+
)
|
|
450
|
+
)
|
|
451
|
+
|
|
437
452
|
return MetadataChangeProposalWrapper(
|
|
438
453
|
entityUrn=self.gen_notebook_urn(notebook),
|
|
439
|
-
aspect=UpstreamLineageClass(
|
|
440
|
-
upstreams=[
|
|
441
|
-
UpstreamClass(
|
|
442
|
-
dataset=self.gen_dataset_urn(upstream_ref),
|
|
443
|
-
type=DatasetLineageTypeClass.COPY,
|
|
444
|
-
)
|
|
445
|
-
for upstream_ref in notebook.upstreams
|
|
446
|
-
]
|
|
447
|
-
),
|
|
454
|
+
aspect=UpstreamLineageClass(upstreams=upstreams),
|
|
448
455
|
).as_workunit()
|
|
449
456
|
|
|
450
457
|
def process_metastores(self) -> Iterable[MetadataWorkUnit]:
|
|
@@ -463,14 +470,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
463
470
|
self, metastore: Optional[Metastore]
|
|
464
471
|
) -> Iterable[MetadataWorkUnit]:
|
|
465
472
|
for catalog in self._get_catalogs(metastore):
|
|
466
|
-
|
|
467
|
-
self.
|
|
468
|
-
|
|
473
|
+
with self.report.new_stage(f"Ingest catalog {catalog.id}"):
|
|
474
|
+
if not self.config.catalog_pattern.allowed(catalog.id):
|
|
475
|
+
self.report.catalogs.dropped(catalog.id)
|
|
476
|
+
continue
|
|
469
477
|
|
|
470
|
-
|
|
471
|
-
|
|
478
|
+
yield from self.gen_catalog_containers(catalog)
|
|
479
|
+
yield from self.process_schemas(catalog)
|
|
472
480
|
|
|
473
|
-
|
|
481
|
+
self.report.catalogs.processed(catalog.id)
|
|
474
482
|
|
|
475
483
|
def _get_catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]:
|
|
476
484
|
if self.config.catalogs:
|
|
@@ -647,9 +655,21 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
647
655
|
]
|
|
648
656
|
|
|
649
657
|
def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]:
|
|
658
|
+
# Calculate datetime filters for lineage
|
|
659
|
+
lineage_start_time = None
|
|
660
|
+
lineage_end_time = self.config.end_time
|
|
661
|
+
|
|
662
|
+
if self.config.ignore_start_time_lineage:
|
|
663
|
+
lineage_start_time = None # Ignore start time to get all lineage
|
|
664
|
+
else:
|
|
665
|
+
lineage_start_time = self.config.start_time
|
|
666
|
+
|
|
650
667
|
if self.config.include_table_lineage:
|
|
651
668
|
self.unity_catalog_api_proxy.table_lineage(
|
|
652
|
-
table,
|
|
669
|
+
table,
|
|
670
|
+
include_entity_lineage=self.config.include_notebooks,
|
|
671
|
+
start_time=lineage_start_time,
|
|
672
|
+
end_time=lineage_end_time,
|
|
653
673
|
)
|
|
654
674
|
|
|
655
675
|
if self.config.include_column_lineage and table.upstreams:
|
|
@@ -661,7 +681,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
661
681
|
for column in table.columns[: self.config.column_lineage_column_limit]
|
|
662
682
|
]
|
|
663
683
|
self.unity_catalog_api_proxy.get_column_lineage(
|
|
664
|
-
table,
|
|
684
|
+
table,
|
|
685
|
+
column_names,
|
|
686
|
+
max_workers=self.config.lineage_max_workers,
|
|
687
|
+
start_time=lineage_start_time,
|
|
688
|
+
end_time=lineage_end_time,
|
|
665
689
|
)
|
|
666
690
|
|
|
667
691
|
return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table)
|
|
@@ -690,18 +714,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
690
714
|
for d_col, u_cols in sorted(downstream_to_upstream_cols.items())
|
|
691
715
|
)
|
|
692
716
|
|
|
717
|
+
timestamp = make_ts_millis(upstream_ref.last_updated)
|
|
693
718
|
upstreams.append(
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
719
|
+
self._create_upstream_class(
|
|
720
|
+
upstream_urn,
|
|
721
|
+
DatasetLineageTypeClass.TRANSFORMED,
|
|
722
|
+
timestamp,
|
|
697
723
|
)
|
|
698
724
|
)
|
|
699
725
|
|
|
700
|
-
for notebook in table.upstream_notebooks:
|
|
726
|
+
for notebook in table.upstream_notebooks.values():
|
|
727
|
+
timestamp = make_ts_millis(notebook.last_updated)
|
|
701
728
|
upstreams.append(
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
729
|
+
self._create_upstream_class(
|
|
730
|
+
self.gen_notebook_urn(notebook.id),
|
|
731
|
+
DatasetLineageTypeClass.TRANSFORMED,
|
|
732
|
+
timestamp,
|
|
705
733
|
)
|
|
706
734
|
)
|
|
707
735
|
|
|
@@ -771,6 +799,31 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
771
799
|
instance=self.config.platform_instance,
|
|
772
800
|
).as_urn()
|
|
773
801
|
|
|
802
|
+
def _create_upstream_class(
|
|
803
|
+
self,
|
|
804
|
+
dataset_urn: str,
|
|
805
|
+
lineage_type: Union[str, DatasetLineageTypeClass],
|
|
806
|
+
timestamp: Optional[int],
|
|
807
|
+
) -> UpstreamClass:
|
|
808
|
+
"""
|
|
809
|
+
Helper method to create UpstreamClass with optional audit stamp.
|
|
810
|
+
If timestamp is None, audit stamp is omitted.
|
|
811
|
+
"""
|
|
812
|
+
if timestamp is not None:
|
|
813
|
+
return UpstreamClass(
|
|
814
|
+
dataset=dataset_urn,
|
|
815
|
+
type=lineage_type,
|
|
816
|
+
auditStamp=AuditStampClass(
|
|
817
|
+
time=timestamp,
|
|
818
|
+
actor=UNKNOWN_USER,
|
|
819
|
+
),
|
|
820
|
+
)
|
|
821
|
+
else:
|
|
822
|
+
return UpstreamClass(
|
|
823
|
+
dataset=dataset_urn,
|
|
824
|
+
type=lineage_type,
|
|
825
|
+
)
|
|
826
|
+
|
|
774
827
|
def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
|
|
775
828
|
domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
|
|
776
829
|
schema_tags = []
|
|
@@ -961,16 +1014,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
961
1014
|
created: Optional[TimeStampClass] = None
|
|
962
1015
|
if table.created_at:
|
|
963
1016
|
custom_properties["created_at"] = str(table.created_at)
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
1017
|
+
created_ts = make_ts_millis(table.created_at)
|
|
1018
|
+
if created_ts is not None:
|
|
1019
|
+
created = TimeStampClass(
|
|
1020
|
+
created_ts,
|
|
1021
|
+
make_user_urn(table.created_by) if table.created_by else None,
|
|
1022
|
+
)
|
|
968
1023
|
last_modified = created
|
|
969
1024
|
if table.updated_at:
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
1025
|
+
updated_ts = make_ts_millis(table.updated_at)
|
|
1026
|
+
if updated_ts is not None:
|
|
1027
|
+
last_modified = TimeStampClass(
|
|
1028
|
+
updated_ts,
|
|
1029
|
+
table.updated_by and make_user_urn(table.updated_by),
|
|
1030
|
+
)
|
|
974
1031
|
|
|
975
1032
|
return DatasetPropertiesClass(
|
|
976
1033
|
name=table.name,
|
|
@@ -36,7 +36,7 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
|
|
|
36
36
|
|
|
37
37
|
tag_key: str
|
|
38
38
|
tag_value: Optional[str] = None
|
|
39
|
-
platform_instance: Optional[str]
|
|
39
|
+
platform_instance: Optional[str] = None
|
|
40
40
|
exists_in_unity_catalog: bool = False
|
|
41
41
|
persisted: bool = False
|
|
42
42
|
|
|
@@ -77,13 +77,13 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
|
|
|
77
77
|
)
|
|
78
78
|
if existing_platform_resource:
|
|
79
79
|
logger.info(
|
|
80
|
-
f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.
|
|
80
|
+
f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
|
|
81
81
|
)
|
|
82
82
|
return existing_platform_resource
|
|
83
83
|
|
|
84
84
|
return UnityCatalogTagPlatformResourceId(
|
|
85
|
-
tag_key=tag.key.
|
|
86
|
-
tag_value=tag.value.
|
|
85
|
+
tag_key=tag.key.raw_text,
|
|
86
|
+
tag_value=tag.value.raw_text if tag.value is not None else None,
|
|
87
87
|
platform_instance=platform_instance,
|
|
88
88
|
exists_in_unity_catalog=exists_in_unity_catalog,
|
|
89
89
|
persisted=False,
|
|
@@ -218,7 +218,7 @@ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
|
|
|
218
218
|
datahub_urns: LinkedResourceSet
|
|
219
219
|
managed_by_datahub: bool
|
|
220
220
|
id: UnityCatalogTagPlatformResourceId
|
|
221
|
-
allowed_values: Optional[List[str]]
|
|
221
|
+
allowed_values: Optional[List[str]] = None
|
|
222
222
|
|
|
223
223
|
def get_id(self) -> ExternalEntityId:
|
|
224
224
|
return self.id
|
|
@@ -60,7 +60,7 @@ AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]
|
|
|
60
60
|
|
|
61
61
|
class TrinoConnectorInfo(BaseModel):
|
|
62
62
|
partitionIds: List[str]
|
|
63
|
-
truncated: Optional[bool]
|
|
63
|
+
truncated: Optional[bool] = None
|
|
64
64
|
|
|
65
65
|
|
|
66
66
|
class TrinoAccessedMetadata(BaseModel):
|
|
@@ -80,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel):
|
|
|
80
80
|
table: Optional[str] = None
|
|
81
81
|
accessed_metadata: List[TrinoAccessedMetadata]
|
|
82
82
|
starttime: datetime = Field(alias="create_time")
|
|
83
|
-
endtime: Optional[datetime] = Field(alias="end_time")
|
|
83
|
+
endtime: Optional[datetime] = Field(None, alias="end_time")
|
|
84
84
|
|
|
85
85
|
|
|
86
86
|
class EnvBasedSourceBaseConfig:
|
|
@@ -281,11 +281,14 @@ class BaseTransformer(Transformer, metaclass=ABCMeta):
|
|
|
281
281
|
)
|
|
282
282
|
)
|
|
283
283
|
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
284
|
+
if mcp.entityUrn:
|
|
285
|
+
record_metadata = _update_work_unit_id(
|
|
286
|
+
envelope=envelope,
|
|
287
|
+
aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
|
|
288
|
+
urn=mcp.entityUrn,
|
|
289
|
+
)
|
|
290
|
+
else:
|
|
291
|
+
record_metadata = envelope.metadata.copy()
|
|
289
292
|
|
|
290
293
|
yield RecordEnvelope(
|
|
291
294
|
record=mcp,
|