acryl-datahub 1.2.0.3rc2__py3-none-any.whl → 1.2.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (68)
  1. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/METADATA +2665 -2664
  2. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/RECORD +68 -67
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +3 -3
  5. datahub/api/entities/external/external_tag.py +6 -4
  6. datahub/api/entities/external/lake_formation_external_entites.py +50 -49
  7. datahub/api/entities/external/restricted_text.py +105 -180
  8. datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
  9. datahub/api/entities/forms/forms.py +3 -3
  10. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  11. datahub/cli/quickstart_versioning.py +1 -1
  12. datahub/cli/specific/assertions_cli.py +37 -2
  13. datahub/cli/specific/datacontract_cli.py +54 -4
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
  15. datahub/ingestion/api/report.py +21 -2
  16. datahub/ingestion/api/source.py +81 -7
  17. datahub/ingestion/autogenerated/capability_summary.json +47 -19
  18. datahub/ingestion/source/abs/config.py +1 -1
  19. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  20. datahub/ingestion/source/abs/source.py +9 -0
  21. datahub/ingestion/source/aws/glue.py +18 -2
  22. datahub/ingestion/source/aws/tag_entities.py +4 -4
  23. datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
  24. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  25. datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
  26. datahub/ingestion/source/delta_lake/source.py +8 -1
  27. datahub/ingestion/source/dremio/dremio_source.py +19 -2
  28. datahub/ingestion/source/fivetran/fivetran.py +9 -3
  29. datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
  30. datahub/ingestion/source/ge_data_profiler.py +8 -0
  31. datahub/ingestion/source/grafana/models.py +6 -0
  32. datahub/ingestion/source/hex/hex.py +1 -1
  33. datahub/ingestion/source/iceberg/iceberg.py +4 -4
  34. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  35. datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
  36. datahub/ingestion/source/powerbi/powerbi.py +4 -1
  37. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  38. datahub/ingestion/source/redshift/datashares.py +1 -1
  39. datahub/ingestion/source/redshift/redshift.py +1 -0
  40. datahub/ingestion/source/salesforce.py +8 -0
  41. datahub/ingestion/source/slack/slack.py +7 -14
  42. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
  43. datahub/ingestion/source/sql/hive_metastore.py +8 -0
  44. datahub/ingestion/source/sql/teradata.py +8 -1
  45. datahub/ingestion/source/sql/trino.py +9 -0
  46. datahub/ingestion/source/tableau/tableau.py +1 -1
  47. datahub/ingestion/source/unity/config.py +36 -1
  48. datahub/ingestion/source/unity/proxy.py +332 -46
  49. datahub/ingestion/source/unity/proxy_types.py +12 -2
  50. datahub/ingestion/source/unity/source.py +91 -34
  51. datahub/ingestion/source/unity/tag_entities.py +5 -5
  52. datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
  53. datahub/ingestion/transformer/base_transformer.py +8 -5
  54. datahub/metadata/_internal_schema_classes.py +513 -513
  55. datahub/metadata/_urns/urn_defs.py +1684 -1684
  56. datahub/metadata/schema.avsc +16745 -16348
  57. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  58. datahub/sdk/entity_client.py +22 -7
  59. datahub/sdk/search_client.py +3 -0
  60. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  61. datahub/specific/datajob.py +15 -1
  62. datahub/specific/dataset.py +37 -59
  63. datahub/utilities/mapping.py +29 -2
  64. datahub/utilities/server_config_util.py +2 -1
  65. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/WHEEL +0 -0
  66. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/entry_points.txt +0 -0
  67. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/licenses/LICENSE +0 -0
  68. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4.dist-info}/top_level.txt +0 -0
@@ -7,12 +7,14 @@ from urllib.parse import urljoin
7
7
  from datahub.api.entities.external.external_entities import PlatformResourceRepository
8
8
  from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
9
9
  from datahub.emitter.mce_builder import (
10
+ UNKNOWN_USER,
10
11
  make_data_platform_urn,
11
12
  make_dataplatform_instance_urn,
12
13
  make_dataset_urn_with_platform_instance,
13
14
  make_domain_urn,
14
15
  make_group_urn,
15
16
  make_schema_field_urn,
17
+ make_ts_millis,
16
18
  make_user_urn,
17
19
  )
18
20
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -111,6 +113,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
111
113
  ViewProperties,
112
114
  )
113
115
  from datahub.metadata.schema_classes import (
116
+ AuditStampClass,
114
117
  BrowsePathsClass,
115
118
  DataPlatformInstanceClass,
116
119
  DatasetLineageTypeClass,
@@ -203,6 +206,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
203
206
  config.warehouse_id,
204
207
  report=self.report,
205
208
  hive_metastore_proxy=self.hive_metastore_proxy,
209
+ lineage_data_source=config.lineage_data_source,
206
210
  )
207
211
 
208
212
  self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
@@ -410,12 +414,12 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
410
414
  self.config.workspace_url, f"#notebook/{notebook.id}"
411
415
  ),
412
416
  created=(
413
- TimeStampClass(int(notebook.created_at.timestamp() * 1000))
417
+ TimeStampClass(make_ts_millis(notebook.created_at))
414
418
  if notebook.created_at
415
419
  else None
416
420
  ),
417
421
  lastModified=(
418
- TimeStampClass(int(notebook.modified_at.timestamp() * 1000))
422
+ TimeStampClass(make_ts_millis(notebook.modified_at))
419
423
  if notebook.modified_at
420
424
  else None
421
425
  ),
@@ -434,17 +438,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
434
438
  if not notebook.upstreams:
435
439
  return None
436
440
 
441
+ upstreams = []
442
+ for upstream_ref in notebook.upstreams:
443
+ timestamp = make_ts_millis(upstream_ref.last_updated)
444
+ upstreams.append(
445
+ self._create_upstream_class(
446
+ self.gen_dataset_urn(upstream_ref),
447
+ DatasetLineageTypeClass.COPY,
448
+ timestamp,
449
+ )
450
+ )
451
+
437
452
  return MetadataChangeProposalWrapper(
438
453
  entityUrn=self.gen_notebook_urn(notebook),
439
- aspect=UpstreamLineageClass(
440
- upstreams=[
441
- UpstreamClass(
442
- dataset=self.gen_dataset_urn(upstream_ref),
443
- type=DatasetLineageTypeClass.COPY,
444
- )
445
- for upstream_ref in notebook.upstreams
446
- ]
447
- ),
454
+ aspect=UpstreamLineageClass(upstreams=upstreams),
448
455
  ).as_workunit()
449
456
 
450
457
  def process_metastores(self) -> Iterable[MetadataWorkUnit]:
@@ -463,14 +470,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
463
470
  self, metastore: Optional[Metastore]
464
471
  ) -> Iterable[MetadataWorkUnit]:
465
472
  for catalog in self._get_catalogs(metastore):
466
- if not self.config.catalog_pattern.allowed(catalog.id):
467
- self.report.catalogs.dropped(catalog.id)
468
- continue
473
+ with self.report.new_stage(f"Ingest catalog {catalog.id}"):
474
+ if not self.config.catalog_pattern.allowed(catalog.id):
475
+ self.report.catalogs.dropped(catalog.id)
476
+ continue
469
477
 
470
- yield from self.gen_catalog_containers(catalog)
471
- yield from self.process_schemas(catalog)
478
+ yield from self.gen_catalog_containers(catalog)
479
+ yield from self.process_schemas(catalog)
472
480
 
473
- self.report.catalogs.processed(catalog.id)
481
+ self.report.catalogs.processed(catalog.id)
474
482
 
475
483
  def _get_catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]:
476
484
  if self.config.catalogs:
@@ -647,9 +655,21 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
647
655
  ]
648
656
 
649
657
  def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]:
658
+ # Calculate datetime filters for lineage
659
+ lineage_start_time = None
660
+ lineage_end_time = self.config.end_time
661
+
662
+ if self.config.ignore_start_time_lineage:
663
+ lineage_start_time = None # Ignore start time to get all lineage
664
+ else:
665
+ lineage_start_time = self.config.start_time
666
+
650
667
  if self.config.include_table_lineage:
651
668
  self.unity_catalog_api_proxy.table_lineage(
652
- table, include_entity_lineage=self.config.include_notebooks
669
+ table,
670
+ include_entity_lineage=self.config.include_notebooks,
671
+ start_time=lineage_start_time,
672
+ end_time=lineage_end_time,
653
673
  )
654
674
 
655
675
  if self.config.include_column_lineage and table.upstreams:
@@ -661,7 +681,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
661
681
  for column in table.columns[: self.config.column_lineage_column_limit]
662
682
  ]
663
683
  self.unity_catalog_api_proxy.get_column_lineage(
664
- table, column_names, max_workers=self.config.lineage_max_workers
684
+ table,
685
+ column_names,
686
+ max_workers=self.config.lineage_max_workers,
687
+ start_time=lineage_start_time,
688
+ end_time=lineage_end_time,
665
689
  )
666
690
 
667
691
  return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table)
@@ -690,18 +714,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
690
714
  for d_col, u_cols in sorted(downstream_to_upstream_cols.items())
691
715
  )
692
716
 
717
+ timestamp = make_ts_millis(upstream_ref.last_updated)
693
718
  upstreams.append(
694
- UpstreamClass(
695
- dataset=upstream_urn,
696
- type=DatasetLineageTypeClass.TRANSFORMED,
719
+ self._create_upstream_class(
720
+ upstream_urn,
721
+ DatasetLineageTypeClass.TRANSFORMED,
722
+ timestamp,
697
723
  )
698
724
  )
699
725
 
700
- for notebook in table.upstream_notebooks:
726
+ for notebook in table.upstream_notebooks.values():
727
+ timestamp = make_ts_millis(notebook.last_updated)
701
728
  upstreams.append(
702
- UpstreamClass(
703
- dataset=self.gen_notebook_urn(notebook),
704
- type=DatasetLineageTypeClass.TRANSFORMED,
729
+ self._create_upstream_class(
730
+ self.gen_notebook_urn(notebook.id),
731
+ DatasetLineageTypeClass.TRANSFORMED,
732
+ timestamp,
705
733
  )
706
734
  )
707
735
 
@@ -771,6 +799,31 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
771
799
  instance=self.config.platform_instance,
772
800
  ).as_urn()
773
801
 
802
+ def _create_upstream_class(
803
+ self,
804
+ dataset_urn: str,
805
+ lineage_type: Union[str, DatasetLineageTypeClass],
806
+ timestamp: Optional[int],
807
+ ) -> UpstreamClass:
808
+ """
809
+ Helper method to create UpstreamClass with optional audit stamp.
810
+ If timestamp is None, audit stamp is omitted.
811
+ """
812
+ if timestamp is not None:
813
+ return UpstreamClass(
814
+ dataset=dataset_urn,
815
+ type=lineage_type,
816
+ auditStamp=AuditStampClass(
817
+ time=timestamp,
818
+ actor=UNKNOWN_USER,
819
+ ),
820
+ )
821
+ else:
822
+ return UpstreamClass(
823
+ dataset=dataset_urn,
824
+ type=lineage_type,
825
+ )
826
+
774
827
  def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
775
828
  domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
776
829
  schema_tags = []
@@ -961,16 +1014,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
961
1014
  created: Optional[TimeStampClass] = None
962
1015
  if table.created_at:
963
1016
  custom_properties["created_at"] = str(table.created_at)
964
- created = TimeStampClass(
965
- int(table.created_at.timestamp() * 1000),
966
- make_user_urn(table.created_by) if table.created_by else None,
967
- )
1017
+ created_ts = make_ts_millis(table.created_at)
1018
+ if created_ts is not None:
1019
+ created = TimeStampClass(
1020
+ created_ts,
1021
+ make_user_urn(table.created_by) if table.created_by else None,
1022
+ )
968
1023
  last_modified = created
969
1024
  if table.updated_at:
970
- last_modified = TimeStampClass(
971
- int(table.updated_at.timestamp() * 1000),
972
- table.updated_by and make_user_urn(table.updated_by),
973
- )
1025
+ updated_ts = make_ts_millis(table.updated_at)
1026
+ if updated_ts is not None:
1027
+ last_modified = TimeStampClass(
1028
+ updated_ts,
1029
+ table.updated_by and make_user_urn(table.updated_by),
1030
+ )
974
1031
 
975
1032
  return DatasetPropertiesClass(
976
1033
  name=table.name,
@@ -36,7 +36,7 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
36
36
 
37
37
  tag_key: str
38
38
  tag_value: Optional[str] = None
39
- platform_instance: Optional[str]
39
+ platform_instance: Optional[str] = None
40
40
  exists_in_unity_catalog: bool = False
41
41
  persisted: bool = False
42
42
 
@@ -77,13 +77,13 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
77
77
  )
78
78
  if existing_platform_resource:
79
79
  logger.info(
80
- f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.original}: {existing_platform_resource}"
80
+ f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
81
81
  )
82
82
  return existing_platform_resource
83
83
 
84
84
  return UnityCatalogTagPlatformResourceId(
85
- tag_key=tag.key.original,
86
- tag_value=tag.value.original if tag.value is not None else None,
85
+ tag_key=tag.key.raw_text,
86
+ tag_value=tag.value.raw_text if tag.value is not None else None,
87
87
  platform_instance=platform_instance,
88
88
  exists_in_unity_catalog=exists_in_unity_catalog,
89
89
  persisted=False,
@@ -218,7 +218,7 @@ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
218
218
  datahub_urns: LinkedResourceSet
219
219
  managed_by_datahub: bool
220
220
  id: UnityCatalogTagPlatformResourceId
221
- allowed_values: Optional[List[str]]
221
+ allowed_values: Optional[List[str]] = None
222
222
 
223
223
  def get_id(self) -> ExternalEntityId:
224
224
  return self.id
@@ -60,7 +60,7 @@ AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]
60
60
 
61
61
  class TrinoConnectorInfo(BaseModel):
62
62
  partitionIds: List[str]
63
- truncated: Optional[bool]
63
+ truncated: Optional[bool] = None
64
64
 
65
65
 
66
66
  class TrinoAccessedMetadata(BaseModel):
@@ -80,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel):
80
80
  table: Optional[str] = None
81
81
  accessed_metadata: List[TrinoAccessedMetadata]
82
82
  starttime: datetime = Field(alias="create_time")
83
- endtime: Optional[datetime] = Field(alias="end_time")
83
+ endtime: Optional[datetime] = Field(None, alias="end_time")
84
84
 
85
85
 
86
86
  class EnvBasedSourceBaseConfig:
@@ -281,11 +281,14 @@ class BaseTransformer(Transformer, metaclass=ABCMeta):
281
281
  )
282
282
  )
283
283
 
284
- record_metadata = _update_work_unit_id(
285
- envelope=envelope,
286
- aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
287
- urn=mcp.entityUrn,
288
- )
284
+ if mcp.entityUrn:
285
+ record_metadata = _update_work_unit_id(
286
+ envelope=envelope,
287
+ aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
288
+ urn=mcp.entityUrn,
289
+ )
290
+ else:
291
+ record_metadata = envelope.metadata.copy()
289
292
 
290
293
  yield RecordEnvelope(
291
294
  record=mcp,