acryl-datahub 1.2.0.3rc2__py3-none-any.whl → 1.2.0.4rc2__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
Files changed (51)
  1. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/METADATA +2561 -2561
  2. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/RECORD +51 -51
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +3 -3
  5. datahub/api/entities/external/external_tag.py +6 -4
  6. datahub/api/entities/external/lake_formation_external_entites.py +50 -49
  7. datahub/api/entities/external/restricted_text.py +105 -180
  8. datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
  9. datahub/api/entities/forms/forms.py +3 -3
  10. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  11. datahub/cli/quickstart_versioning.py +1 -1
  12. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
  13. datahub/ingestion/api/source.py +81 -7
  14. datahub/ingestion/autogenerated/capability_summary.json +47 -19
  15. datahub/ingestion/source/abs/config.py +1 -1
  16. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  17. datahub/ingestion/source/abs/source.py +9 -0
  18. datahub/ingestion/source/aws/glue.py +18 -2
  19. datahub/ingestion/source/aws/tag_entities.py +4 -4
  20. datahub/ingestion/source/data_lake_common/path_spec.py +1 -2
  21. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  22. datahub/ingestion/source/delta_lake/source.py +8 -1
  23. datahub/ingestion/source/dremio/dremio_source.py +19 -2
  24. datahub/ingestion/source/fivetran/fivetran.py +9 -3
  25. datahub/ingestion/source/ge_data_profiler.py +8 -0
  26. datahub/ingestion/source/hex/hex.py +1 -1
  27. datahub/ingestion/source/iceberg/iceberg.py +4 -4
  28. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  29. datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
  30. datahub/ingestion/source/powerbi/powerbi.py +4 -1
  31. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  32. datahub/ingestion/source/redshift/datashares.py +1 -1
  33. datahub/ingestion/source/redshift/redshift.py +1 -0
  34. datahub/ingestion/source/salesforce.py +8 -0
  35. datahub/ingestion/source/slack/slack.py +7 -14
  36. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
  37. datahub/ingestion/source/sql/hive_metastore.py +8 -0
  38. datahub/ingestion/source/sql/teradata.py +8 -1
  39. datahub/ingestion/source/sql/trino.py +9 -0
  40. datahub/ingestion/source/unity/config.py +36 -1
  41. datahub/ingestion/source/unity/proxy.py +332 -46
  42. datahub/ingestion/source/unity/proxy_types.py +12 -2
  43. datahub/ingestion/source/unity/source.py +91 -34
  44. datahub/ingestion/source/unity/tag_entities.py +5 -5
  45. datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
  46. datahub/sdk/entity_client.py +22 -7
  47. datahub/utilities/mapping.py +29 -2
  48. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/WHEEL +0 -0
  49. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/entry_points.txt +0 -0
  50. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/licenses/LICENSE +0 -0
  51. {acryl_datahub-1.2.0.3rc2.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/source.py

@@ -7,12 +7,14 @@ from urllib.parse import urljoin
 from datahub.api.entities.external.external_entities import PlatformResourceRepository
 from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import (
+    UNKNOWN_USER,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
     make_group_urn,
     make_schema_field_urn,
+    make_ts_millis,
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -111,6 +113,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     ViewProperties,
 )
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     BrowsePathsClass,
     DataPlatformInstanceClass,
     DatasetLineageTypeClass,
@@ -203,6 +206,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             config.warehouse_id,
             report=self.report,
             hive_metastore_proxy=self.hive_metastore_proxy,
+            lineage_data_source=config.lineage_data_source,
         )

         self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
@@ -410,12 +414,12 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.config.workspace_url, f"#notebook/{notebook.id}"
             ),
             created=(
-                TimeStampClass(int(notebook.created_at.timestamp() * 1000))
+                TimeStampClass(make_ts_millis(notebook.created_at))
                 if notebook.created_at
                 else None
             ),
             lastModified=(
-                TimeStampClass(int(notebook.modified_at.timestamp() * 1000))
+                TimeStampClass(make_ts_millis(notebook.modified_at))
                 if notebook.modified_at
                 else None
             ),
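These call sites swap the inline int(dt.timestamp() * 1000) conversion for make_ts_millis from datahub.emitter.mce_builder. Judging from how it is used here and in the later hunks (called on possibly-None datetimes, with the result sometimes None-checked), it behaves like this None-safe sketch; the exact library implementation may differ:

    from datetime import datetime
    from typing import Optional

    def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
        # Convert a datetime to epoch milliseconds, passing None through
        # instead of raising. This lets callers either guard the input
        # themselves or None-check the result.
        if ts is None:
            return None
        return int(ts.timestamp() * 1000)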
@@ -434,17 +438,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         if not notebook.upstreams:
             return None

+        upstreams = []
+        for upstream_ref in notebook.upstreams:
+            timestamp = make_ts_millis(upstream_ref.last_updated)
+            upstreams.append(
+                self._create_upstream_class(
+                    self.gen_dataset_urn(upstream_ref),
+                    DatasetLineageTypeClass.COPY,
+                    timestamp,
+                )
+            )
+
         return MetadataChangeProposalWrapper(
             entityUrn=self.gen_notebook_urn(notebook),
-            aspect=UpstreamLineageClass(
-                upstreams=[
-                    UpstreamClass(
-                        dataset=self.gen_dataset_urn(upstream_ref),
-                        type=DatasetLineageTypeClass.COPY,
-                    )
-                    for upstream_ref in notebook.upstreams
-                ]
-            ),
+            aspect=UpstreamLineageClass(upstreams=upstreams),
         ).as_workunit()

     def process_metastores(self) -> Iterable[MetadataWorkUnit]:
@@ -463,14 +470,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self, metastore: Optional[Metastore]
     ) -> Iterable[MetadataWorkUnit]:
         for catalog in self._get_catalogs(metastore):
-            if not self.config.catalog_pattern.allowed(catalog.id):
-                self.report.catalogs.dropped(catalog.id)
-                continue
+            with self.report.new_stage(f"Ingest catalog {catalog.id}"):
+                if not self.config.catalog_pattern.allowed(catalog.id):
+                    self.report.catalogs.dropped(catalog.id)
+                    continue

-            yield from self.gen_catalog_containers(catalog)
-            yield from self.process_schemas(catalog)
+                yield from self.gen_catalog_containers(catalog)
+                yield from self.process_schemas(catalog)

-            self.report.catalogs.processed(catalog.id)
+                self.report.catalogs.processed(catalog.id)

     def _get_catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]:
         if self.config.catalogs:
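The filtering and processing logic is unchanged; it is now just wrapped in self.report.new_stage(...) so each catalog gets its own named stage in the ingestion report. A minimal sketch of what a timing-stage context manager of this shape can look like (a hypothetical simplification, not DataHub's actual report code):

    import time
    from contextlib import contextmanager

    @contextmanager
    def new_stage(stages: dict, name: str):
        # Record wall-clock duration of a named stage. Timing is captured
        # even if the body raises or the caller continues out of a loop.
        start = time.perf_counter()
        try:
            yield
        finally:
            stages[name] = time.perf_counter() - start

    stages: dict = {}
    with new_stage(stages, "Ingest catalog main"):
        time.sleep(0.01)  # stand-in for catalog processing
    print(stages)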
@@ -647,9 +655,21 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]

     def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]:
+        # Calculate datetime filters for lineage
+        lineage_start_time = None
+        lineage_end_time = self.config.end_time
+
+        if self.config.ignore_start_time_lineage:
+            lineage_start_time = None  # Ignore start time to get all lineage
+        else:
+            lineage_start_time = self.config.start_time
+
         if self.config.include_table_lineage:
             self.unity_catalog_api_proxy.table_lineage(
-                table, include_entity_lineage=self.config.include_notebooks
+                table,
+                include_entity_lineage=self.config.include_notebooks,
+                start_time=lineage_start_time,
+                end_time=lineage_end_time,
             )

         if self.config.include_column_lineage and table.upstreams:
@@ -661,7 +681,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 for column in table.columns[: self.config.column_lineage_column_limit]
             ]
             self.unity_catalog_api_proxy.get_column_lineage(
-                table, column_names, max_workers=self.config.lineage_max_workers
+                table,
+                column_names,
+                max_workers=self.config.lineage_max_workers,
+                start_time=lineage_start_time,
+                end_time=lineage_end_time,
             )

         return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table)
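Both the table-level and column-level lineage calls now receive the same window: end_time comes straight from the config, and start_time is dropped entirely when ignore_start_time_lineage is set. The selection logic, restated as a standalone helper for clarity (hypothetical function name; the parameters mirror the config fields above):

    from datetime import datetime
    from typing import Optional, Tuple

    def lineage_window(
        start_time: Optional[datetime],
        end_time: Optional[datetime],
        ignore_start_time_lineage: bool,
    ) -> Tuple[Optional[datetime], Optional[datetime]]:
        # A None start time means "no lower bound": fetch all historical
        # lineage rather than only events after config.start_time.
        return (None if ignore_start_time_lineage else start_time), end_time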
@@ -690,18 +714,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 for d_col, u_cols in sorted(downstream_to_upstream_cols.items())
             )

+            timestamp = make_ts_millis(upstream_ref.last_updated)
             upstreams.append(
-                UpstreamClass(
-                    dataset=upstream_urn,
-                    type=DatasetLineageTypeClass.TRANSFORMED,
+                self._create_upstream_class(
+                    upstream_urn,
+                    DatasetLineageTypeClass.TRANSFORMED,
+                    timestamp,
                 )
             )

-        for notebook in table.upstream_notebooks:
+        for notebook in table.upstream_notebooks.values():
+            timestamp = make_ts_millis(notebook.last_updated)
             upstreams.append(
-                UpstreamClass(
-                    dataset=self.gen_notebook_urn(notebook),
-                    type=DatasetLineageTypeClass.TRANSFORMED,
+                self._create_upstream_class(
+                    self.gen_notebook_urn(notebook.id),
+                    DatasetLineageTypeClass.TRANSFORMED,
+                    timestamp,
                 )
             )

@@ -771,6 +799,31 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             instance=self.config.platform_instance,
         ).as_urn()

+    def _create_upstream_class(
+        self,
+        dataset_urn: str,
+        lineage_type: Union[str, DatasetLineageTypeClass],
+        timestamp: Optional[int],
+    ) -> UpstreamClass:
+        """
+        Helper method to create UpstreamClass with optional audit stamp.
+        If timestamp is None, audit stamp is omitted.
+        """
+        if timestamp is not None:
+            return UpstreamClass(
+                dataset=dataset_urn,
+                type=lineage_type,
+                auditStamp=AuditStampClass(
+                    time=timestamp,
+                    actor=UNKNOWN_USER,
+                ),
+            )
+        else:
+            return UpstreamClass(
+                dataset=dataset_urn,
+                type=lineage_type,
+            )
+
     def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
         schema_tags = []
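The helper centralizes what the earlier hunks construct piecemeal: callers hand it a dataset URN, a lineage type, and a make_ts_millis result, and the audit stamp is attached only when a timestamp exists. For illustration, with hypothetical values:

    # Timestamp present: the UpstreamClass carries an AuditStampClass
    # with UNKNOWN_USER as the actor.
    upstream = source._create_upstream_class(
        "urn:li:dataset:(urn:li:dataPlatform:databricks,main.sales.orders,PROD)",
        DatasetLineageTypeClass.TRANSFORMED,
        1700000000000,  # epoch millis
    )

    # Timestamp absent (e.g. last_updated was None): no audit stamp at all.
    upstream_no_stamp = source._create_upstream_class(
        "urn:li:dataset:(urn:li:dataPlatform:databricks,main.sales.orders,PROD)",
        DatasetLineageTypeClass.TRANSFORMED,
        None,
    )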
@@ -961,16 +1014,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         created: Optional[TimeStampClass] = None
         if table.created_at:
             custom_properties["created_at"] = str(table.created_at)
-            created = TimeStampClass(
-                int(table.created_at.timestamp() * 1000),
-                make_user_urn(table.created_by) if table.created_by else None,
-            )
+            created_ts = make_ts_millis(table.created_at)
+            if created_ts is not None:
+                created = TimeStampClass(
+                    created_ts,
+                    make_user_urn(table.created_by) if table.created_by else None,
+                )
         last_modified = created
         if table.updated_at:
-            last_modified = TimeStampClass(
-                int(table.updated_at.timestamp() * 1000),
-                table.updated_by and make_user_urn(table.updated_by),
-            )
+            updated_ts = make_ts_millis(table.updated_at)
+            if updated_ts is not None:
+                last_modified = TimeStampClass(
+                    updated_ts,
+                    table.updated_by and make_user_urn(table.updated_by),
+                )

         return DatasetPropertiesClass(
             name=table.name,
datahub/ingestion/source/unity/tag_entities.py

@@ -36,7 +36,7 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):

     tag_key: str
     tag_value: Optional[str] = None
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     exists_in_unity_catalog: bool = False
     persisted: bool = False

@@ -77,13 +77,13 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
         )
         if existing_platform_resource:
             logger.info(
-                f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.original}: {existing_platform_resource}"
+                f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
             )
             return existing_platform_resource

         return UnityCatalogTagPlatformResourceId(
-            tag_key=tag.key.original,
-            tag_value=tag.value.original if tag.value is not None else None,
+            tag_key=tag.key.raw_text,
+            tag_value=tag.value.raw_text if tag.value is not None else None,
             platform_instance=platform_instance,
             exists_in_unity_catalog=exists_in_unity_catalog,
             persisted=False,
@@ -218,7 +218,7 @@ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
     datahub_urns: LinkedResourceSet
     managed_by_datahub: bool
     id: UnityCatalogTagPlatformResourceId
-    allowed_values: Optional[List[str]]
+    allowed_values: Optional[List[str]] = None

     def get_id(self) -> ExternalEntityId:
         return self.id
datahub/ingestion/source/usage/starburst_trino_usage.py

@@ -60,7 +60,7 @@ AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]

 class TrinoConnectorInfo(BaseModel):
     partitionIds: List[str]
-    truncated: Optional[bool]
+    truncated: Optional[bool] = None


 class TrinoAccessedMetadata(BaseModel):
@@ -80,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel):
     table: Optional[str] = None
     accessed_metadata: List[TrinoAccessedMetadata]
     starttime: datetime = Field(alias="create_time")
-    endtime: Optional[datetime] = Field(alias="end_time")
+    endtime: Optional[datetime] = Field(None, alias="end_time")


 class EnvBasedSourceBaseConfig:
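The recurring change in tag_entities.py and here is adding an explicit = None default (or a None first argument to Field) on Optional fields. Pydantic v2 no longer treats Optional[X] as implicitly defaulting to None, so without the default these fields become required and model construction fails. A minimal illustration with a hypothetical model:

    from typing import Optional
    from pydantic import BaseModel, ValidationError

    class WithoutDefault(BaseModel):
        value: Optional[str]  # required under Pydantic v2

    class WithDefault(BaseModel):
        value: Optional[str] = None  # truly optional in both v1 and v2

    try:
        WithoutDefault()
    except ValidationError as e:
        print(e)  # v2 reports: Field required for 'value'

    print(WithDefault())  # value=None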
datahub/sdk/entity_client.py

@@ -1,11 +1,12 @@
 from __future__ import annotations

 import warnings
-from typing import TYPE_CHECKING, Union, overload
+from typing import TYPE_CHECKING, Optional, Union, overload

 import datahub.metadata.schema_classes as models
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.emitter.rest_emitter import EmitMode
 from datahub.errors import IngestionAttributionWarning, ItemNotFoundError, SdkUsageError
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.metadata.urns import (
@@ -133,7 +134,7 @@ class EntityClient:

         return entity

-    def create(self, entity: Entity) -> None:
+    def create(self, entity: Entity, *, emit_mode: Optional[EmitMode] = None) -> None:
         mcps = []

         if self._graph.exists(str(entity.urn)):
@@ -152,9 +153,12 @@ class EntityClient:
             )
         mcps.extend(entity.as_mcps(models.ChangeTypeClass.CREATE))

-        self._graph.emit_mcps(mcps)
+        if emit_mode:
+            self._graph.emit_mcps(mcps, emit_mode=emit_mode)
+        else:
+            self._graph.emit_mcps(mcps)

-    def upsert(self, entity: Entity) -> None:
+    def upsert(self, entity: Entity, *, emit_mode: Optional[EmitMode] = None) -> None:
         if entity._prev_aspects is None and self._graph.exists(str(entity.urn)):
             warnings.warn(
                 f"The entity {entity.urn} already exists. This operation will partially overwrite the existing entity.",
@@ -164,9 +168,17 @@ class EntityClient:
         # TODO: If there are no previous aspects but the entity exists, should we delete aspects that are not present here?

         mcps = entity.as_mcps(models.ChangeTypeClass.UPSERT)
-        self._graph.emit_mcps(mcps)
+        if emit_mode:
+            self._graph.emit_mcps(mcps, emit_mode=emit_mode)
+        else:
+            self._graph.emit_mcps(mcps)

-    def update(self, entity: Union[Entity, MetadataPatchProposal]) -> None:
+    def update(
+        self,
+        entity: Union[Entity, MetadataPatchProposal],
+        *,
+        emit_mode: Optional[EmitMode] = None,
+    ) -> None:
         if isinstance(entity, MetadataPatchProposal):
             return self._update_patch(entity)

@@ -179,7 +191,10 @@ class EntityClient:
         # -> probably add a "mode" parameter that can be "update" (e.g. if not modified) or "update_force"

         mcps = entity.as_mcps(models.ChangeTypeClass.UPSERT)
-        self._graph.emit_mcps(mcps)
+        if emit_mode:
+            self._graph.emit_mcps(mcps, emit_mode=emit_mode)
+        else:
+            self._graph.emit_mcps(mcps)

     def _update_patch(
         self, updater: MetadataPatchProposal, check_exists: bool = True
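create, upsert, and update all repeat the same four-line forwarding block: emit_mode is passed along only when the caller supplies one, leaving the graph client's default in effect otherwise. The repetition could be collapsed into a private helper along these lines (a hypothetical _emit method on EntityClient, not part of the diff):

    from typing import List, Optional

    def _emit(
        self,
        mcps: List[MetadataChangeProposalWrapper],
        emit_mode: Optional[EmitMode] = None,
    ) -> None:
        # Mirrors the if/else blocks above: forward emit_mode only when
        # explicitly provided, so the graph client's default applies.
        if emit_mode:
            self._graph.emit_mcps(mcps, emit_mode=emit_mode)
        else:
            self._graph.emit_mcps(mcps)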
datahub/utilities/mapping.py

@@ -83,7 +83,7 @@ class Constants:
     MATCH = "match"
     USER_OWNER = "user"
     GROUP_OWNER = "group"
-    OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float]
+    OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float, list]
     TAG_PARTITION_KEY = "PARTITION_KEY"
     TAG_DIST_KEY = "DIST_KEY"
     TAG_SORT_KEY = "SORT_KEY"
@@ -455,7 +455,34 @@ class OperationProcessor:
         # function to check if a match clause is satisfied to a value.
         if not any(
             isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED
-        ) or not isinstance(raw_props_value, type(match_clause)):
+        ):
+            return None
+
+        # Handle list values by checking if any item in the list matches
+        if isinstance(raw_props_value, list):
+            # For lists, we need to find at least one matching item
+            # Return a match with the concatenated values of all matching items
+            matching_items = []
+            for item in raw_props_value:
+                if isinstance(item, str):
+                    match = re.match(match_clause, item)
+                    if match:
+                        matching_items.append(item)
+                elif isinstance(match_clause, type(item)):
+                    match = re.match(str(match_clause), str(item))
+                    if match:
+                        matching_items.append(str(item))
+
+            if matching_items:
+                # Create a synthetic match object with all matching items joined
+                combined_value = ",".join(matching_items)
+                return re.match(
+                    ".*", combined_value
+                )  # Always matches, returns combined value
+            return None
+
+        # Handle scalar values (existing logic)
+        elif not isinstance(raw_props_value, type(match_clause)):
             return None
         elif isinstance(raw_props_value, str):
             return re.match(match_clause, raw_props_value)
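With list added to OPERAND_DATATYPE_SUPPORTED, meta-mapping match clauses now apply to list-valued properties: every item that individually matches is kept, the matches are joined with commas, and the result is wrapped in a trivially matching re.Match so downstream code can still read the combined value via group(0). A self-contained demonstration of that semantics (hypothetical values):

    import re
    from typing import List, Optional

    def match_list(match_clause: str, values: List[str]) -> Optional[re.Match]:
        # Same idea as the list branch above: keep items matching the
        # clause, join them, and return a match whose group(0) is the join.
        matching = [v for v in values if re.match(match_clause, v)]
        if not matching:
            return None
        return re.match(".*", ",".join(matching))

    m = match_list(r"pii.*", ["pii_email", "public", "pii_phone"])
    assert m is not None and m.group(0) == "pii_email,pii_phone"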