acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (76)
  1. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
  2. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/ingestion/api/common.py +9 -0
  14. datahub/ingestion/api/source.py +4 -1
  15. datahub/ingestion/api/source_helpers.py +26 -1
  16. datahub/ingestion/graph/client.py +104 -0
  17. datahub/ingestion/run/pipeline.py +0 -6
  18. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  19. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  20. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  21. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  22. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  23. datahub/ingestion/source/hex/constants.py +5 -0
  24. datahub/ingestion/source/hex/hex.py +150 -22
  25. datahub/ingestion/source/hex/mapper.py +28 -2
  26. datahub/ingestion/source/hex/model.py +10 -2
  27. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  28. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  29. datahub/ingestion/source/kafka/kafka.py +1 -4
  30. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  31. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  32. datahub/ingestion/source/looker/looker_source.py +2 -3
  33. datahub/ingestion/source/mlflow.py +6 -7
  34. datahub/ingestion/source/mode.py +2 -2
  35. datahub/ingestion/source/nifi.py +3 -3
  36. datahub/ingestion/source/openapi.py +3 -3
  37. datahub/ingestion/source/openapi_parser.py +8 -8
  38. datahub/ingestion/source/powerbi/config.py +1 -1
  39. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  40. datahub/ingestion/source/redshift/profile.py +2 -2
  41. datahub/ingestion/source/sigma/sigma.py +6 -2
  42. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  43. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  44. datahub/ingestion/source/sql/trino.py +4 -3
  45. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  46. datahub/ingestion/source/superset.py +108 -81
  47. datahub/ingestion/source/tableau/tableau.py +4 -4
  48. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  49. datahub/ingestion/source/unity/source.py +1 -1
  50. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  51. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  52. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  53. datahub/ingestion/transformer/dataset_domain.py +1 -1
  54. datahub/lite/lite_util.py +2 -2
  55. datahub/metadata/_schema_classes.py +47 -2
  56. datahub/metadata/_urns/urn_defs.py +56 -0
  57. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  58. datahub/metadata/schema.avsc +121 -85
  59. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  60. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  61. datahub/metadata/schemas/FormInfo.avsc +5 -0
  62. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  63. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  64. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  65. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  66. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  67. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  68. datahub/testing/mcp_diff.py +1 -1
  69. datahub/utilities/file_backed_collections.py +6 -6
  70. datahub/utilities/hive_schema_to_avro.py +2 -2
  71. datahub/utilities/ingest_utils.py +2 -2
  72. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  73. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  74. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  75. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  76. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -43,7 +43,7 @@ class AllowedValue(ConfigModel):
 
 
 VALID_ENTITY_TYPE_URNS = [
-    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys()
+    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES
 ]
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
 
datahub/cli/ingest_cli.py CHANGED
@@ -216,9 +216,9 @@ def run(
 @click.option(
     "--executor-id",
     type=str,
-    default="default",
     help="Executor id to route execution requests to. Do not use this unless you have configured a custom executor.",
     required=False,
+    default=None,
 )
 @click.option(
     "--cli-version",
@@ -239,7 +239,7 @@ def run(
     type=str,
     help="Timezone for the schedule in 'America/New_York' format. Uses UTC by default.",
     required=False,
-    default="UTC",
+    default=None,
 )
 @click.option(
     "--debug", type=bool, help="Should we debug.", required=False, default=False
@@ -255,10 +255,10 @@ def deploy(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> None:
datahub/cli/migrate.py CHANGED
@@ -76,13 +76,13 @@ class MigrationReport:
     def __repr__(self) -> str:
         repr = f"{self._get_prefix()}Migration Report:\n--------------\n"
         repr += f"{self._get_prefix()}Migration Run Id: {self.run_id}\n"
-        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created.keys()]))}\n"
-        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected.keys()]))}\n"
-        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated.keys()]))}\n"
+        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created]))}\n"
+        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected]))}\n"
+        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated]))}\n"
         repr += f"{self._get_prefix()}Details:\n"
-        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created.keys()]) or 'None'}\n"
-        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected.keys()]) or 'None'}\n"
-        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated.keys()]) or 'None'}\n"
+        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created]) or 'None'}\n"
+        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected]) or 'None'}\n"
+        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated]) or 'None'}\n"
         return repr
 
 
datahub/configuration/common.py CHANGED
@@ -317,7 +317,7 @@ class KeyValuePattern(ConfigModel):
         return KeyValuePattern()
 
     def value(self, string: str) -> List[str]:
-        matching_keys = [key for key in self.rules.keys() if re.match(key, string)]
+        matching_keys = [key for key in self.rules if re.match(key, string)]
         if not matching_keys:
             return []
         elif self.first_match_only:
datahub/emitter/mcp_builder.py CHANGED
@@ -137,6 +137,10 @@ class ProjectIdKey(ContainerKey):
     project_id: str
 
 
+class ExperimentKey(ContainerKey):
+    id: str
+
+
 class MetastoreKey(ContainerKey):
     metastore: str
 
datahub/ingestion/api/common.py CHANGED
@@ -12,6 +12,9 @@ if TYPE_CHECKING:
 
 T = TypeVar("T")
 
+if TYPE_CHECKING:
+    from datahub.ingestion.run.pipeline_config import FlagsConfig
+
 
 @dataclass
 class RecordEnvelope(Generic[T]):
@@ -60,6 +63,12 @@ class PipelineContext:
 
         self._set_dataset_urn_to_lower_if_needed()
 
+    @property
+    def flags(self) -> "FlagsConfig":
+        from datahub.ingestion.run.pipeline_config import FlagsConfig
+
+        return self.pipeline_config.flags if self.pipeline_config else FlagsConfig()
+
     def _set_dataset_urn_to_lower_if_needed(self) -> None:
         # TODO: Get rid of this function once lower-casing is the standard.
         if self.graph:
datahub/ingestion/api/source.py CHANGED
@@ -39,6 +39,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
     auto_browse_path_v2,
     auto_fix_duplicate_schema_field_paths,
     auto_fix_empty_field_paths,
@@ -475,8 +476,10 @@ class Source(Closeable, metaclass=ABCMeta):
         return stream
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = self.get_workunit_processors()
+        workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
         return self._apply_workunit_processors(
-            self.get_workunit_processors(), auto_workunit(self.get_workunits_internal())
+            workunit_processors, auto_workunit(self.get_workunits_internal())
         )
 
     def get_workunits_internal(
datahub/ingestion/api/source_helpers.py CHANGED
@@ -13,9 +13,14 @@ from typing import (
 )
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis
+from datahub.emitter.mce_builder import (
+    get_sys_time,
+    make_dataplatform_instance_urn,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
@@ -544,3 +549,23 @@ def _prepend_platform_instance(
         return [BrowsePathEntryClass(id=urn, urn=urn)] + entries
 
     return entries
+
+
+class AutoSystemMetadata:
+    def __init__(self, ctx: PipelineContext):
+        self.ctx = ctx
+
+    def stamp(self, stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
+        for wu in stream:
+            yield self.stamp_wu(wu)
+
+    def stamp_wu(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        if self.ctx.flags.set_system_metadata:
+            if not wu.metadata.systemMetadata:
+                wu.metadata.systemMetadata = SystemMetadataClass()
+            wu.metadata.systemMetadata.runId = self.ctx.run_id
+            if not wu.metadata.systemMetadata.lastObserved:
+                wu.metadata.systemMetadata.lastObserved = get_sys_time()
+            if self.ctx.flags.set_system_metadata_pipeline_name:
+                wu.metadata.systemMetadata.pipelineName = self.ctx.pipeline_name
+        return wu
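
The AutoSystemMetadata helper above is wired in through Source.get_workunits (see the datahub/ingestion/api/source.py hunk), where its stamp method is appended to the list of workunit processors. A workunit processor is simply a callable that takes a stream of workunits and yields a (possibly modified) stream. A minimal sketch of that chaining pattern, assuming a list of such callables; this is an illustration, not the library's internal _apply_workunit_processors:

    from typing import Callable, Iterable, List

    from datahub.ingestion.api.workunit import MetadataWorkUnit

    # Illustrative only: each processor lazily wraps the previous stream, so a
    # stamping step like AutoSystemMetadata(ctx).stamp can be chained with others.
    WorkUnitProcessor = Callable[[Iterable[MetadataWorkUnit]], Iterable[MetadataWorkUnit]]


    def apply_processors(
        processors: List[WorkUnitProcessor],
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        for processor in processors:
            stream = processor(stream)
        return stream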
datahub/ingestion/graph/client.py CHANGED
@@ -27,6 +27,7 @@ from pydantic import BaseModel
 from requests.models import HTTPError
 from typing_extensions import deprecated
 
+from datahub._codegen.aspect import _Aspect
 from datahub.cli import config_utils
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
 from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
@@ -1697,6 +1698,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
         return res["runAssertionsForAsset"]
 
+    @deprecated("Use get_entities instead which returns typed aspects")
     def get_entities_v2(
         self,
         entity_name: str,
@@ -1736,6 +1738,108 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 retval[entity_urn][aspect_key] = aspect_value
         return retval
 
+    def get_entities(
+        self,
+        entity_name: str,
+        urns: List[str],
+        aspects: Optional[List[str]] = None,
+        with_system_metadata: bool = False,
+    ) -> Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]]:
+        """
+        Get entities using the OpenAPI v3 endpoint, deserializing aspects into typed objects.
+
+        Args:
+            entity_name: The entity type name
+            urns: List of entity URNs to fetch
+            aspects: Optional list of aspect names to fetch. If None, all aspects will be fetched.
+            with_system_metadata: If True, return system metadata along with each aspect.
+
+        Returns:
+            A dictionary mapping URNs to a dictionary of aspect name to tuples of
+            (typed aspect object, system metadata). If with_system_metadata is False,
+            the system metadata in the tuple will be None.
+        """
+        aspects = aspects or []
+
+        request_payload = []
+        for urn in urns:
+            entity_request: Dict[str, Any] = {"urn": urn}
+            for aspect_name in aspects:
+                entity_request[aspect_name] = {}
+            request_payload.append(entity_request)
+
+        headers: Dict[str, Any] = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        if with_system_metadata:
+            url += "?systemMetadata=true"
+
+        response = self._session.post(
+            url, data=json.dumps(request_payload), headers=headers
+        )
+        response.raise_for_status()
+        entities = response.json()
+
+        result: Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]] = {}
+
+        for entity in entities:
+            entity_urn = entity.get("urn")
+            if entity_urn is None:
+                logger.warning(
+                    f"Missing URN in entity response: {entity}, skipping deserialization"
+                )
+                continue
+
+            entity_aspects: Dict[
+                str, Tuple[_Aspect, Optional[SystemMetadataClass]]
+            ] = {}
+
+            for aspect_name, aspect_obj in entity.items():
+                if aspect_name == "urn":
+                    continue
+
+                aspect_class = ASPECT_NAME_MAP.get(aspect_name)
+                if aspect_class is None:
+                    logger.warning(
+                        f"Unknown aspect type {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                aspect_value = aspect_obj.get("value")
+                if aspect_value is None:
+                    logger.warning(
+                        f"Unknown aspect value for aspect {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                try:
+                    post_json_obj = post_json_transform(aspect_value)
+                    typed_aspect = aspect_class.from_obj(post_json_obj)
+                    assert isinstance(typed_aspect, aspect_class) and isinstance(
+                        typed_aspect, _Aspect
+                    )
+
+                    system_metadata = None
+                    if with_system_metadata:
+                        system_metadata_obj = aspect_obj.get("systemMetadata")
+                        if system_metadata_obj:
+                            system_metadata = SystemMetadataClass.from_obj(
+                                system_metadata_obj
+                            )
+
+                    entity_aspects[aspect_name] = (typed_aspect, system_metadata)
+                except Exception as e:
+                    logger.error(f"Error deserializing aspect {aspect_name}: {e}")
+                    raise
+
+            if entity_aspects:
+                result[entity_urn] = entity_aspects
+
+        return result
+
     def upsert_custom_assertion(
         self,
         urn: Optional[str],
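
The new DataHubGraph.get_entities above batches URNs against the OpenAPI v3 batchGet endpoint and deserializes each aspect into its typed class. A minimal usage sketch, assuming a reachable DataHub instance; the server address, URN, and aspect names are illustrative:

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

    # Illustrative URN and aspect names; any valid entity URN and aspect list works.
    urn = "urn:li:dataset:(urn:li:dataPlatform:hive,example_db.example_table,PROD)"
    entities = graph.get_entities(
        entity_name="dataset",
        urns=[urn],
        aspects=["datasetProperties", "status"],
        with_system_metadata=True,
    )
    for aspect_name, (aspect, system_metadata) in entities.get(urn, {}).items():
        # aspect is a typed aspect object; system_metadata is a SystemMetadataClass or None.
        print(aspect_name, type(aspect).__name__, system_metadata)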
datahub/ingestion/run/pipeline.py CHANGED
@@ -39,9 +39,6 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
-from datahub.ingestion.transformer.system_metadata_transformer import (
-    SystemMetadataTransformer,
-)
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
@@ -286,9 +283,6 @@ class Pipeline:
                 f"Transformer type:{transformer_type},{transformer_class} configured"
             )
 
-        # Add the system metadata transformer at the end of the list.
-        self.transformers.append(SystemMetadataTransformer(self.ctx))
-
     def _configure_reporting(self, report_to: Optional[str]) -> None:
         if self.dry_run:
             # In dry run mode, we don't want to report anything.
datahub/ingestion/source/aws/sagemaker_processors/models.py CHANGED
@@ -323,7 +323,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs.keys()
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -331,7 +331,7 @@ class ModelProcessor:
         model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs.keys()
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
@@ -368,7 +368,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs.keys()
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -376,7 +376,7 @@ class ModelProcessor:
        model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs.keys()
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
datahub/ingestion/source/bigquery_v2/lineage.py CHANGED
@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
                 memory_footprint.total_size(lineage)
             )
 
-        for lineage_key in lineage.keys():
+        for lineage_key in lineage:
             # For views, we do not use the upstreams obtained by parsing audit logs
             # as they may contain indirectly referenced tables.
             if (
datahub/ingestion/source/dynamodb/dynamodb.py CHANGED
@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item.keys():
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)
datahub/ingestion/source/fivetran/fivetran.py CHANGED
@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )
datahub/ingestion/source/fivetran/fivetran_log_api.py CHANGED
@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log.keys():
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue
datahub/ingestion/source/hex/constants.py CHANGED
@@ -1,3 +1,8 @@
+from datahub.metadata.urns import DataPlatformUrn
+
 HEX_PLATFORM_NAME = "hex"
+HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
 HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
 HEX_API_PAGE_SIZE_DEFAULT = 100
+
+DATAHUB_API_PAGE_SIZE_DEFAULT = 100
datahub/ingestion/source/hex/hex.py CHANGED
@@ -1,9 +1,12 @@
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
 
-from pydantic import Field, SecretStr
+from pydantic import Field, SecretStr, root_validator
 from typing_extensions import assert_never
 
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.hex.api import HexApi, HexApiReport
 from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
     HEX_API_BASE_URL_DEFAULT,
     HEX_API_PAGE_SIZE_DEFAULT,
     HEX_PLATFORM_NAME,
 )
 from datahub.ingestion.source.hex.mapper import Mapper
 from datahub.ingestion.source.hex.model import Component, Project
+from datahub.ingestion.source.hex.query_fetcher import (
+    HexQueryFetcher,
+    HexQueryFetcherReport,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
-    StatefulIngestionReport,
     StatefulIngestionSourceBase,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.sdk.main_client import DataHubClient
 
 
 class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
         default=True,
         description="Set ownership identity from owner/creator email",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
+    )
+    lineage_start_time: Optional[datetime] = Field(
+        default=None,
+        description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
+    )
+    lineage_end_time: Optional[datetime] = Field(
+        default=None,
+        description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
+    )
+    datahub_page_size: int = Field(
+        default=DATAHUB_API_PAGE_SIZE_DEFAULT,
+        description="Number of items to fetch per DataHub API call.",
+    )
+
+    @root_validator(pre=True)
+    def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
+        # lineage_end_time default = now
+        if "lineage_end_time" not in data or data["lineage_end_time"] is None:
+            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
+        # if string is given, parse it
+        if isinstance(data["lineage_end_time"], str):
+            data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_end_time"].tzinfo is None:
+            data["lineage_end_time"] = data["lineage_end_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
+        assert (
+            data["lineage_end_time"]
+            and isinstance(data["lineage_end_time"], datetime)
+            and data["lineage_end_time"].tzinfo is not None
+            and data["lineage_end_time"].tzinfo == timezone.utc
+        )
+
+        # lineage_start_time default = lineage_end_time - 1 day
+        if "lineage_start_time" not in data or data["lineage_start_time"] is None:
+            data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
+        # if string is given, parse it
+        if isinstance(data["lineage_start_time"], str):
+            data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_start_time"].tzinfo is None:
+            data["lineage_start_time"] = data["lineage_start_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
+        assert (
+            data["lineage_start_time"]
+            and isinstance(data["lineage_start_time"], datetime)
+            and data["lineage_start_time"].tzinfo is not None
+            and data["lineage_start_time"].tzinfo == timezone.utc
+        )
+
+        return data
 
 
-class HexReport(StaleEntityRemovalSourceReport, HexApiReport):
+@dataclass
+class HexReport(
+    StaleEntityRemovalSourceReport,
+    HexApiReport,
+    IngestionStageReport,
+    HexQueryFetcherReport,
+):
     pass
 
 
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
     def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
-        self.report = HexReport()
+        self.report: HexReport = HexReport()
         self.platform = HEX_PLATFORM_NAME
         self.hex_api = HexApi(
             report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
             categories_as_tags=self.source_config.categories_as_tags,
             set_ownership_from_email=self.source_config.set_ownership_from_email,
         )
+        self.project_registry: Dict[str, Project] = {}
+        self.component_registry: Dict[str, Component] = {}
+
+        self.datahub_client: Optional[DataHubClient] = None
+        self.query_fetcher: Optional[HexQueryFetcher] = None
+        if self.source_config.include_lineage:
+            graph = ctx.require_graph("Lineage")
+            assert self.source_config.lineage_start_time and isinstance(
+                self.source_config.lineage_start_time, datetime
+            )
+            assert self.source_config.lineage_end_time and isinstance(
+                self.source_config.lineage_end_time, datetime
+            )
+            self.datahub_client = DataHubClient(graph=graph)
+            self.query_fetcher = HexQueryFetcher(
+                datahub_client=self.datahub_client,
+                workspace_name=self.source_config.workspace_name,
+                start_datetime=self.source_config.lineage_start_time,
+                end_datetime=self.source_config.lineage_end_time,
+                report=self.report,
+                page_size=self.source_config.datahub_page_size,
+            )
 
     @classmethod
     def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_report(self) -> StatefulIngestionReport:
+    def get_report(self) -> HexReport:
         return self.report
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        yield from self.mapper.map_workspace()
-
-        for project_or_component in self.hex_api.fetch_projects():
-            if isinstance(project_or_component, Project):
-                if self.source_config.project_title_pattern.allowed(
-                    project_or_component.title
-                ):
-                    yield from self.mapper.map_project(project=project_or_component)
-            elif isinstance(project_or_component, Component):
-                if (
-                    self.source_config.include_components
-                    and self.source_config.component_title_pattern.allowed(
+        with self.report.new_stage("Fetch Hex assets from Hex API"):
+            for project_or_component in self.hex_api.fetch_projects():
+                if isinstance(project_or_component, Project):
+                    if self.source_config.project_title_pattern.allowed(
                         project_or_component.title
-                    )
-                ):
-                    yield from self.mapper.map_component(component=project_or_component)
-            else:
-                assert_never(project_or_component)
+                    ):
+                        self.project_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                elif isinstance(project_or_component, Component):
+                    if (
+                        self.source_config.include_components
+                        and self.source_config.component_title_pattern.allowed(
+                            project_or_component.title
+                        )
+                    ):
+                        self.component_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                else:
+                    assert_never(project_or_component)
+
+        if self.source_config.include_lineage:
+            assert self.datahub_client and self.query_fetcher
+
+            with self.report.new_stage(
+                "Fetch Hex lineage from existing Queries in DataHub"
+            ):
+                for query_metadata in self.query_fetcher.fetch():
+                    project = self.project_registry.get(query_metadata.hex_project_id)
+                    if project:
+                        project.upstream_datasets.extend(
+                            query_metadata.dataset_subjects
+                        )
+                        project.upstream_schema_fields.extend(
+                            query_metadata.schema_field_subjects
+                        )
+                    else:
+                        self.report.report_warning(
+                            title="Missing project for lineage",
+                            message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
+                            context=str(query_metadata),
+                        )
+
+        with self.report.new_stage("Emit"):
+            yield from self.mapper.map_workspace()
+
+            for project in self.project_registry.values():
+                yield from self.mapper.map_project(project=project)
+            for component in self.component_registry.values():
+                yield from self.mapper.map_component(component=component)
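
The lineage options added to HexSourceConfig above accept absolute or relative time strings and are normalized to UTC by the validate_lineage_times validator. A hedged configuration sketch, where the workspace name and token are placeholders and the set of required fields may vary by release:

    from datahub.ingestion.source.hex.hex import HexSourceConfig

    config = HexSourceConfig.parse_obj(
        {
            "workspace_name": "my-hex-workspace",  # placeholder
            "token": "hex-api-token-placeholder",  # placeholder
            "include_lineage": True,
            "lineage_start_time": "-7 days",  # relative form; absolute dates like "2023-01-01" also work
            # "lineage_end_time" omitted: defaults to the current time in UTC
            "datahub_page_size": 100,
        }
    )
    # Both bounds end up as timezone-aware UTC datetimes after validation.
    print(config.lineage_start_time, config.lineage_end_time)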