acryl-datahub 0.15.0.4rc2__py3-none-any.whl → 0.15.0.5__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.

Files changed (96)
  1. acryl_datahub-0.15.0.5.dist-info/LICENSE +202 -0
  2. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2444 -2404
  3. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +96 -86
  4. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
  5. datahub/__init__.py +1 -25
  6. datahub/_version.py +13 -0
  7. datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
  8. datahub/cli/check_cli.py +1 -1
  9. datahub/cli/cli_utils.py +3 -3
  10. datahub/cli/container_cli.py +1 -64
  11. datahub/cli/iceberg_cli.py +707 -0
  12. datahub/cli/ingest_cli.py +2 -2
  13. datahub/emitter/composite_emitter.py +36 -0
  14. datahub/emitter/rest_emitter.py +1 -1
  15. datahub/entrypoints.py +26 -5
  16. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  17. datahub/ingestion/api/registry.py +4 -2
  18. datahub/ingestion/glossary/classification_mixin.py +6 -0
  19. datahub/ingestion/glossary/classifier.py +3 -2
  20. datahub/ingestion/graph/client.py +2 -1
  21. datahub/ingestion/graph/entity_versioning.py +201 -0
  22. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  23. datahub/ingestion/run/connection.py +1 -1
  24. datahub/ingestion/run/pipeline.py +3 -3
  25. datahub/ingestion/source/abs/report.py +2 -2
  26. datahub/ingestion/source/apply/__init__.py +0 -0
  27. datahub/ingestion/source/apply/datahub_apply.py +223 -0
  28. datahub/ingestion/source/aws/glue.py +15 -6
  29. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  30. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  31. datahub/ingestion/source/dbt/dbt_core.py +1 -1
  32. datahub/ingestion/source/delta_lake/report.py +2 -2
  33. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  34. datahub/ingestion/source/elastic_search.py +2 -1
  35. datahub/ingestion/source/ge_profiling_config.py +11 -7
  36. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  37. datahub/ingestion/source/identity/azure_ad.py +6 -14
  38. datahub/ingestion/source/identity/okta.py +2 -1
  39. datahub/ingestion/source/kafka/kafka.py +2 -1
  40. datahub/ingestion/source/kafka_connect/common.py +2 -1
  41. datahub/ingestion/source/ldap.py +2 -1
  42. datahub/ingestion/source/looker/looker_config.py +3 -1
  43. datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
  44. datahub/ingestion/source/looker/looker_file_loader.py +14 -3
  45. datahub/ingestion/source/looker/looker_template_language.py +104 -14
  46. datahub/ingestion/source/looker/lookml_config.py +29 -8
  47. datahub/ingestion/source/looker/lookml_source.py +110 -22
  48. datahub/ingestion/source/mode.py +2 -4
  49. datahub/ingestion/source/mongodb.py +2 -1
  50. datahub/ingestion/source/nifi.py +2 -1
  51. datahub/ingestion/source/powerbi/config.py +2 -2
  52. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  53. datahub/ingestion/source/redash.py +5 -5
  54. datahub/ingestion/source/salesforce.py +4 -1
  55. datahub/ingestion/source/slack/slack.py +6 -0
  56. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  57. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  58. datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
  59. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
  61. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  62. datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
  63. datahub/ingestion/source/sql/clickhouse.py +5 -43
  64. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  65. datahub/ingestion/source/sql/mssql/source.py +17 -0
  66. datahub/ingestion/source/sql/sql_config.py +0 -10
  67. datahub/ingestion/source/tableau/tableau.py +16 -13
  68. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  69. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  70. datahub/ingestion/source/unity/proxy.py +2 -2
  71. datahub/ingestion/source/unity/report.py +1 -0
  72. datahub/ingestion/source_config/operation_config.py +9 -0
  73. datahub/ingestion/source_report/pulsar.py +5 -4
  74. datahub/metadata/_schema_classes.py +304 -6
  75. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  78. datahub/metadata/schema.avsc +211 -12
  79. datahub/metadata/schemas/AssertionInfo.avsc +2 -2
  80. datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
  81. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  82. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  83. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  84. datahub/metadata/schemas/Deprecation.avsc +12 -0
  85. datahub/metadata/schemas/DisplayProperties.avsc +62 -0
  86. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  87. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
  89. datahub/metadata/schemas/PostInfo.avsc +28 -2
  90. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  91. datahub/specific/dashboard.py +43 -1
  92. datahub/telemetry/telemetry.py +4 -4
  93. datahub/testing/check_imports.py +28 -0
  94. datahub/upgrade/upgrade.py +17 -9
  95. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
  96. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataprocess/dataprocess_instance.py CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, Dict, Iterable, List, Optional, Union, cast
 from datahub.api.entities.datajob import DataFlow, DataJob
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import DatahubKey
+from datahub.emitter.mcp_builder import ContainerKey, DatahubKey
 from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
     DataProcessInstanceInput,
     DataProcessInstanceOutput,
@@ -15,11 +15,15 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
+    ContainerClass,
+    DataPlatformInstanceClass,
     DataProcessInstanceRunEventClass,
     DataProcessInstanceRunResultClass,
     DataProcessRunStatusClass,
     DataProcessTypeClass,
+    SubTypesClass,
 )
+from datahub.metadata.urns import DataPlatformInstanceUrn, DataPlatformUrn
 from datahub.utilities.str_enum import StrEnum
 from datahub.utilities.urns.data_flow_urn import DataFlowUrn
 from datahub.utilities.urns.data_job_urn import DataJobUrn
@@ -42,7 +46,7 @@ class InstanceRunResult(StrEnum):

 @dataclass
 class DataProcessInstance:
-    """This is a DataProcessInstance class which represent an instance of a DataFlow or DataJob.
+    """This is a DataProcessInstance class which represents an instance of a DataFlow, DataJob, or a standalone process within a Container.

     Args:
         id: The id of the dataprocess instance execution.
@@ -71,6 +75,10 @@ class DataProcessInstance:
     _template_object: Optional[Union[DataJob, DataFlow]] = field(
         init=False, default=None, repr=False
     )
+    data_platform_instance: Optional[str] = None
+    subtype: Optional[str] = None
+    container_urn: Optional[str] = None
+    _platform: Optional[str] = field(init=False, repr=False, default=None)

     def __post_init__(self):
         self.urn = DataProcessInstanceUrn(
@@ -80,6 +88,28 @@ class DataProcessInstance:
                 id=self.id,
             ).guid()
         )
+        self._platform = self.orchestrator
+
+        try:
+            # We first try to create from string assuming its an urn
+            self._platform = str(DataPlatformUrn.from_string(self._platform))
+        except Exception:
+            # If it fails, we assume its an id
+            self._platform = str(DataPlatformUrn(self._platform))
+
+        if self.data_platform_instance is not None:
+            try:
+                # We first try to create from string assuming its an urn
+                self.data_platform_instance = str(
+                    DataPlatformInstanceUrn.from_string(self.data_platform_instance)
+                )
+            except Exception:
+                # If it fails, we assume its an id
+                self.data_platform_instance = str(
+                    DataPlatformInstanceUrn(
+                        platform=self._platform, instance=self.data_platform_instance
+                    )
+                )

     def start_event_mcp(
         self, start_timestamp_millis: int, attempt: Optional[int] = None
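Note on the new __post_init__ logic above: both orchestrator and data_platform_instance accept either a bare platform id or an already-formed urn, and both are normalized to urn strings. A minimal illustration with hypothetical values, assuming the remaining dataclass fields keep their defaults:

from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance

# A bare id is coerced: "airflow" becomes urn:li:dataPlatform:airflow, and the
# instance name is wrapped into a dataPlatformInstance urn scoped to that platform.
dpi = DataProcessInstance(
    id="nightly_training_2024_01_01",
    orchestrator="airflow",
    data_platform_instance="prod_scheduler",
)

# Fully formed urns are detected via DataPlatformUrn.from_string and kept as-is.
dpi_from_urns = DataProcessInstance(
    id="nightly_training_2024_01_02",
    orchestrator="urn:li:dataPlatform:airflow",
    data_platform_instance="urn:li:dataPlatformInstance:(urn:li:dataPlatform:airflow,prod_scheduler)",
)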
@@ -269,6 +299,29 @@ class DataProcessInstance:
         )
         yield mcp

+        assert self._platform
+        if self.data_platform_instance:
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=str(self.urn),
+                aspect=DataPlatformInstanceClass(
+                    platform=self._platform, instance=self.data_platform_instance
+                ),
+            )
+            yield mcp
+
+        if self.subtype:
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=str(self.urn), aspect=SubTypesClass(typeNames=[self.subtype])
+            )
+            yield mcp
+
+        if self.container_urn:
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=str(self.urn),
+                aspect=ContainerClass(container=self.container_urn),
+            )
+            yield mcp
+
         yield from self.generate_inlet_outlet_mcp(materialize_iolets=materialize_iolets)

     @staticmethod
@@ -309,13 +362,20 @@ class DataProcessInstance:
         clone_outlets: bool = False,
     ) -> "DataProcessInstance":
         """
-        Generates DataProcessInstance from a DataJob
+        Generates a DataProcessInstance from a given DataJob.

-        :param datajob: (DataJob) the datajob from generate the DataProcessInstance
-        :param id: (str) the id for the DataProcessInstance
-        :param clone_inlets: (bool) whether to clone datajob's inlets
-        :param clone_outlets: (bool) whether to clone datajob's outlets
-        :return: DataProcessInstance
+        This method creates a DataProcessInstance object using the provided DataJob
+        and assigns it a unique identifier. Optionally, it can clone the inlets and
+        outlets from the DataJob to the DataProcessInstance.
+
+        Args:
+            datajob (DataJob): The DataJob instance from which to generate the DataProcessInstance.
+            id (str): The unique identifier for the DataProcessInstance.
+            clone_inlets (bool, optional): If True, clones the inlets from the DataJob to the DataProcessInstance. Defaults to False.
+            clone_outlets (bool, optional): If True, clones the outlets from the DataJob to the DataProcessInstance. Defaults to False.
+
+        Returns:
+            DataProcessInstance: The generated DataProcessInstance object.
         """
         dpi: DataProcessInstance = DataProcessInstance(
             orchestrator=datajob.flow_urn.orchestrator,
@@ -332,14 +392,47 @@ class DataProcessInstance:
         return dpi

     @staticmethod
-    def from_dataflow(dataflow: DataFlow, id: str) -> "DataProcessInstance":
+    def from_container(
+        container_key: ContainerKey,
+        id: str,
+    ) -> "DataProcessInstance":
         """
-        Generates DataProcessInstance from a DataFlow
+        Create a DataProcessInstance that is located within a Container.
+        Use this method when you need to represent a DataProcessInstance that
+        is not an instance of a DataJob or a DataFlow.
+        e.g. If recording an ad-hoc training run that is just associated with an Experiment.

-        :param dataflow: (DataFlow) the DataFlow from generate the DataProcessInstance
+        :param container_key: (ContainerKey) the container key to generate the DataProcessInstance
         :param id: (str) the id for the DataProcessInstance
         :return: DataProcessInstance
         """
+        dpi: DataProcessInstance = DataProcessInstance(
+            id=id,
+            orchestrator=DataPlatformUrn.from_string(
+                container_key.platform
+            ).platform_name,
+            template_urn=None,
+            container_urn=container_key.as_urn(),
+        )
+
+        return dpi
+
+    @staticmethod
+    def from_dataflow(dataflow: DataFlow, id: str) -> "DataProcessInstance":
+        """
+        Creates a DataProcessInstance from a given DataFlow.
+
+        This method generates a DataProcessInstance object using the provided DataFlow
+        and a specified id. The DataProcessInstance will inherit properties from the
+        DataFlow such as orchestrator, environment, and template URN.
+
+        Args:
+            dataflow (DataFlow): The DataFlow object from which to generate the DataProcessInstance.
+            id (str): The unique identifier for the DataProcessInstance.
+
+        Returns:
+            DataProcessInstance: The newly created DataProcessInstance object.
+        """
         dpi = DataProcessInstance(
             id=id,
             orchestrator=dataflow.orchestrator,
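Usage sketch for the new from_container path, combining it with the subtype and container aspects emitted by generate_mcp above. The key values and server address are placeholders; it assumes ContainerKey (from datahub.emitter.mcp_builder) can be built directly with a full platform urn, and that generate_mcp keeps its created_ts_millis / materialize_iolets parameters:

from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance
from datahub.emitter.mcp_builder import ContainerKey
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Hypothetical key; real code would usually use a ContainerKey subclass whose
# extra fields identify the container (an experiment, database, project, ...).
experiment_key = ContainerKey(platform="urn:li:dataPlatform:mlflow", instance="PROD")

# A standalone run that belongs only to the container, not to a DataJob or DataFlow.
dpi = DataProcessInstance.from_container(container_key=experiment_key, id="training-run-42")
dpi.subtype = "ML Training Run"  # emitted as a subTypes aspect

emitter = DatahubRestEmitter("http://localhost:8080")
for mcp in dpi.generate_mcp(created_ts_millis=None, materialize_iolets=False):
    emitter.emit(mcp)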
datahub/cli/check_cli.py CHANGED
@@ -9,7 +9,7 @@ from typing import Dict, List, Optional, Union

 import click

-from datahub import __package_name__
+from datahub._version import __package_name__
 from datahub.cli.json_file import check_mce_file
 from datahub.configuration import config_loader
 from datahub.configuration.common import AllowDenyPattern
datahub/cli/cli_utils.py CHANGED
@@ -9,7 +9,7 @@ import click
 import requests
 from requests.sessions import Session

-import datahub
+import datahub._version as datahub_version
 from datahub.cli import config_utils
 from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -422,5 +422,5 @@ def ensure_has_system_metadata(
     if metadata.properties is None:
         metadata.properties = {}
     props = metadata.properties
-    props["clientId"] = datahub.__package_name__
-    props["clientVersion"] = datahub.__version__
+    props["clientId"] = datahub_version.__package_name__
+    props["clientVersion"] = datahub_version.__version__
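The version constants these modules now consume live in the new datahub/_version.py (file 6 in the list above) rather than the datahub package root, presumably so that importing the package root stays lightweight. The new import surface, as used in the hunks above:

from datahub._version import __package_name__, __version__

print(f"{__package_name__} {__version__}")  # e.g. acryl-datahub 0.15.0.5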
datahub/cli/container_cli.py CHANGED
@@ -1,19 +1,8 @@
 import logging
-from typing import Any, List

 import click
-import progressbar

-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.graph.client import get_default_graph
-from datahub.metadata.schema_classes import (
-    DomainsClass,
-    GlossaryTermAssociationClass,
-    OwnerClass,
-    OwnershipTypeClass,
-    TagAssociationClass,
-)
-from datahub.specific.dataset import DatasetPatchBuilder
+from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container

 logger = logging.getLogger(__name__)

 
@@ -24,58 +13,6 @@ def container() -> None:
24
13
  pass
25
14
 
26
15
 
27
- def apply_association_to_container(
28
- container_urn: str,
29
- association_urn: str,
30
- association_type: str,
31
- ) -> None:
32
- """
33
- Common function to add either tags, terms, domains, or owners to child datasets (for now).
34
-
35
- Args:
36
- container_urn: The URN of the container
37
- association_urn: The URN of the tag, term, or user to apply
38
- association_type: One of 'tag', 'term', 'domain' or 'owner'
39
- """
40
- urns: List[str] = []
41
- graph = get_default_graph()
42
- logger.info(f"Using {graph}")
43
- urns.extend(
44
- graph.get_urns_by_filter(
45
- container=container_urn, batch_size=1000, entity_types=["dataset"]
46
- )
47
- )
48
-
49
- all_patches: List[Any] = []
50
- for urn in urns:
51
- builder = DatasetPatchBuilder(urn)
52
- patches: List[Any] = []
53
- if association_type == "tag":
54
- patches = builder.add_tag(TagAssociationClass(association_urn)).build()
55
- elif association_type == "term":
56
- patches = builder.add_term(
57
- GlossaryTermAssociationClass(association_urn)
58
- ).build()
59
- elif association_type == "owner":
60
- patches = builder.add_owner(
61
- OwnerClass(
62
- owner=association_urn,
63
- type=OwnershipTypeClass.TECHNICAL_OWNER,
64
- )
65
- ).build()
66
- elif association_type == "domain":
67
- patches = [
68
- MetadataChangeProposalWrapper(
69
- entityUrn=urn,
70
- aspect=DomainsClass(domains=[association_urn]),
71
- )
72
- ]
73
- all_patches.extend(patches)
74
- mcps_iter = progressbar.progressbar(all_patches, redirect_stdout=True)
75
- for mcp in mcps_iter:
76
- graph.emit(mcp)
77
-
78
-
79
16
  @container.command()
80
17
  @click.option("--container-urn", required=True, type=str)
81
18
  @click.option("--tag-urn", required=True, type=str)
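apply_association_to_container now lives in the new datahub/ingestion/source/apply/datahub_apply.py module (file 27 above), and the CLI simply imports it. A minimal sketch of calling the relocated helper directly, assuming it keeps the signature shown in the removed code; the urns below are placeholders:

from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container

# Tags every dataset inside the container, resolving children through the
# default DataHub graph client and emitting one patch per dataset.
apply_association_to_container(
    container_urn="urn:li:container:my_container_guid",
    association_urn="urn:li:tag:pii",
    association_type="tag",  # one of 'tag', 'term', 'domain' or 'owner'
)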