acryl-datahub 0.15.0.4rc3__py3-none-any.whl → 0.15.0.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registry.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2507 -2470
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +95 -86
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -25
- datahub/_version.py +13 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
- datahub/cli/check_cli.py +1 -1
- datahub/cli/cli_utils.py +3 -3
- datahub/cli/container_cli.py +1 -64
- datahub/cli/iceberg_cli.py +707 -0
- datahub/cli/ingest_cli.py +2 -2
- datahub/emitter/composite_emitter.py +36 -0
- datahub/emitter/rest_emitter.py +1 -1
- datahub/entrypoints.py +26 -5
- datahub/ingestion/api/incremental_lineage_helper.py +4 -0
- datahub/ingestion/api/registry.py +1 -1
- datahub/ingestion/glossary/classification_mixin.py +6 -0
- datahub/ingestion/glossary/classifier.py +3 -2
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/run/connection.py +1 -1
- datahub/ingestion/run/pipeline.py +3 -3
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/apply/__init__.py +0 -0
- datahub/ingestion/source/apply/datahub_apply.py +223 -0
- datahub/ingestion/source/aws/glue.py +5 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/dbt/dbt_core.py +1 -1
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/azure_ad.py +6 -14
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/looker_config.py +3 -1
- datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
- datahub/ingestion/source/looker/looker_file_loader.py +14 -3
- datahub/ingestion/source/looker/looker_template_language.py +104 -14
- datahub/ingestion/source/looker/lookml_config.py +29 -8
- datahub/ingestion/source/looker/lookml_source.py +110 -22
- datahub/ingestion/source/mode.py +2 -4
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
- datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
- datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
- datahub/ingestion/source/sql/clickhouse.py +5 -43
- datahub/ingestion/source/sql/mssql/job_models.py +37 -8
- datahub/ingestion/source/sql/mssql/source.py +17 -0
- datahub/ingestion/source/sql/sql_config.py +0 -10
- datahub/ingestion/source/tableau/tableau.py +16 -13
- datahub/ingestion/source/tableau/tableau_common.py +1 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/proxy.py +2 -2
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_config/operation_config.py +9 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- datahub/metadata/_schema_classes.py +304 -6
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
- datahub/metadata/schema.avsc +211 -12
- datahub/metadata/schemas/AssertionInfo.avsc +2 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
- datahub/metadata/schemas/DashboardInfo.avsc +5 -5
- datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +12 -0
- datahub/metadata/schemas/DisplayProperties.avsc +62 -0
- datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
- datahub/metadata/schemas/PostInfo.avsc +28 -2
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/specific/dashboard.py +43 -1
- datahub/telemetry/telemetry.py +4 -4
- datahub/testing/check_imports.py +28 -0
- datahub/upgrade/upgrade.py +17 -9
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataprocess/dataprocess_instance.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, Dict, Iterable, List, Optional, Union, cast
 from datahub.api.entities.datajob import DataFlow, DataJob
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import DatahubKey
+from datahub.emitter.mcp_builder import ContainerKey, DatahubKey
 from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
     DataProcessInstanceInput,
     DataProcessInstanceOutput,
@@ -15,11 +15,15 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
+    ContainerClass,
+    DataPlatformInstanceClass,
     DataProcessInstanceRunEventClass,
     DataProcessInstanceRunResultClass,
     DataProcessRunStatusClass,
     DataProcessTypeClass,
+    SubTypesClass,
 )
+from datahub.metadata.urns import DataPlatformInstanceUrn, DataPlatformUrn
 from datahub.utilities.str_enum import StrEnum
 from datahub.utilities.urns.data_flow_urn import DataFlowUrn
 from datahub.utilities.urns.data_job_urn import DataJobUrn
@@ -42,7 +46,7 @@ class InstanceRunResult(StrEnum):
 
 @dataclass
 class DataProcessInstance:
-    """This is a DataProcessInstance class which represents an instance of a DataFlow or DataJob.
+    """This is a DataProcessInstance class which represents an instance of a DataFlow, DataJob, or a standalone process within a Container.
 
     Args:
         id: The id of the dataprocess instance execution.
@@ -71,6 +75,10 @@ class DataProcessInstance:
     _template_object: Optional[Union[DataJob, DataFlow]] = field(
         init=False, default=None, repr=False
     )
+    data_platform_instance: Optional[str] = None
+    subtype: Optional[str] = None
+    container_urn: Optional[str] = None
+    _platform: Optional[str] = field(init=False, repr=False, default=None)
 
     def __post_init__(self):
         self.urn = DataProcessInstanceUrn(
@@ -80,6 +88,28 @@ class DataProcessInstance:
                 id=self.id,
             ).guid()
         )
+        self._platform = self.orchestrator
+
+        try:
+            # We first try to create from string assuming its an urn
+            self._platform = str(DataPlatformUrn.from_string(self._platform))
+        except Exception:
+            # If it fails, we assume its an id
+            self._platform = str(DataPlatformUrn(self._platform))
+
+        if self.data_platform_instance is not None:
+            try:
+                # We first try to create from string assuming its an urn
+                self.data_platform_instance = str(
+                    DataPlatformInstanceUrn.from_string(self.data_platform_instance)
+                )
+            except Exception:
+                # If it fails, we assume its an id
+                self.data_platform_instance = str(
+                    DataPlatformInstanceUrn(
+                        platform=self._platform, instance=self.data_platform_instance
+                    )
+                )
 
     def start_event_mcp(
         self, start_timestamp_millis: int, attempt: Optional[int] = None
@@ -269,6 +299,29 @@ class DataProcessInstance:
             )
             yield mcp
 
+        assert self._platform
+        if self.data_platform_instance:
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=str(self.urn),
+                aspect=DataPlatformInstanceClass(
+                    platform=self._platform, instance=self.data_platform_instance
+                ),
+            )
+            yield mcp
+
+        if self.subtype:
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=str(self.urn), aspect=SubTypesClass(typeNames=[self.subtype])
+            )
+            yield mcp
+
+        if self.container_urn:
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=str(self.urn),
+                aspect=ContainerClass(container=self.container_urn),
+            )
+            yield mcp
+
         yield from self.generate_inlet_outlet_mcp(materialize_iolets=materialize_iolets)
 
     @staticmethod
@@ -309,13 +362,20 @@ class DataProcessInstance:
         clone_outlets: bool = False,
     ) -> "DataProcessInstance":
         """
-        Generates DataProcessInstance from a DataJob
+        Generates a DataProcessInstance from a given DataJob.
 
-        :param datajob: (DataJob) the datajob from which the DataProcessInstance will be generated
-        :param id: (str) the id for the DataProcessInstance
-        :param clone_inlets: (bool) whether to clone the datajob's inlets
-        :param clone_outlets: (bool) whether to clone the datajob's outlets
-        :return: DataProcessInstance
+        This method creates a DataProcessInstance object using the provided DataJob
+        and assigns it a unique identifier. Optionally, it can clone the inlets and
+        outlets from the DataJob to the DataProcessInstance.
+
+        Args:
+            datajob (DataJob): The DataJob instance from which to generate the DataProcessInstance.
+            id (str): The unique identifier for the DataProcessInstance.
+            clone_inlets (bool, optional): If True, clones the inlets from the DataJob to the DataProcessInstance. Defaults to False.
+            clone_outlets (bool, optional): If True, clones the outlets from the DataJob to the DataProcessInstance. Defaults to False.
+
+        Returns:
+            DataProcessInstance: The generated DataProcessInstance object.
         """
         dpi: DataProcessInstance = DataProcessInstance(
             orchestrator=datajob.flow_urn.orchestrator,
@@ -332,14 +392,47 @@ class DataProcessInstance:
         return dpi
 
     @staticmethod
-    def from_dataflow(dataflow: DataFlow, id: str) -> "DataProcessInstance":
+    def from_container(
+        container_key: ContainerKey,
+        id: str,
+    ) -> "DataProcessInstance":
         """
-        Generates DataProcessInstance from a DataFlow
+        Create a DataProcessInstance that is located within a Container.
+        Use this method when you need to represent a DataProcessInstance that
+        is not an instance of a DataJob or a DataFlow.
+        e.g. If recording an ad-hoc training run that is just associated with an Experiment.
 
-        :param dataflow: (DataFlow) the dataflow from which the DataProcessInstance will be generated
+        :param container_key: (ContainerKey) the container key to generate the DataProcessInstance
        :param id: (str) the id for the DataProcessInstance
        :return: DataProcessInstance
        """
+        dpi: DataProcessInstance = DataProcessInstance(
+            id=id,
+            orchestrator=DataPlatformUrn.from_string(
+                container_key.platform
+            ).platform_name,
+            template_urn=None,
+            container_urn=container_key.as_urn(),
+        )
+
+        return dpi
+
+    @staticmethod
+    def from_dataflow(dataflow: DataFlow, id: str) -> "DataProcessInstance":
+        """
+        Creates a DataProcessInstance from a given DataFlow.
+
+        This method generates a DataProcessInstance object using the provided DataFlow
+        and a specified id. The DataProcessInstance will inherit properties from the
+        DataFlow such as orchestrator, environment, and template URN.
+
+        Args:
+            dataflow (DataFlow): The DataFlow object from which to generate the DataProcessInstance.
+            id (str): The unique identifier for the DataProcessInstance.
+
+        Returns:
+            DataProcessInstance: The newly created DataProcessInstance object.
+        """
         dpi = DataProcessInstance(
             id=id,
             orchestrator=dataflow.orchestrator,
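
For context, here is a minimal sketch of how the new container/subtype support might be used. The ExperimentKey subclass and all ids/urns below are illustrative, and generate_mcp's parameter names are assumed from this version's source:

from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance
from datahub.emitter.mcp_builder import ContainerKey

# Hypothetical key: ContainerKey is a base key model, so we subclass it with
# an identity field for the example container.
class ExperimentKey(ContainerKey):
    id: str

# A standalone run parented to a container instead of a DataJob/DataFlow.
dpi = DataProcessInstance.from_container(
    container_key=ExperimentKey(
        platform="urn:li:dataPlatform:mlflow", id="airline_forecast_experiment"
    ),
    id="training_run_2025_01_01",
)
dpi.subtype = "ML Training Run"  # emitted as a SubTypes aspect

# Alongside the usual DPI aspects, generate_mcp() now also yields
# DataPlatformInstance / SubTypes / Container aspects when those fields are set.
for mcp in dpi.generate_mcp(created_ts_millis=None, materialize_iolets=False):
    print(mcp.entityUrn, type(mcp.aspect).__name__)
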
datahub/cli/check_cli.py
CHANGED
@@ -9,7 +9,7 @@ from typing import Dict, List, Optional, Union
 
 import click
 
-from datahub import __package_name__
+from datahub._version import __package_name__
 from datahub.cli.json_file import check_mce_file
 from datahub.configuration import config_loader
 from datahub.configuration.common import AllowDenyPattern
datahub/cli/cli_utils.py
CHANGED
@@ -9,7 +9,7 @@ import click
 import requests
 from requests.sessions import Session
 
-import datahub
+import datahub._version as datahub_version
 from datahub.cli import config_utils
 from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -422,5 +422,5 @@ def ensure_has_system_metadata(
     if metadata.properties is None:
         metadata.properties = {}
     props = metadata.properties
-    props["clientId"] = datahub.__package_name__
-    props["clientVersion"] = datahub.__version__
+    props["clientId"] = datahub_version.__package_name__
+    props["clientVersion"] = datahub_version.__version__
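
Both CLI changes above follow the same pattern: version metadata moved from the datahub package root into the new datahub/_version.py module (added in this release, per the file list). A minimal sketch of the new import path, assuming a local install:

import datahub._version as datahub_version

# The two values that ensure_has_system_metadata() stamps onto each event
# as clientId / clientVersion.
print(datahub_version.__package_name__)  # "acryl-datahub"
print(datahub_version.__version__)       # e.g. "0.15.0.5"
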
datahub/cli/container_cli.py
CHANGED
@@ -1,19 +1,8 @@
 import logging
-from typing import Any, List
 
 import click
-import progressbar
 
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.graph.client import get_default_graph
-from datahub.metadata.schema_classes import (
-    DomainsClass,
-    GlossaryTermAssociationClass,
-    OwnerClass,
-    OwnershipTypeClass,
-    TagAssociationClass,
-)
-from datahub.specific.dataset import DatasetPatchBuilder
+from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container
 
 logger = logging.getLogger(__name__)
 
@@ -24,58 +13,6 @@ def container() -> None:
     pass
 
 
-def apply_association_to_container(
-    container_urn: str,
-    association_urn: str,
-    association_type: str,
-) -> None:
-    """
-    Common function to add either tags, terms, domains, or owners to child datasets (for now).
-
-    Args:
-        container_urn: The URN of the container
-        association_urn: The URN of the tag, term, or user to apply
-        association_type: One of 'tag', 'term', 'domain' or 'owner'
-    """
-    urns: List[str] = []
-    graph = get_default_graph()
-    logger.info(f"Using {graph}")
-    urns.extend(
-        graph.get_urns_by_filter(
-            container=container_urn, batch_size=1000, entity_types=["dataset"]
-        )
-    )
-
-    all_patches: List[Any] = []
-    for urn in urns:
-        builder = DatasetPatchBuilder(urn)
-        patches: List[Any] = []
-        if association_type == "tag":
-            patches = builder.add_tag(TagAssociationClass(association_urn)).build()
-        elif association_type == "term":
-            patches = builder.add_term(
-                GlossaryTermAssociationClass(association_urn)
-            ).build()
-        elif association_type == "owner":
-            patches = builder.add_owner(
-                OwnerClass(
-                    owner=association_urn,
-                    type=OwnershipTypeClass.TECHNICAL_OWNER,
-                )
-            ).build()
-        elif association_type == "domain":
-            patches = [
-                MetadataChangeProposalWrapper(
-                    entityUrn=urn,
-                    aspect=DomainsClass(domains=[association_urn]),
-                )
-            ]
-        all_patches.extend(patches)
-    mcps_iter = progressbar.progressbar(all_patches, redirect_stdout=True)
-    for mcp in mcps_iter:
-        graph.emit(mcp)
-
-
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--tag-urn", required=True, type=str)
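
The helper removed here was relocated rather than deleted: it now lives in the new datahub/ingestion/source/apply/datahub_apply.py module (see the file list above), which the CLI imports. A minimal sketch of calling the relocated function directly, with illustrative urns; it resolves child datasets through the default graph client, so a configured DataHub connection is assumed:

from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container

# Applies the tag to every dataset inside the container, emitting patches
# through the default DataHub graph connection.
apply_association_to_container(
    container_urn="urn:li:container:0123456789abcdef",  # illustrative
    association_urn="urn:li:tag:PII",                   # illustrative
    association_type="tag",  # one of: "tag", "term", "domain", "owner"
)
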