acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/datajob/datajob.py +17 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/emitter/mcp_builder.py +4 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +4 -1
- datahub/ingestion/api/source_helpers.py +26 -1
- datahub/ingestion/graph/client.py +104 -0
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +300 -0
- datahub/ingestion/source/iceberg/iceberg.py +106 -18
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +6 -7
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +16 -3
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/sigma/sigma.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
@@ -36,12 +36,14 @@
       "type": {
         "type": "enum",
         "symbolDocs": {
-          "SQL": "A SQL Query"
+          "SQL": "A SQL Query",
+          "UNKNOWN": "Unknown query language"
         },
         "name": "QueryLanguage",
         "namespace": "com.linkedin.pegasus2avro.query",
         "symbols": [
-          "SQL"
+          "SQL",
+          "UNKNOWN"
         ]
       },
       "name": "language",
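
The QueryLanguage enum gains an UNKNOWN symbol. For orientation, a minimal sketch of using it from the generated Python bindings, assuming the QueryLanguageClass / QueryStatementClass names in datahub.metadata.schema_classes follow the usual codegen pattern for this aspect:

# Sketch only: assumes the generated QueryLanguageClass / QueryStatementClass
# bindings mirror the Avro definition above; the query text is illustrative.
from datahub.metadata.schema_classes import QueryLanguageClass, QueryStatementClass

statement = QueryStatementClass(
    value="CALL refresh_orders()",         # raw query text
    language=QueryLanguageClass.UNKNOWN,   # new symbol added in this release
)
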
datahub/metadata/schemas/SystemMetadata.avsc
@@ -0,0 +1,86 @@
+{
+  "type": "record",
+  "Aspect": {
+    "name": "systemMetadata"
+  },
+  "name": "SystemMetadata",
+  "namespace": "com.linkedin.pegasus2avro.mxe",
+  "fields": [
+    {
+      "type": [
+        "long",
+        "null"
+      ],
+      "name": "lastObserved",
+      "default": 0,
+      "doc": "The timestamp the metadata was observed at"
+    },
+    {
+      "type": [
+        "string",
+        "null"
+      ],
+      "name": "runId",
+      "default": "no-run-id-provided",
+      "doc": "The original run id that produced the metadata. Populated in case of batch-ingestion."
+    },
+    {
+      "type": [
+        "string",
+        "null"
+      ],
+      "name": "lastRunId",
+      "default": "no-run-id-provided",
+      "doc": "The last run id that produced the metadata. Populated in case of batch-ingestion."
+    },
+    {
+      "type": [
+        "null",
+        "string"
+      ],
+      "name": "pipelineName",
+      "default": null,
+      "doc": "The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion."
+    },
+    {
+      "type": [
+        "null",
+        "string"
+      ],
+      "name": "registryName",
+      "default": null,
+      "doc": "The model registry name that was used to process this event"
+    },
+    {
+      "type": [
+        "null",
+        "string"
+      ],
+      "name": "registryVersion",
+      "default": null,
+      "doc": "The model registry version that was used to process this event"
+    },
+    {
+      "type": [
+        "null",
+        {
+          "type": "map",
+          "values": "string"
+        }
+      ],
+      "name": "properties",
+      "default": null,
+      "doc": "Additional properties"
+    },
+    {
+      "type": [
+        "null",
+        "string"
+      ],
+      "name": "version",
+      "default": null,
+      "doc": "Aspect version\n Initial implementation will use the aspect version's number, however stored as\n a string in the case where a different aspect versioning scheme is later adopted."
+    }
+  ],
+  "doc": "Metadata associated with each metadata change that is processed by the system"
+}
datahub/testing/mcp_diff.py
CHANGED
@@ -189,7 +189,7 @@ class MCPDiff:
         """
         aspect_diffs = [v for d in self.aspect_changes.values() for v in d.values()]
         for aspect_diff in aspect_diffs:
-            for _, old, new in aspect_diff.aspects_changed
+            for _, old, new in aspect_diff.aspects_changed:
                 golden[old.delta_info.idx] = new.delta_info.original
 
         indices_to_remove = set()
datahub/utilities/file_backed_collections.py
@@ -250,7 +250,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                 key TEXT UNIQUE,
                 value BLOB
-                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns
+                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns)}
             )"""
         )
 
@@ -267,7 +267,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         if self.indexes_created:
             return
         # The key column will automatically be indexed, but we need indexes for the extra columns.
-        for column_name in self.extra_columns
+        for column_name in self.extra_columns:
             self._conn.execute(
                 f"CREATE INDEX {self.tablename}_{column_name} ON {self.tablename} ({column_name})"
             )
@@ -305,12 +305,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             f"""INSERT INTO {self.tablename} (
                 key,
                 value
-                {"".join(f", {column_name}" for column_name in self.extra_columns
+                {"".join(f", {column_name}" for column_name in self.extra_columns)}
             )
             VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
             ON CONFLICT (key) DO UPDATE SET
                 value = excluded.value
-                {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns
+                {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns)}
             """,
             items_to_write,
         )
@@ -321,7 +321,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             f"""INSERT INTO {self.tablename} (
                 key,
                 value
-                {"".join(f", {column_name}" for column_name in self.extra_columns
+                {"".join(f", {column_name}" for column_name in self.extra_columns)}
             )
             VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
             item,
@@ -330,7 +330,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         self._conn.execute(
             f"""UPDATE {self.tablename} SET
                 value = ?
-                {"".join(f", {column_name} = ?" for column_name in self.extra_columns
+                {"".join(f", {column_name} = ?" for column_name in self.extra_columns)}
             WHERE key = ?""",
             (*item[1:], item[0]),
         )
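
All of the fixes above repair the SQL fragments that FileBackedDict builds from its extra_columns mapping. A minimal usage sketch, assuming the constructor keywords (serializer, deserializer, extra_columns) are unchanged from previous releases:

# Sketch only: assumes FileBackedDict still accepts an extra_columns mapping of
# column name -> function deriving that column's value from the stored object.
import json

from datahub.utilities.file_backed_collections import FileBackedDict

counts = FileBackedDict[dict](
    serializer=json.dumps,    # how values are written to the backing SQLite table
    deserializer=json.loads,  # how values are read back
    extra_columns={"platform": lambda record: record.get("platform")},
)
counts["urn:li:dataset:example"] = {"platform": "hive", "rows": 42}
# The extra column is persisted alongside the serialized value, so lookups can
# filter on it in SQL without deserializing every stored object.
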
datahub/utilities/hive_schema_to_avro.py
@@ -155,7 +155,7 @@ class HiveColumnToAvroConverter:
 
     @staticmethod
     def _parse_basic_datatype_string(s: str) -> Dict[str, object]:
-        if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE
+        if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE:
             return {
                 "type": HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE[s],
                 "native_data_type": s,
@@ -218,7 +218,7 @@ class HiveColumnToAvroConverter:
         buf = ""
         level = 0
         for c in s:
-            if c in HiveColumnToAvroConverter._BRACKETS
+            if c in HiveColumnToAvroConverter._BRACKETS:
                 level += 1
                 buf += c
             elif c in HiveColumnToAvroConverter._BRACKETS.values():
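
Both hunks restore missing colons inside the Hive type parser. A small sketch of the helper this converter backs, assuming get_schema_fields_for_hive_column keeps its (column name, column type string) signature; the column name and type are made up:

# Sketch only: assumes the get_schema_fields_for_hive_column helper exported by
# datahub.utilities.hive_schema_to_avro keeps its existing argument order.
from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column

fields = get_schema_fields_for_hive_column(
    "shipping_address",                    # Hive column name
    "struct<street:string,zip:int>",       # nested Hive type string to be parsed
)
for field in fields:
    print(field.fieldPath, field.nativeDataType)
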
datahub/utilities/ingest_utils.py
@@ -32,10 +32,10 @@ def deploy_source_vars(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> dict:
datahub/ingestion/transformer/system_metadata_transformer.py
@@ -1,45 +0,0 @@
-import functools
-from typing import Iterable
-
-from datahub.emitter.mce_builder import get_sys_time
-from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
-from datahub.ingestion.api.transform import Transformer
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.transformer.auto_helper_transformer import AutoHelperTransformer
-from datahub.metadata.schema_classes import SystemMetadataClass
-
-
-def auto_system_metadata(
-    ctx: PipelineContext,
-    stream: Iterable[MetadataWorkUnit],
-) -> Iterable[MetadataWorkUnit]:
-    if not ctx.pipeline_config:
-        raise ValueError("Pipeline config is required for system metadata")
-    set_system_metadata = ctx.pipeline_config.flags.set_system_metadata
-    set_pipeline_name = ctx.pipeline_config.flags.set_system_metadata_pipeline_name
-
-    for workunit in stream:
-        if set_system_metadata:
-            workunit.metadata.systemMetadata = SystemMetadataClass(
-                lastObserved=get_sys_time(), runId=ctx.run_id
-            )
-            if set_pipeline_name:
-                workunit.metadata.systemMetadata.pipelineName = ctx.pipeline_name
-
-        yield workunit
-
-
-class SystemMetadataTransformer(Transformer):
-    def __init__(self, ctx: PipelineContext):
-        self._inner_transformer = AutoHelperTransformer(
-            functools.partial(auto_system_metadata, ctx)
-        )
-
-    def transform(
-        self, record_envelopes: Iterable[RecordEnvelope]
-    ) -> Iterable[RecordEnvelope]:
-        yield from self._inner_transformer.transform(record_envelopes)
-
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Transformer:
-        raise NotImplementedError(f"{cls.__name__} cannot be created from config")