acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/datajob/datajob.py +17 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/emitter/mcp_builder.py +4 -0
- datahub/errors.py +4 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +6 -2
- datahub/ingestion/api/source_helpers.py +35 -2
- datahub/ingestion/graph/client.py +122 -7
- datahub/ingestion/graph/filters.py +41 -16
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +300 -0
- datahub/ingestion/source/iceberg/iceberg.py +106 -18
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +6 -7
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +16 -3
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/sigma/sigma.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +73 -11
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
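To reproduce this comparison locally, the sketch below downloads both wheels and diffs the file lists recorded in their RECORD metadata. This is a minimal illustration rather than anything shipped with DataHub: it assumes network access to PyPI and a recent pip, and the helper names are invented for the example.

import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path
from typing import Set

OLD, NEW = "1.0.0.1rc6", "1.0.0.2"  # the two releases being compared


def download_wheel(version: str, dest: Path) -> Path:
    # Fetch just the wheel for one pinned version, without dependencies.
    subprocess.run(
        [sys.executable, "-m", "pip", "download", "--no-deps",
         "--only-binary", ":all:", f"acryl-datahub=={version}", "-d", str(dest)],
        check=True,
    )
    return next(dest.glob("acryl_datahub-*.whl"))


def record_entries(wheel: Path) -> Set[str]:
    # RECORD is a CSV of "path,hash,size"; keep only the paths.
    with zipfile.ZipFile(wheel) as zf:
        record = next(n for n in zf.namelist() if n.endswith(".dist-info/RECORD"))
        lines = zf.read(record).decode().splitlines()
    return {line.split(",")[0] for line in lines if line}


with tempfile.TemporaryDirectory() as tmp:
    old_dir, new_dir = Path(tmp, "old"), Path(tmp, "new")
    old_dir.mkdir()
    new_dir.mkdir()
    old_files = record_entries(download_wheel(OLD, old_dir))
    new_files = record_entries(download_wheel(NEW, new_dir))
    print("only in", NEW, ":", sorted(new_files - old_files))
    print("only in", OLD, ":", sorted(old_files - new_files))

The printed sets are the files present in only one of the two releases; line-level changes within shared files are what the per-file diffs below show.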
datahub/ingestion/source/iceberg/iceberg.py
CHANGED
@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 from dateutil import parser as dateutil_parser
@@ -47,6 +48,12 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +64,14 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import (
@@ -82,6 +97,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
     ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
@@ -134,6 +151,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -141,8 +159,47 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-            *super().get_workunit_processors(),
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
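For context on the list above: in DataHub's ingestion framework a MetadataWorkUnitProcessor is essentially a callable that takes a stream of work units and yields a (possibly modified) stream, and the processors a source returns are applied in order, with None entries skipped. The following is a simplified, self-contained sketch of that composition idea, not the framework's actual implementation; all names are illustrative.

from typing import Callable, Iterable, List, Optional


class WorkUnit:
    """Toy stand-in for MetadataWorkUnit; the real one carries an MCP/MCE payload."""

    def __init__(self, urn: str) -> None:
        self.urn = urn


# A processor consumes a stream of work units and yields a transformed stream.
Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]


def lowercase_urns(stream: Iterable[WorkUnit]) -> Iterable[WorkUnit]:
    for wu in stream:
        wu.urn = wu.urn.lower()
        yield wu


def apply_processors(
    stream: Iterable[WorkUnit], processors: List[Optional[Processor]]
) -> Iterable[WorkUnit]:
    # Processors are chained in list order; None entries (disabled steps) are skipped.
    for processor in processors:
        if processor is not None:
            stream = processor(stream)
    return stream


units = [WorkUnit("URN:LI:DATASET:Example")]
print([wu.urn for wu in apply_processors(units, [lowercase_urns, None])])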
@@ -208,6 +265,12 @@ class IcebergSource(StatefulIngestionSourceBase):
                     )
                     thread_local.local_catalog = self.config.get_catalog()
 
+                if not hasattr(thread_local, "stamping_processor"):
+                    LOGGER.debug(
+                        f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+                    )
+                    thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
                 with PerfTimer() as timer:
                     table = thread_local.local_catalog.load_table(dataset_path)
                     time_taken = timer.elapsed_seconds()
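The hunk above lazily creates one AutoSystemMetadata stamper per worker thread, so that systemMetadata timestamps reflect when that thread actually read the table rather than when a shared pipeline thread processed the work unit. Below is a generic, hedged sketch of the thread-local lazy-initialization pattern it relies on; the Stamper class and its fields are invented for illustration.

import threading
import time
from concurrent.futures import ThreadPoolExecutor

thread_local = threading.local()


class Stamper:
    """Illustrative stand-in for a per-thread stamping helper."""

    def stamp(self, payload: dict) -> dict:
        payload["systemMetadata"] = {"lastObserved": int(time.time() * 1000)}
        return payload


def process(item: str) -> dict:
    # Each worker thread initializes its own stamper on first use, so no
    # instance is ever shared across threads.
    if not hasattr(thread_local, "stamper"):
        thread_local.stamper = Stamper()
    return thread_local.stamper.stamp({"item": item, "thread": threading.get_ident()})


with ThreadPoolExecutor(max_workers=4) as pool:
    results = list(pool.map(process, ["a", "b", "c", "d"]))
print(results)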
@@ -224,9 +287,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 for aspect in self._create_iceberg_table_aspects(
                     dataset_name, table, namespace_urn
                 ):
-                    yield MetadataChangeProposalWrapper(
-                        entityUrn=dataset_urn, aspect=aspect
-                    ).as_workunit()
+                    yield thread_local.stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             except NoSuchPropertyException as e:
                 self.report.warning(
                     title="Unable to process table",
@@ -308,6 +373,7 @@ class IcebergSource(StatefulIngestionSourceBase):
             return
 
         try:
+            stamping_processor = AutoSystemMetadata(self.ctx)
             namespace_ids = self._get_namespaces(catalog)
             namespaces: List[Tuple[Identifier, str]] = []
             for namespace in namespace_ids:
@@ -323,9 +389,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 namespaces.append((namespace, namespace_urn))
                 for aspect in self._create_iceberg_namespace_aspects(namespace):
-                    yield MetadataChangeProposalWrapper(
-                        entityUrn=namespace_urn, aspect=aspect
-                    ).as_workunit()
+                    yield stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=namespace_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             LOGGER.debug("Namespaces ingestion completed")
         except Exception as e:
             self.report.report_failure(
@@ -366,7 +434,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             yield dataset_ownership
 
         yield self._create_schema_metadata(dataset_name, table)
-        yield self._get_dataplatform_instance_aspect()
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
         yield ContainerClass(container=str(namespace_urn))
 
         self.report.report_table_processing_time(
@@ -377,6 +447,22 @@ class IcebergSource(StatefulIngestionSourceBase):
         profiler = IcebergProfiler(self.report, self.config.profiling)
         yield from profiler.profile_table(dataset_name, table)
 
+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
+
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
         Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification.
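The new _create_browse_paths_aspect assembles a BrowsePathsV2 aspect whose entries are the platform instance (when configured) followed by the namespace container, which is what gives each table a browse path even though the generic browse-path processor is skipped in this source. A small usage sketch against the published classes, with made-up URNs:

from datahub.metadata.schema_classes import BrowsePathEntryClass, BrowsePathsV2Class

# Hypothetical URNs, for illustration only.
platform_instance_urn = "urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,my_catalog)"
namespace_container_urn = "urn:li:container:0123456789abcdef"

path = []
if platform_instance_urn:
    path.append(BrowsePathEntryClass(id=platform_instance_urn, urn=platform_instance_urn))
if namespace_container_urn:
    path.append(BrowsePathEntryClass(id=namespace_container_urn, urn=namespace_container_urn))

aspect = BrowsePathsV2Class(path=path)
print([entry.id for entry in aspect.path])  # platform instance first, then the namespace container

With no platform instance configured, the path collapses to just the namespace container entry.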
@@ -425,23 +511,21 @@ class IcebergSource(StatefulIngestionSourceBase):
     def _get_dataset_properties_aspect(
         self, dataset_name: str, table: Table
     ) -> DatasetPropertiesClass:
-
+        created: Optional[TimeStampClass] = None
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        last_modified: Optional[int] = table.metadata.last_updated_ms
         if table.current_snapshot():
             custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
             custom_properties["manifest-list"] = table.current_snapshot().manifest_list
-
-                int(table.current_snapshot().timestamp_ms)
-            )
+            if not last_modified:
+                last_modified = int(table.current_snapshot().timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
-
-                    int(dt.timestamp() * 1000)
-                )
+                created = TimeStampClass(int(dt.timestamp() * 1000))
             except Exception as ex:
                 LOGGER.warning(
                     f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
@@ -451,8 +535,10 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
-            lastModified=
-
+            lastModified=TimeStampClass(last_modified)
+            if last_modified is not None
+            else None,
+            created=created,
             qualifiedName=dataset_name,
         )
 
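The reworked _get_dataset_properties_aspect prefers the table metadata's last_updated_ms and only falls back to the current snapshot's commit timestamp, while created is still parsed from the optional created-at property. A hedged, standalone sketch of that timestamp logic with made-up values:

from typing import Optional

from dateutil import parser as dateutil_parser

# Made-up stand-ins for table.metadata.last_updated_ms, the table properties,
# and the current snapshot's commit time.
last_updated_ms: Optional[int] = None
properties = {"created-at": "2024-05-01T12:30:00+00:00"}
snapshot_timestamp_ms = 1714566600000

last_modified = last_updated_ms
if not last_modified:
    # Fall back to the snapshot commit time when last_updated_ms is absent.
    last_modified = snapshot_timestamp_ms

created_ms: Optional[int] = None
if "created-at" in properties:
    created_ms = int(dateutil_parser.isoparse(properties["created-at"]).timestamp() * 1000)

print(last_modified, created_ms)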
@@ -530,7 +616,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
-        yield self._get_dataplatform_instance_aspect()
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)
 
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
datahub/ingestion/source/kafka/kafka.py
CHANGED
@@ -568,10 +568,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
 
         for config_key in KafkaTopicConfigKeys:
             try:
-                if (
-                    config_key in topic_config.keys()
-                    and topic_config[config_key] is not None
-                ):
+                if config_key in topic_config and topic_config[config_key] is not None:
                     config_value = topic_config[config_key].value
                     custom_props[config_key] = (
                         config_value
datahub/ingestion/source/kafka_connect/sink_connectors.py
CHANGED
@@ -197,7 +197,7 @@ class BigQuerySinkConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]
datahub/ingestion/source/kafka_connect/source_connectors.py
CHANGED
@@ -121,7 +121,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]
datahub/ingestion/source/looker/looker_source.py
CHANGED
@@ -363,7 +363,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         filters: MutableMapping[str, Any] = (
             query.filters if query.filters is not None else {}
         )
-        for field in filters.keys():
+        for field in filters:
             if field is None:
                 continue
 
@@ -877,8 +877,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # fine to set them to None.
         # TODO: Track project names for each explore.
         explores_to_fetch = [
-            (None, model, explore)
-            for (model, explore) in self.reachable_explores.keys()
+            (None, model, explore) for (model, explore) in self.reachable_explores
         ]
         explores_to_fetch.sort()
 
datahub/ingestion/source/mlflow.py
CHANGED
@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey
+from datahub.emitter.mcp_builder import ExperimentKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -36,6 +36,7 @@ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -77,10 +78,6 @@ from datahub.sdk.dataset import Dataset
 T = TypeVar("T")
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
@@ -123,6 +120,8 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Password for MLflow authentication"
     )
 
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
 
 @dataclass
 class MLflowRegisteredModelStageInfo:
@@ -252,7 +251,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         self, experiment: Experiment
     ) -> Iterable[MetadataWorkUnit]:
         experiment_container = Container(
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=str(DataPlatformUrn(platform_name=self.platform)),
                 id=experiment.name,
             ),
@@ -470,7 +469,7 @@ class MLflowSource(StatefulIngestionSourceBase):
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
    ) -> Iterable[MetadataWorkUnit]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
             platform=str(DataPlatformUrn(self.platform)), id=experiment.name
         )
 
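In the MLflow source the file-local ContainerKeyWithId helper is replaced by the shared ExperimentKey container key from datahub.emitter.mcp_builder. A hedged sketch of how such a key resolves to a container URN; the experiment name is made up, and as_urn() is assumed to be the generic ContainerKey helper rather than anything MLflow-specific:

from datahub.emitter.mcp_builder import ExperimentKey
from datahub.metadata.urns import DataPlatformUrn

key = ExperimentKey(
    platform=str(DataPlatformUrn(platform_name="mlflow")),
    id="my-experiment",  # hypothetical experiment name
)
# Container keys hash their fields into a stable, GUID-based container URN.
print(key.as_urn())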
datahub/ingestion/source/mode.py
CHANGED
@@ -899,7 +899,7 @@ class ModeSource(StatefulIngestionSourceBase):
             for match in matches:
                 definition = Template(source=match).render()
                 parameters = yaml.safe_load(definition)
-                for key in parameters.keys():
+                for key in parameters:
                     jinja_params[key] = parameters[key].get("default", "")
 
             normalized_query = re.sub(
@@ -1601,7 +1601,7 @@ class ModeSource(StatefulIngestionSourceBase):
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
-        for space_token in self.space_tokens.keys():
+        for space_token in self.space_tokens:
             reports = self._get_reports(space_token)
             for report in reports:
                 report_token = report.get("token", "")
datahub/ingestion/source/nifi.py
CHANGED
@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
             if (
                 component.nifi_type is NifiType.PROCESSOR
                 and component.type
-                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS.keys()
+                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
             ) or component.nifi_type not in [
                 NifiType.PROCESSOR,
                 NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
         )
 
         for incoming_from in incoming:
-            if incoming_from in self.nifi_flow.remotely_accessible_ports.keys():
+            if incoming_from in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
         )
 
         for outgoing_to in outgoing:
-            if outgoing_to in self.nifi_flow.remotely_accessible_ports.keys():
+            if outgoing_to in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env
datahub/ingestion/source/openapi.py
CHANGED
@@ -102,7 +102,7 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert "url_complement" in self.get_token.keys(), (
+            assert "url_complement" in self.get_token, (
                 "When 'request_type' is set to 'get', an url_complement is needed for the request."
             )
             if self.get_token["request_type"] == "get":
@@ -317,7 +317,7 @@ class APISource(Source, ABC):
                 yield wu
 
             # Handle schema metadata if available
-            if "data" in endpoint_dets.keys():
+            if "data" in endpoint_dets:
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
                 wu = MetadataWorkUnit(
@@ -371,7 +371,7 @@ class APISource(Source, ABC):
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
-                if endpoint_k not in config.forced_examples.keys():
+                if endpoint_k not in config.forced_examples:
                     # start guessing...
                     url_guess = try_guessing(endpoint_k, root_dataset_samples)
                     tot_url = clean_url(config.url + self.url_basepath + url_guess)
datahub/ingestion/source/openapi_parser.py
CHANGED
@@ -128,18 +128,18 @@ def get_endpoints(sw_dict: dict) -> dict:
 
     for p_k, p_o in sw_dict["paths"].items():
         method = list(p_o)[0]
-        if "200" in p_o[method]["responses"].keys():
+        if "200" in p_o[method]["responses"]:
             base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"].keys():
+        elif 200 in p_o[method]["responses"]:
             # if you read a plain yml file the 200 will be an integer
             base_res = p_o[method]["responses"][200]
         else:
             # the endpoint does not have a 200 response
             continue
 
-        if "description" in p_o[method].keys():
+        if "description" in p_o[method]:
             desc = p_o[method]["description"]
-        elif "summary" in p_o[method].keys():
+        elif "summary" in p_o[method]:
             desc = p_o[method]["summary"]
         else: # still testing
             desc = ""
@@ -156,7 +156,7 @@ def get_endpoints(sw_dict: dict) -> dict:
             url_details[p_k]["data"] = example_data
 
         # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method].keys():
+        if "parameters" in p_o[method]:
             url_details[p_k]["parameters"] = p_o[method]["parameters"]
 
     return dict(sorted(url_details.items()))
@@ -169,7 +169,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     data = {}
     if "content" in base_res:
         res_cont = base_res["content"]
-        if "application/json" in res_cont.keys():
+        if "application/json" in res_cont:
             ex_field = None
             if "example" in res_cont["application/json"]:
                 ex_field = "example"
@@ -186,7 +186,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
                 logger.warning(
                     f"Field in swagger file does not give consistent data --- {key}"
                 )
-        elif "text/csv" in res_cont.keys():
+        elif "text/csv" in res_cont:
             data = res_cont["text/csv"]["schema"]
     elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
@@ -239,7 +239,7 @@ def guessing_url_name(url: str, examples: dict) -> str:
 
     # substituting the parameter's name w the value
     for name, clean_name in zip(needed_n, cleaned_needed_n):
-        if clean_name in examples[ex2use].keys():
+        if clean_name in examples[ex2use]:
             guessed_url = re.sub(name, str(examples[ex2use][clean_name]), guessed_url)
 
     return guessed_url
datahub/ingestion/source/powerbi/config.py
CHANGED
@@ -555,7 +555,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value.keys():
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name
datahub/ingestion/source/powerbi/powerbi.py
CHANGED
@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.metadata.urns import ChartUrn
+from datahub.metadata.urns import ChartUrn, DatasetUrn
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -263,7 +263,7 @@ class Mapper:
         for upstream_dpt in lineage.upstreams:
             if (
                 upstream_dpt.data_platform_pair.powerbi_data_platform_name
-                not in self.__config.dataset_type_mapping.keys()
+                not in self.__config.dataset_type_mapping
             ):
                 logger.debug(
                     f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
@@ -1083,6 +1083,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dataset_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            datasetEdges=dataset_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+        # collect all upstream datasets; using a set to retain unique urns
+        dataset_urns = {
+            dataset.entityUrn
+            for dataset in ds_mcps
+            if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+        }
+        dataset_edges = [
+            EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+        ]
+
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
             workspace=workspace,
             report=report,
             chart_mcps=chart_mcps,
             user_mcps=user_mcps,
+            dataset_edges=dataset_edges,
         )
 
         # Now add MCPs in sequence
@@ -1340,7 +1353,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
             for data_platform in SupportedDataPlatform
         ]
 
-        for key in self.source_config.dataset_type_mapping.keys():
+        for key in self.source_config.dataset_type_mapping:
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")
 
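The PowerBI report-to-dashboard mapping now records upstream datasets explicitly on the dashboard via datasetEdges, built from the dataset MCPs that were already emitted. A hedged sketch of what the resulting aspect ends up containing, using the published classes and made-up names and URNs:

from datahub.metadata.schema_classes import (
    ChangeAuditStampsClass,
    DashboardInfoClass,
    EdgeClass,
)

# Made-up upstream dataset URNs; in the source these come from the dataset MCPs.
dataset_urns = {
    "urn:li:dataset:(urn:li:dataPlatform:powerbi,workspace.sales_model,PROD)",
}

dashboard_info = DashboardInfoClass(
    title="Sales report",  # hypothetical report name
    description="",
    lastModified=ChangeAuditStampsClass(),
    charts=[],
    datasetEdges=[EdgeClass(destinationUrn=urn) for urn in sorted(dataset_urns)],
)
print([edge.destinationUrn for edge in dashboard_info.datasetEdges])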
datahub/ingestion/source/redshift/profile.py
CHANGED
@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
                 "max_overflow", self.config.profiling.max_workers
             )
 
-        for db in tables.keys():
+        for db in tables:
             profile_requests = []
-            for schema in tables.get(db, {}).keys():
+            for schema in tables.get(db, {}):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):
datahub/ingestion/source/sigma/sigma.py
CHANGED
@@ -170,7 +170,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             if self.config.workspace_pattern.allowed(workspace.name):
                 allowed_workspaces.append(workspace)
             else:
-                self.reporter.workspaces.dropped(
+                self.reporter.workspaces.dropped(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
 
         return allowed_workspaces
@@ -661,7 +663,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)
 
         for workspace in self._get_allowed_workspaces():
-            self.reporter.workspaces.processed(
+            self.reporter.workspaces.processed(
+                f"{workspace.name} ({workspace.workspaceId})"
+            )
             yield from self._gen_workspace_workunit(workspace)
             yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
datahub/ingestion/source/snowflake/snowflake_utils.py
CHANGED
@@ -77,7 +77,7 @@ class SnowsightUrlBuilder:
         region: str,
     ) -> Tuple[str, str]:
         cloud: str
-        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING.keys():
+        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
             cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
         elif region.startswith(("aws_", "gcp_", "azure_")):
             # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope
datahub/ingestion/source/sql/stored_procedures/base.py
CHANGED
@@ -26,6 +26,7 @@ from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DataTransformClass,
     DataTransformLogicClass,
+    QueryLanguageClass,
     QueryStatementClass,
     SubTypesClass,
 )
@@ -176,7 +177,17 @@ def _generate_job_workunits(
             DataTransformClass(
                 queryStatement=QueryStatementClass(
                     value=procedure.procedure_definition,
-                    language="SQL",
+                    language=(
+                        QueryLanguageClass.SQL
+                        if procedure.language == "SQL"
+                        # The language field uses a pretty limited enum.
+                        # The "UNKNOWN" enum value is pretty new, so we don't want to
+                        # emit it until it has broader server-side support. As a
+                        # short-term solution, we map all languages to "SQL".
+                        # TODO: Once we've released server 1.1.0, we should change
+                        # this to be "UNKNOWN" for all languages except "SQL".
+                        else QueryLanguageClass.SQL
+                    ),
                 ),
             )
         ]
datahub/ingestion/source/sql/trino.py
CHANGED
@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
     if catalog_name is None:
         raise exc.NoSuchTableError("catalog is required in connection")
     connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-    if
-
-
+    if (
+        connector_name is not None
+        and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+    ):
         properties_table = self._get_full_table(f"{table_name}$properties", schema)
         query = f"SELECT * FROM {properties_table}"
         row = connection.execute(sql.text(query)).fetchone()
datahub/ingestion/source/state/stale_entity_removal_handler.py
CHANGED
@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
         description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
         le=100.0,
         ge=0.0,
-        hidden_from_docs=True,
     )
 
 
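With hidden_from_docs removed, fail_safe_threshold becomes a documented knob on stale-entity removal state. A hedged sketch of setting it programmatically; the 40.0 threshold is an arbitrary example value:

from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StatefulStaleMetadataRemovalConfig,
)

# Abort the soft-delete step if more than 40% of previously seen entities
# would be removed relative to the last committed state.
config = StatefulStaleMetadataRemovalConfig(enabled=True, fail_safe_threshold=40.0)
print(config.fail_safe_threshold)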