acryl-datahub 1.0.0.2rc3__py3-none-any.whl → 1.0.0.2rc5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
This release has been flagged as potentially problematic.
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/METADATA +2564 -2564
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/RECORD +61 -60
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +4 -1
- datahub/ingestion/api/source_helpers.py +26 -1
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +97 -9
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +3 -0
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +2 -2
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline.py
CHANGED

@@ -39,9 +39,6 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
-from datahub.ingestion.transformer.system_metadata_transformer import (
-    SystemMetadataTransformer,
-)
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
@@ -286,9 +283,6 @@ class Pipeline:
                     f"Transformer type:{transformer_type},{transformer_class} configured"
                 )
 
-        # Add the system metadata transformer at the end of the list.
-        self.transformers.append(SystemMetadataTransformer(self.ctx))
-
     def _configure_reporting(self, report_to: Optional[str]) -> None:
        if self.dry_run:
            # In dry run mode, we don't want to report anything.
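The pipeline no longer appends a SystemMetadataTransformer (the transformer module itself is removed in this release); systemMetadata stamping now happens through work-unit processors, as the AutoSystemMetadata usage in the iceberg.py hunks further down shows. A minimal sketch, assuming only the AutoSystemMetadata helper and its stamp_wu method that appear in those hunks, of stamping work units on the source side:

from typing import Iterable

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source_helpers import AutoSystemMetadata
from datahub.ingestion.api.workunit import MetadataWorkUnit


def stamp_workunits(
    ctx: PipelineContext, workunits: Iterable[MetadataWorkUnit]
) -> Iterable[MetadataWorkUnit]:
    # AutoSystemMetadata and stamp_wu are taken from the iceberg.py hunks below;
    # this wrapper function is illustrative and not part of the package.
    stamper = AutoSystemMetadata(ctx)
    for wu in workunits:
        yield stamper.stamp_wu(wu)
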
datahub/ingestion/source/aws/sagemaker_processors/models.py
CHANGED

@@ -323,7 +323,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -331,7 +331,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )
@@ -368,7 +368,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -376,7 +376,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )

datahub/ingestion/source/bigquery_v2/lineage.py
CHANGED

@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
                 memory_footprint.total_size(lineage)
             )
 
-        for lineage_key in lineage
+        for lineage_key in lineage:
            # For views, we do not use the upstreams obtained by parsing audit logs
            # as they may contain indirectly referenced tables.
            if (

datahub/ingestion/source/dynamodb/dynamodb.py
CHANGED

@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)

datahub/ingestion/source/fivetran/fivetran.py
CHANGED

@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )

datahub/ingestion/source/fivetran/fivetran_log_api.py
CHANGED

@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue

datahub/ingestion/source/hex/query_fetcher.py
CHANGED

@@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
 logger = logging.getLogger(__name__)
 
 # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
-
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
 
 
 @dataclass
@@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
     fetched_query_objects: int = 0
     filtered_out_queries_missing_metadata: int = 0
     filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
     filtered_out_queries_no_subjects: int = 0
     total_queries: int = 0
     total_dataset_subjects: int = 0
@@ -210,6 +212,7 @@ class HexQueryFetcher:
         match = re.search(HEX_METADATA_PATTERN, sql_statement)
 
         if not match:
+            self.report.filtered_out_queries_no_match += 1
             return None
 
         try:

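The new HEX_METADATA_PATTERN only matches single-line Hex metadata comments whose "context" is "SCHEDULED_RUN", and it captures the project id plus the workspace name embedded in the project URL. A small, self-contained sketch of what the pattern extracts, using made-up project values:

import re

HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

# Hypothetical SQL statement carrying a Hex metadata comment; the ids and URL are invented.
sample_sql = (
    '-- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "abc123", '
    '"project_url": "https://app.hex.tech/my-workspace/hex/abc123"}\nSELECT 1'
)

match = re.search(HEX_METADATA_PATTERN, sample_sql)
assert match is not None
print(match.group(1))  # abc123 (project_id)
print(match.group(2))  # my-workspace (workspace name taken from the project URL)
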
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 from dateutil import parser as dateutil_parser
@@ -47,6 +48,12 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +64,14 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import (
@@ -82,6 +97,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
     ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
@@ -134,6 +151,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -141,8 +159,47 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
@@ -208,6 +265,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
             thread_local.local_catalog = self.config.get_catalog()
 
+        if not hasattr(thread_local, "stamping_processor"):
+            LOGGER.debug(
+                f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+            )
+            thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
         with PerfTimer() as timer:
             table = thread_local.local_catalog.load_table(dataset_path)
             time_taken = timer.elapsed_seconds()
@@ -224,9 +287,11 @@ class IcebergSource(StatefulIngestionSourceBase):
             for aspect in self._create_iceberg_table_aspects(
                 dataset_name, table, namespace_urn
             ):
-                yield
-
-
+                yield thread_local.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+                )
         except NoSuchPropertyException as e:
             self.report.warning(
                 title="Unable to process table",
@@ -308,6 +373,7 @@ class IcebergSource(StatefulIngestionSourceBase):
             return
 
         try:
+            stamping_processor = AutoSystemMetadata(self.ctx)
             namespace_ids = self._get_namespaces(catalog)
             namespaces: List[Tuple[Identifier, str]] = []
             for namespace in namespace_ids:
@@ -323,9 +389,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 namespaces.append((namespace, namespace_urn))
                 for aspect in self._create_iceberg_namespace_aspects(namespace):
-                    yield
-
-
+                    yield stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=namespace_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             LOGGER.debug("Namespaces ingestion completed")
         except Exception as e:
             self.report.report_failure(
@@ -366,7 +434,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             yield dataset_ownership
 
         yield self._create_schema_metadata(dataset_name, table)
-
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
         yield ContainerClass(container=str(namespace_urn))
 
         self.report.report_table_processing_time(
@@ -377,6 +447,22 @@ class IcebergSource(StatefulIngestionSourceBase):
             profiler = IcebergProfiler(self.report, self.config.profiling)
             yield from profiler.profile_table(dataset_name, table)
 
+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
+
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
         Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification.
@@ -530,7 +616,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
-
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)
 
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):

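The Iceberg source loads tables on a thread pool, so per-thread state, the catalog client and now the AutoSystemMetadata stamper, is created lazily the first time each worker needs it. A stripped-down sketch of that lazy thread-local initialization pattern (the names here are illustrative, not the actual source code):

import threading
from concurrent.futures import ThreadPoolExecutor

thread_local = threading.local()


def _get_worker_state() -> dict:
    # Mirrors the hasattr(thread_local, "stamping_processor") check above:
    # each worker thread builds its own state on first use and reuses it afterwards.
    if not hasattr(thread_local, "state"):
        thread_local.state = {"worker": threading.current_thread().name}
    return thread_local.state


def process(item: int) -> str:
    return f"item {item} handled by {_get_worker_state()['worker']}"


with ThreadPoolExecutor(max_workers=4) as pool:
    for line in pool.map(process, range(8)):
        print(line)
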
datahub/ingestion/source/kafka/kafka.py
CHANGED

@@ -568,10 +568,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
 
         for config_key in KafkaTopicConfigKeys:
             try:
-                if
-                config_key in topic_config.keys()
-                and topic_config[config_key] is not None
-                ):
+                if config_key in topic_config and topic_config[config_key] is not None:
                     config_value = topic_config[config_key].value
                     custom_props[config_key] = (
                         config_value

datahub/ingestion/source/kafka_connect/sink_connectors.py
CHANGED

@@ -197,7 +197,7 @@ class BigQuerySinkConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/kafka_connect/source_connectors.py
CHANGED

@@ -121,7 +121,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/looker/looker_source.py
CHANGED

@@ -363,7 +363,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         filters: MutableMapping[str, Any] = (
             query.filters if query.filters is not None else {}
         )
-        for field in filters
+        for field in filters:
             if field is None:
                 continue
 
@@ -877,8 +877,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # fine to set them to None.
         # TODO: Track project names for each explore.
         explores_to_fetch = [
-            (None, model, explore)
-            for (model, explore) in self.reachable_explores.keys()
+            (None, model, explore) for (model, explore) in self.reachable_explores
         ]
         explores_to_fetch.sort()
 

datahub/ingestion/source/mlflow.py
CHANGED

@@ -36,6 +36,7 @@ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -119,6 +120,8 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Password for MLflow authentication"
     )
 
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
 
 @dataclass
 class MLflowRegisteredModelStageInfo:

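With stateful_ingestion now exposed on MLflowConfig, stale-entity removal can be enabled for the MLflow source the same way as for other stateful sources. A hypothetical configuration fragment, written as a Python dict; the tracking URI is made up and the surrounding recipe keys are assumed rather than taken from this diff:

# Hypothetical MLflow source configuration enabling stateful ingestion.
# "enabled" comes from the standard StatefulStaleMetadataRemovalConfig imported above.
mlflow_source = {
    "type": "mlflow",
    "config": {
        "tracking_uri": "http://localhost:5000",  # made-up tracking server
        "stateful_ingestion": {"enabled": True},
    },
}
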
datahub/ingestion/source/mode.py
CHANGED

@@ -899,7 +899,7 @@ class ModeSource(StatefulIngestionSourceBase):
         for match in matches:
             definition = Template(source=match).render()
             parameters = yaml.safe_load(definition)
-            for key in parameters
+            for key in parameters:
                 jinja_params[key] = parameters[key].get("default", "")
 
         normalized_query = re.sub(
@@ -1601,7 +1601,7 @@ class ModeSource(StatefulIngestionSourceBase):
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
-        for space_token in self.space_tokens
+        for space_token in self.space_tokens:
             reports = self._get_reports(space_token)
             for report in reports:
                 report_token = report.get("token", "")

datahub/ingestion/source/nifi.py
CHANGED

@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
             if (
                 component.nifi_type is NifiType.PROCESSOR
                 and component.type
-                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
+                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
             ) or component.nifi_type not in [
                 NifiType.PROCESSOR,
                 NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
         for incoming_from in incoming:
-            if incoming_from in self.nifi_flow.remotely_accessible_ports
+            if incoming_from in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
         for outgoing_to in outgoing:
-            if outgoing_to in self.nifi_flow.remotely_accessible_ports
+            if outgoing_to in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env

datahub/ingestion/source/openapi.py
CHANGED

@@ -102,7 +102,7 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert "url_complement" in self.get_token
+            assert "url_complement" in self.get_token, (
                 "When 'request_type' is set to 'get', an url_complement is needed for the request."
             )
             if self.get_token["request_type"] == "get":
@@ -317,7 +317,7 @@ class APISource(Source, ABC):
             yield wu
 
             # Handle schema metadata if available
-            if "data" in endpoint_dets
+            if "data" in endpoint_dets:
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
                 wu = MetadataWorkUnit(
@@ -371,7 +371,7 @@ class APISource(Source, ABC):
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
-                if endpoint_k not in config.forced_examples
+                if endpoint_k not in config.forced_examples:
                     # start guessing...
                     url_guess = try_guessing(endpoint_k, root_dataset_samples)
                     tot_url = clean_url(config.url + self.url_basepath + url_guess)

datahub/ingestion/source/openapi_parser.py
CHANGED

@@ -128,18 +128,18 @@ def get_endpoints(sw_dict: dict) -> dict:
 
     for p_k, p_o in sw_dict["paths"].items():
         method = list(p_o)[0]
-        if "200" in p_o[method]["responses"]
+        if "200" in p_o[method]["responses"]:
             base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"]
+        elif 200 in p_o[method]["responses"]:
             # if you read a plain yml file the 200 will be an integer
             base_res = p_o[method]["responses"][200]
         else:
             # the endpoint does not have a 200 response
             continue
 
-        if "description" in p_o[method]
+        if "description" in p_o[method]:
             desc = p_o[method]["description"]
-        elif "summary" in p_o[method]
+        elif "summary" in p_o[method]:
             desc = p_o[method]["summary"]
         else: # still testing
             desc = ""
@@ -156,7 +156,7 @@ def get_endpoints(sw_dict: dict) -> dict:
             url_details[p_k]["data"] = example_data
 
         # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method]
+        if "parameters" in p_o[method]:
             url_details[p_k]["parameters"] = p_o[method]["parameters"]
 
     return dict(sorted(url_details.items()))
@@ -169,7 +169,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     data = {}
     if "content" in base_res:
         res_cont = base_res["content"]
-        if "application/json" in res_cont
+        if "application/json" in res_cont:
             ex_field = None
             if "example" in res_cont["application/json"]:
                 ex_field = "example"
@@ -186,7 +186,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
                 logger.warning(
                     f"Field in swagger file does not give consistent data --- {key}"
                 )
-        elif "text/csv" in res_cont
+        elif "text/csv" in res_cont:
             data = res_cont["text/csv"]["schema"]
     elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
@@ -239,7 +239,7 @@ def guessing_url_name(url: str, examples: dict) -> str:
 
     # substituting the parameter's name w the value
     for name, clean_name in zip(needed_n, cleaned_needed_n):
-        if clean_name in examples[ex2use]
+        if clean_name in examples[ex2use]:
             guessed_url = re.sub(name, str(examples[ex2use][clean_name]), guessed_url)
 
     return guessed_url

datahub/ingestion/source/powerbi/config.py
CHANGED

@@ -555,7 +555,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name

datahub/ingestion/source/powerbi/powerbi.py
CHANGED

@@ -263,7 +263,7 @@ class Mapper:
         for upstream_dpt in lineage.upstreams:
             if (
                 upstream_dpt.data_platform_pair.powerbi_data_platform_name
-                not in self.__config.dataset_type_mapping
+                not in self.__config.dataset_type_mapping
             ):
                 logger.debug(
                     f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
@@ -1353,7 +1353,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
             for data_platform in SupportedDataPlatform
         ]
 
-        for key in self.source_config.dataset_type_mapping
+        for key in self.source_config.dataset_type_mapping:
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")
 

datahub/ingestion/source/redshift/profile.py
CHANGED

@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
                 "max_overflow", self.config.profiling.max_workers
             )
 
-        for db in tables
+        for db in tables:
             profile_requests = []
-            for schema in tables.get(db, {})
+            for schema in tables.get(db, {}):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):

datahub/ingestion/source/snowflake/snowflake_utils.py
CHANGED

@@ -77,7 +77,7 @@ class SnowsightUrlBuilder:
         region: str,
     ) -> Tuple[str, str]:
         cloud: str
-        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING
+        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
             cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
         elif region.startswith(("aws_", "gcp_", "azure_")):
             # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope

datahub/ingestion/source/sql/stored_procedures/base.py
CHANGED

@@ -26,6 +26,7 @@ from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DataTransformClass,
     DataTransformLogicClass,
+    QueryLanguageClass,
     QueryStatementClass,
     SubTypesClass,
 )
@@ -176,7 +177,17 @@ def _generate_job_workunits(
             DataTransformClass(
                 queryStatement=QueryStatementClass(
                     value=procedure.procedure_definition,
-                    language=
+                    language=(
+                        QueryLanguageClass.SQL
+                        if procedure.language == "SQL"
+                        # The language field uses a pretty limited enum.
+                        # The "UNKNOWN" enum value is pretty new, so we don't want to
+                        # emit it until it has broader server-side support. As a
+                        # short-term solution, we map all languages to "SQL".
+                        # TODO: Once we've released server 1.1.0, we should change
+                        # this to be "UNKNOWN" for all languages except "SQL".
+                        else QueryLanguageClass.SQL
+                    ),
                 ),
             )
         ]

datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -1623,7 +1623,7 @@ class TableauSiteSource:
         # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
         # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid not in self.tableau_project_registry
+        if project_luid not in self.tableau_project_registry:
             wrk_name: Optional[str] = workbook.get(c.NAME)
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
@@ -2253,7 +2253,7 @@ class TableauSiteSource:
         # It is possible due to https://github.com/tableau/server-client-python/issues/1210
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] not in self.datasource_project_map
+            and ds[c.LUID] not in self.datasource_project_map
             and self.report.get_all_datasources_query_failed
         ):
             logger.debug(
@@ -2265,7 +2265,7 @@ class TableauSiteSource:
 
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] in self.datasource_project_map
+            and ds[c.LUID] in self.datasource_project_map
             and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
         ):
             return self.datasource_project_map[ds[c.LUID]]
@@ -3252,7 +3252,7 @@ class TableauSiteSource:
 
         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid and project_luid in self.tableau_project_registry
+        if project_luid and project_luid in self.tableau_project_registry:
             parent_key = self.gen_project_key(project_luid)
         else:
             workbook_id: Optional[str] = workbook.get(c.ID)

datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -774,7 +774,7 @@ def get_overridden_info(
     if (
         lineage_overrides is not None
         and lineage_overrides.platform_override_map is not None
-        and original_platform in lineage_overrides.platform_override_map
+        and original_platform in lineage_overrides.platform_override_map
     ):
         platform = lineage_overrides.platform_override_map[original_platform]
 
@@ -782,7 +782,7 @@ def get_overridden_info(
         lineage_overrides is not None
         and lineage_overrides.database_override_map is not None
         and upstream_db is not None
-        and upstream_db in lineage_overrides.database_override_map
+        and upstream_db in lineage_overrides.database_override_map
     ):
         upstream_db = lineage_overrides.database_override_map[upstream_db]
 