acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl
This diff shows the content differences between publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +0 -7
- datahub/cli/cli_utils.py +73 -0
- datahub/cli/delete_cli.py +0 -6
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +148 -228
- datahub/cli/exists_cli.py +0 -4
- datahub/cli/get_cli.py +0 -4
- datahub/cli/ingest_cli.py +1 -20
- datahub/cli/put_cli.py +0 -6
- datahub/cli/quickstart_versioning.py +50 -5
- datahub/cli/specific/assertions_cli.py +0 -6
- datahub/cli/specific/datacontract_cli.py +0 -6
- datahub/cli/specific/dataproduct_cli.py +0 -22
- datahub/cli/specific/dataset_cli.py +0 -11
- datahub/cli/specific/forms_cli.py +0 -6
- datahub/cli/specific/group_cli.py +0 -4
- datahub/cli/specific/structuredproperties_cli.py +0 -7
- datahub/cli/specific/user_cli.py +0 -4
- datahub/cli/state_cli.py +0 -4
- datahub/cli/timeline_cli.py +0 -4
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/report.py +183 -35
- datahub/ingestion/autogenerated/capability_summary.json +3431 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +30 -128
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/run/pipeline.py +47 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/data_lake_common/object_store.py +40 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/identity/okta.py +0 -13
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/source.py +19 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/sql_common.py +4 -0
- datahub/ingestion/source/sql/vertica.py +0 -4
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/superset.py +56 -1
- datahub/ingestion/source/tableau/tableau.py +40 -34
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +19 -9
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +85 -4
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/schema.avsc +54 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
- datahub/sdk/lineage_client.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline.py:

```diff
@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
+from datahub.upgrade.upgrade import (
+    is_server_default_cli_ahead,
+    retrieve_version_stats,
+)
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -171,7 +175,10 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()

-        with
+        with (
+            contextlib.ExitStack() as exit_stack,
+            contextlib.ExitStack() as inner_exit_stack,
+        ):
             self.graph: Optional[DataHubGraph] = None
             with _add_init_error_context("connect to DataHub"):
                 if self.config.datahub_api:
```
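Several of these hunks (Pipeline above, and the BigQuery, Cassandra, Dremio, and GE profiler hunks below) rewrite long single-line `with` statements into the parenthesized multi-context-manager form. A minimal sketch of the pattern, assuming Python 3.10+; the resource names are made up:

```python
import contextlib


@contextlib.contextmanager
def resource(name: str):
    # Toy context manager used only to illustrate the syntax change.
    print(f"enter {name}")
    try:
        yield name
    finally:
        print(f"exit {name}")


# Old style: all context managers on one line.
with resource("a") as a, resource("b") as b:
    print(a, b)

# New style used in the diff: one context manager per line inside parentheses.
with (
    resource("a") as a,
    resource("b") as b,
):
    print(a, b)
```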
datahub/ingestion/run/pipeline.py:

```diff
@@ -340,6 +347,44 @@ class Pipeline:
         except Exception as e:
             logger.warning("Reporting failed on start", exc_info=e)

+    def _warn_old_cli_version(self) -> None:
+        """
+        Check if the server default CLI version is ahead of the CLI version being used.
+        If so, add a warning to the report.
+        """
+
+        try:
+            version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
+        except RuntimeError as e:
+            # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
+            if "no current event loop" in str(e):
+                logger.debug("Skipping version check - no event loop available")
+                return
+            raise
+
+        if not version_stats or not self.graph:
+            return
+
+        if is_server_default_cli_ahead(version_stats):
+            server_default_version = (
+                version_stats.server.current_server_default_cli_version.version
+                if version_stats.server.current_server_default_cli_version
+                else None
+            )
+            current_version = version_stats.client.current.version
+
+            logger.debug(f"""
+                client_version: {current_version}
+                server_default_version: {server_default_version}
+                server_default_cli_ahead: True
+            """)
+
+            self.source.get_report().warning(
+                title="Server default CLI version is ahead of CLI version",
+                message="Please upgrade the CLI version being used",
+                context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
+            )
+
     def _notify_reporters_on_ingestion_completion(self) -> None:
         for reporter in self.reporters:
             try:
```
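The new `_warn_old_cli_version` helper compares the running CLI version against the server's default CLI version and adds a report warning when the server default is ahead. A minimal sketch of that comparison using `packaging.version`; this is illustrative only, not the actual `is_server_default_cli_ahead` implementation, which works on the version-stats objects returned by `retrieve_version_stats`:

```python
from typing import Optional

from packaging.version import Version


def server_default_cli_ahead(client: str, server_default: Optional[str]) -> bool:
    # Illustrative comparison only.
    if not server_default:
        return False
    return Version(server_default) > Version(client)


print(server_default_cli_ahead("1.1.0.5rc6", "1.1.0.5rc8"))  # True
```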
datahub/ingestion/run/pipeline.py:

```diff
@@ -396,6 +441,7 @@ class Pipeline:
        return False

     def run(self) -> None:
+        self._warn_old_cli_version()
         with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray
```
datahub/ingestion/source/bigquery_v2/bigquery.py:

```diff
@@ -45,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
@@ -78,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -288,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         ):
             return

-        with
-            f"*: {QUERIES_EXTRACTION}"
+        with (
+            self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+            BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor,
+        ):
             self.report.queries_extractor = queries_extractor.report
             yield from queries_extractor.get_workunits_internal()
         else:
```
datahub/ingestion/source/cassandra/cassandra_profiling.py:

```diff
@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with
-                f"{keyspace_name}: {PROFILING}"
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,
```
datahub/ingestion/source/common/subtypes.py:

```diff
@@ -143,7 +143,7 @@ def create_source_capability_modifier_enum():
     for enum_class in source_enums:
         for member in enum_class:  # type: ignore[var-annotated]
             if member.name in all_values:
-                logger.
+                logger.debug(
                     f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
                 )
                 continue
```
datahub/ingestion/source/data_lake_common/object_store.py:

```diff
@@ -519,6 +519,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +619,39 @@ class ObjectStoreSourceAdapter:
             return self.get_abs_external_url(table_data)
         return None

+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+

 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
```
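The two new adapter hooks let the GCS source reuse the S3 pattern-matching and path-stripping code paths by rewriting `gs://` URIs. A standalone sketch of the same transformations (function names here are illustrative, not the adapter's actual methods):

```python
def normalize_gcs_uri_for_pattern_matching(uri: str) -> str:
    # gs:// URIs are rewritten to s3:// so path_spec patterns written
    # against s3:// paths still match GCS objects.
    if uri.startswith("gs://"):
        return uri.replace("gs://", "s3://", 1)
    return uri


def strip_gcs_prefix(uri: str) -> str:
    # Drop the scheme, keeping only bucket/key for path processing.
    if uri.startswith("gs://"):
        return uri[len("gs://"):]
    return uri


assert normalize_gcs_uri_for_pattern_matching("gs://bucket/raw/file.parquet") == "s3://bucket/raw/file.parquet"
assert strip_gcs_prefix("gs://bucket/raw/file.parquet") == "bucket/raw/file.parquet"
```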
datahub/ingestion/source/datahub/datahub_database_reader.py:

```diff
@@ -12,7 +12,7 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList

 logger = logging.getLogger(__name__)
@@ -374,7 +374,6 @@ class DataHubDatabaseReader:
                 entityUrn=row["urn"],
                 aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                 systemMetadata=system_metadata,
-                changeType=ChangeTypeClass.UPSERT,
             )
         except Exception as e:
             logger.warning(
```
datahub/ingestion/source/dremio/dremio_source.py:

```diff
@@ -61,7 +61,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
@@ -261,9 +261,12 @@ class DremioSource(StatefulIngestionSourceBase):

         # Profiling
         if self.config.is_profiling_enabled():
-            with
+            with (
+                self.report.new_stage(PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(self.generate_profiles, dataset): dataset
                     for dataset in datasets
@@ -430,11 +433,8 @@ class DremioSource(StatefulIngestionSourceBase):
             ]
         )
         mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
             entityUrn=dataset_urn,
-            aspectName=lineage.ASPECT_NAME,
             aspect=lineage,
-            changeType=ChangeTypeClass.UPSERT,
         )

         for upstream_urn in upstream_urns:
```
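This hunk, like the Okta and Power BI hunks further down, drops explicit `entityType`, `aspectName`, and `changeType=UPSERT` arguments. That is consistent with `MetadataChangeProposalWrapper` defaulting the change type to UPSERT and inferring the entity type and aspect name from the URN and aspect; a minimal sketch with a placeholder URN:

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

# Only the URN and the aspect are passed; the wrapper infers the entity type
# from the URN and defaults the change type to UPSERT, which is why the
# removed keyword arguments were redundant.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:dremio,example.table,PROD)",
    aspect=StatusClass(removed=False),
)
print(mcp.entityType, mcp.changeType)  # dataset UPSERT
```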
datahub/ingestion/source/gcs/gcs_source.py:

```diff
@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (
@@ -82,7 +83,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -112,6 +120,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config

@@ -138,7 +147,9 @@ class GCSSource(StatefulIngestionSourceBase):

     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)

     def s3_source_overrides(self, source: S3Source) -> S3Source:
```
datahub/ingestion/source/ge_data_profiler.py:

```diff
@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )

-        with
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
```
datahub/ingestion/source/identity/okta.py:

```diff
@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     CorpGroupInfoClass,
     CorpUserInfoClass,
     GroupMembershipClass,
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)

             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()

             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()

@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)

             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()

             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()

```
datahub/ingestion/source/kafka_connect/source_connectors.py:

```diff
@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )

+logger = logging.getLogger(__name__)
+

 @dataclass
 class ConfluentJDBCSourceConnector(BaseConnector):
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
             db_connection_url=connector_manifest.config.get("connection.uri"),
             source_platform="mongodb",
             database_name=connector_manifest.config.get("database"),
-            topic_prefix=connector_manifest.config.get("
+            topic_prefix=connector_manifest.config.get("topic.prefix"),
             transforms=(
                 connector_manifest.config["transforms"].split(",")
                 if "transforms" in connector_manifest.config
@@ -406,7 +408,11 @@ class MongoSourceConnector(BaseConnector):
         lineages: List[KafkaConnectLineage] = list()
         parser = self.get_parser(self.connector_manifest)
         source_platform = parser.source_platform
+        topic_prefix = parser.topic_prefix or ""
+
+        # Escape topic_prefix to handle cases where it contains dots
+        # Some users configure topic.prefix like "my.mongodb" which breaks the regex
+        topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

         if not self.connector_manifest.topic_names:
             return lineages
```
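The MongoDB connector now escapes `topic.prefix` before interpolating it into the topic regex. A small illustration of why this matters: with an unescaped prefix, the dots match any character and topics from other connectors can be picked up (all names below are made up):

```python
import re

prefix = "my.mongodb"
own_topic = "my.mongodb.inventory.orders"
other_topic = "myXmongodb.inventory.orders"  # not produced by this connector

unescaped = rf"{prefix}\.(\w+)\.(\w+)"           # "." in the prefix matches any character
escaped = rf"{re.escape(prefix)}\.(\w+)\.(\w+)"  # what the fixed code builds

print(bool(re.search(unescaped, other_topic)))  # True  - false positive
print(bool(re.search(escaped, other_topic)))    # False - only literal dots match
print(re.search(escaped, own_topic).groups())   # ('inventory', 'orders')
```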
datahub/ingestion/source/kafka_connect/source_connectors.py:

```diff
@@ -429,6 +435,26 @@ class DebeziumSourceConnector(BaseConnector):

 @dataclass
 class DebeziumSourceConnector(BaseConnector):
+    # Debezium topic naming patterns by connector type
+    # - MySQL: {topic.prefix}.{database}.{table}
+    # - PostgreSQL: {topic.prefix}.{schema}.{table}
+    # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
+    # - Oracle: {topic.prefix}.{schema}.{table}
+    # - DB2: {topic.prefix}.{schema}.{table}
+    # - MongoDB: {topic.prefix}.{database}.{collection}
+    # - Vitess: {topic.prefix}.{keyspace}.{table}
+
+    # Note SQL Server allows for "database.names" (multiple databases) config,
+    # and so database is in the topic naming pattern.
+    # However, others have "database.dbname" which is a single database name. For these connectors,
+    # additional databases would require a different connector instance
+
+    # Connectors with 2-level container in pattern (database + schema)
+    # Others have either database XOR schema, but not both
+    DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
+        "io.debezium.connector.sqlserver.SqlServerConnector",
+    }
+
     @dataclass
     class DebeziumParser:
         source_platform: str
@@ -514,16 +540,45 @@ class DebeziumSourceConnector(BaseConnector):
         source_platform = parser.source_platform
         server_name = parser.server_name
         database_name = parser.database_name
+        # Escape server_name to handle cases where topic.prefix contains dots
+        # Some users configure topic.prefix like "my.server" which breaks the regex
+        server_name = server_name or ""
+        # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
+        topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

         if not self.connector_manifest.topic_names:
             return lineages

+        # Handle connectors with 2-level container (database + schema) in topic pattern
+        connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
+        maybe_duplicated_database_name = (
+            connector_class
+            in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
+        )
+
         for topic in self.connector_manifest.topic_names:
             found = re.search(re.compile(topic_naming_pattern), topic)
+            logger.debug(
+                f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
+            )

             if found:
+                # Extract the table part after server_name
+                table_part = found.group(2)
+
+                if (
+                    maybe_duplicated_database_name
+                    and database_name
+                    and table_part.startswith(f"{database_name}.")
+                ):
+                    table_part = table_part[len(database_name) + 1 :]
+
+                logger.debug(
+                    f"Extracted table part: '{table_part}' from topic '{topic}'"
+                )
+                # Apply database name to create final dataset name
+                table_name = get_dataset_name(database_name, table_part)
+                logger.debug(f"Final table name: '{table_name}'")

                 lineage = KafkaConnectLineage(
                     source_dataset=table_name,
```
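The Debezium changes widen the topic regex to accept both two-part (`schema.table`) and three-part (`database.schema.table`) suffixes, and strip the duplicated database segment for SQL Server style topics. A hedged re-implementation of that logic as a standalone function; the real code works on the parser objects and uses `get_dataset_name` to build the final name:

```python
import re
from typing import Optional


def parse_debezium_topic(
    topic: str, server_name: str, database_name: Optional[str], two_level: bool
) -> Optional[str]:
    # Illustrative sketch of the hunk above, not the connector's actual method.
    pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"
    found = re.search(pattern, topic)
    if not found:
        return None
    table_part = found.group(2)
    # SQL Server topics already carry the database; drop it to avoid "db.db.schema.table".
    if two_level and database_name and table_part.startswith(f"{database_name}."):
        table_part = table_part[len(database_name) + 1 :]
    return f"{database_name}.{table_part}" if database_name else table_part


print(parse_debezium_topic("my.server.testdb.dbo.orders", "my.server", "testdb", True))  # testdb.dbo.orders
print(parse_debezium_topic("my.server.public.orders", "my.server", "testdb", False))     # testdb.public.orders
```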
datahub/ingestion/source/mock_data/datahub_mock_data.py:

```diff
@@ -21,9 +21,13 @@ from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
 )
 from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
 from datahub.metadata.schema_classes import (
+    CalendarIntervalClass,
     DatasetLineageTypeClass,
+    DatasetProfileClass,
+    DatasetUsageStatisticsClass,
     StatusClass,
     SubTypesClass,
+    TimeWindowSizeClass,
     UpstreamClass,
     UpstreamLineageClass,
 )
@@ -278,6 +282,10 @@ class DataHubMockDataSource(Source):

                 yield self._get_subtypes_aspect(table_name, i, j)

+                yield self._get_profile_aspect(table_name)
+
+                yield self._get_usage_aspect(table_name)
+
                 yield from self._generate_lineage_for_table(
                     table_name=table_name,
                     table_level=i,
@@ -381,5 +389,42 @@ class DataHubMockDataSource(Source):
         )
         return mcp.as_workunit()

+    def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetProfileClass(
+                timestampMillis=0,
+                rowCount=100,
+                columnCount=10,
+                sizeInBytes=1000,
+            ),
+        )
+        return mcp.as_workunit()
+
+    def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetUsageStatisticsClass(
+                timestampMillis=0,
+                eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
+                uniqueUserCount=0,
+                totalSqlQueries=0,
+                topSqlQueries=[],
+                userCounts=[],
+                fieldCounts=[],
+            ),
+        )
+        return mcp.as_workunit()
+
     def get_report(self) -> SourceReport:
         return self.report
```
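The mock source now also emits a fixed dataset profile and an empty daily usage-statistics aspect per table. For reference, a minimal sketch of emitting the same kind of profile aspect directly with the REST emitter; the GMS URL and dataset name are placeholders:

```python
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DatasetProfileClass

# Placeholder server and dataset; illustrative only.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
mcp = MetadataChangeProposalWrapper(
    entityUrn=make_dataset_urn(platform="fake", name="table_1"),
    aspect=DatasetProfileClass(timestampMillis=0, rowCount=100, columnCount=10),
)
emitter.emit_mcp(mcp)
```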
datahub/ingestion/source/powerbi/powerbi.py:

```diff
@@ -294,8 +294,6 @@ class Mapper:
         logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}")

         mcp = MetadataChangeProposalWrapper(
-            entityType=Constant.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=ds_urn,
             aspect=upstream_lineage_class,
         )
@@ -538,9 +536,7 @@ class Mapper:
         profile.columnCount = table.column_count

         mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
             entityUrn=ds_urn,
-            aspectName="datasetProfile",
             aspect=profile,
         )
         dataset_mcps.append(mcp)
@@ -796,7 +792,6 @@ class Mapper:
             guid=container_key.guid(),
         )
         mcp = MetadataChangeProposalWrapper(
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=entity_urn,
             aspect=ContainerClass(container=f"{container_urn}"),
         )
```
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py:

```diff
@@ -673,7 +673,6 @@ class PowerBiAPI:
         fill_dashboard_tags()
         self._fill_independent_datasets(workspace=workspace)

-    # flake8: noqa: C901
     def fill_workspaces(
         self, workspaces: List[Workspace], reporter: PowerBiDashboardSourceReport
     ) -> Iterable[Workspace]:
```