acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of acryl-datahub is flagged as potentially problematic.

Files changed (78):
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/api/report.py +183 -35
  26. datahub/ingestion/autogenerated/capability_summary.json +3431 -0
  27. datahub/ingestion/autogenerated/lineage.json +401 -0
  28. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  29. datahub/ingestion/extractor/schema_util.py +13 -4
  30. datahub/ingestion/graph/client.py +2 -2
  31. datahub/ingestion/run/pipeline.py +47 -1
  32. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  33. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  34. datahub/ingestion/source/common/subtypes.py +1 -1
  35. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  37. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  38. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  39. datahub/ingestion/source/ge_data_profiler.py +28 -20
  40. datahub/ingestion/source/identity/okta.py +0 -13
  41. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  42. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  43. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  44. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  45. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  46. datahub/ingestion/source/redshift/usage.py +4 -3
  47. datahub/ingestion/source/s3/source.py +19 -3
  48. datahub/ingestion/source/sigma/sigma.py +6 -1
  49. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  50. datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
  51. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  52. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  53. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  54. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  55. datahub/ingestion/source/sql/sql_common.py +4 -0
  56. datahub/ingestion/source/sql/vertica.py +0 -4
  57. datahub/ingestion/source/sql_queries.py +2 -2
  58. datahub/ingestion/source/superset.py +56 -1
  59. datahub/ingestion/source/tableau/tableau.py +40 -34
  60. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  61. datahub/ingestion/source/unity/proxy.py +4 -3
  62. datahub/ingestion/source/unity/source.py +19 -9
  63. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  64. datahub/metadata/_internal_schema_classes.py +85 -4
  65. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  66. datahub/metadata/schema.avsc +54 -1
  67. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  68. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  69. datahub/sdk/lineage_client.py +2 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
  71. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  72. datahub/upgrade/upgrade.py +46 -13
  73. datahub/utilities/server_config_util.py +8 -0
  74. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  75. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
  76. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
  77. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
  78. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline.py

@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
+from datahub.upgrade.upgrade import (
+    is_server_default_cli_ahead,
+    retrieve_version_stats,
+)
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -171,7 +175,10 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()

-        with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
+        with (
+            contextlib.ExitStack() as exit_stack,
+            contextlib.ExitStack() as inner_exit_stack,
+        ):
             self.graph: Optional[DataHubGraph] = None
             with _add_init_error_context("connect to DataHub"):
                 if self.config.datahub_api:
@@ -340,6 +347,44 @@ class Pipeline:
             except Exception as e:
                 logger.warning("Reporting failed on start", exc_info=e)

+    def _warn_old_cli_version(self) -> None:
+        """
+        Check if the server default CLI version is ahead of the CLI version being used.
+        If so, add a warning to the report.
+        """
+
+        try:
+            version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
+        except RuntimeError as e:
+            # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
+            if "no current event loop" in str(e):
+                logger.debug("Skipping version check - no event loop available")
+                return
+            raise
+
+        if not version_stats or not self.graph:
+            return
+
+        if is_server_default_cli_ahead(version_stats):
+            server_default_version = (
+                version_stats.server.current_server_default_cli_version.version
+                if version_stats.server.current_server_default_cli_version
+                else None
+            )
+            current_version = version_stats.client.current.version
+
+            logger.debug(f"""
+                client_version: {current_version}
+                server_default_version: {server_default_version}
+                server_default_cli_ahead: True
+            """)
+
+            self.source.get_report().warning(
+                title="Server default CLI version is ahead of CLI version",
+                message="Please upgrade the CLI version being used",
+                context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
+            )
+
     def _notify_reporters_on_ingestion_completion(self) -> None:
         for reporter in self.reporters:
             try:
@@ -396,6 +441,7 @@
             return False

     def run(self) -> None:
+        self._warn_old_cli_version()
         with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray
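
The new `_warn_old_cli_version` hook relies on `retrieve_version_stats` and `is_server_default_cli_ahead` from `datahub.upgrade.upgrade` (also updated in this release). As a rough illustration of the comparison behind the warning (not DataHub's actual implementation), a PEP 440 check of the two version strings might look like this:

    from packaging.version import Version

    def server_default_cli_ahead(client_version: str, server_default_version: str) -> bool:
        # The warning fires when the server's recommended CLI is newer than the
        # CLI running the ingestion.
        return Version(server_default_version) > Version(client_version)

    print(server_default_cli_ahead("1.1.0.5rc6", "1.1.0.5rc8"))  # True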

datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -45,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
@@ -78,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -288,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return

-            with self.report.new_stage(
-                f"*: {QUERIES_EXTRACTION}"
-            ), BigQueryQueriesExtractor(
-                connection=self.config.get_bigquery_client(),
-                schema_api=self.bq_schema_extractor.schema_api,
-                config=BigQueryQueriesExtractorConfig(
-                    window=self.config,
-                    user_email_pattern=self.config.usage.user_email_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_statistics,
-                    include_operations=self.config.usage.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    region_qualifiers=self.config.region_qualifiers,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=self.sql_parser_schema_resolver,
-                discovered_tables=self.bq_schema_extractor.table_refs,
-            ) as queries_extractor:
+            with (
+                self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor,
+            ):
                 self.report.queries_extractor = queries_extractor.report
                 yield from queries_extractor.get_workunits_internal()
         else:

datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-                f"{keyspace_name}: {PROFILING}"
-            ), ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,

datahub/ingestion/source/common/subtypes.py

@@ -143,7 +143,7 @@ def create_source_capability_modifier_enum():
    for enum_class in source_enums:
        for member in enum_class:  # type: ignore[var-annotated]
            if member.name in all_values:
-                logger.error(
+                logger.debug(
                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
                )
                continue

datahub/ingestion/source/data_lake_common/object_store.py

@@ -519,6 +519,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +619,39 @@
             return self.get_abs_external_url(table_data)
         return None

+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+

 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
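
The two helpers registered above let GCS URIs flow through matching code that was written for S3. As a rough, standalone illustration of the effect, using `fnmatch` as a stand-in for the path-spec matching layer (the bucket name and pattern below are hypothetical):

    from fnmatch import fnmatch

    def normalize_gcs_uri(uri: str) -> str:
        # Same idea as _normalize_gcs_uri_for_pattern_matching: present gs:// URIs
        # as s3:// so that S3-oriented patterns also apply to GCS paths.
        return uri.replace("gs://", "s3://", 1) if uri.startswith("gs://") else uri

    pattern = "s3://analytics-bucket/events/*/*.parquet"  # hypothetical include pattern
    uri = "gs://analytics-bucket/events/2024-01-01/part-000.parquet"

    print(fnmatch(uri, pattern))                     # False: raw gs:// URI misses the pattern
    print(fnmatch(normalize_gcs_uri(uri), pattern))  # True: normalized URI matches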

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -12,7 +12,7 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList

 logger = logging.getLogger(__name__)
@@ -374,7 +374,6 @@ class DataHubDatabaseReader:
                     entityUrn=row["urn"],
                     aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                     systemMetadata=system_metadata,
-                    changeType=ChangeTypeClass.UPSERT,
                 )
             except Exception as e:
                 logger.warning(

datahub/ingestion/source/dremio/dremio_source.py

@@ -61,7 +61,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.schema_classes import ChangeTypeClass, SchemaMetadataClass
+from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
@@ -261,9 +261,12 @@ class DremioSource(StatefulIngestionSourceBase):

         # Profiling
         if self.config.is_profiling_enabled():
-            with self.report.new_stage(PROFILING), ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
+            with (
+                self.report.new_stage(PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(self.generate_profiles, dataset): dataset
                     for dataset in datasets
@@ -430,11 +433,8 @@
             ]
         )
         mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
             entityUrn=dataset_urn,
-            aspectName=lineage.ASPECT_NAME,
             aspect=lineage,
-            changeType=ChangeTypeClass.UPSERT,
         )

         for upstream_urn in upstream_urns:
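
This hunk, like the Okta, PowerBI, and DataHub-source hunks elsewhere in this diff, drops arguments that `MetadataChangeProposalWrapper` can fill in itself: `entityType` and `aspectName` are inferred from the URN and the aspect, and `changeType` defaults to UPSERT. A minimal sketch of the shortened form (the URN is a made-up example):

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass

    # entityType, aspectName, and changeType are all derived by the wrapper.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:corpuser:example.user",  # hypothetical URN
        aspect=StatusClass(removed=False),
    )
    workunit = mcp.as_workunit()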

datahub/ingestion/source/gcs/gcs_source.py

@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (
@@ -82,7 +83,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -112,6 +120,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config

@@ -138,7 +147,9 @@

     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-        s3_source = S3Source(config, PipelineContext(ctx.run_id))
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)

     def s3_source_overrides(self, source: S3Source) -> S3Source:

datahub/ingestion/source/ge_data_profiler.py

@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )

-        with PerfTimer() as timer, unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-            get_column_unique_count_dh_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-            _get_column_quantiles_bigquery_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-            _get_column_quantiles_awsathena_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-            _get_column_median_patch,
-        ), concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers
-        ) as async_executor, SQLAlchemyQueryCombiner(
-            enabled=self.config.query_combiner_enabled,
-            catch_exceptions=self.config.catch_exceptions,
-            is_single_row_query_method=_is_single_row_query_method,
-            serial_execution_fallback_enabled=True,
-        ).activate() as query_combiner:
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
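
This is the largest of several hunks in the release (pipeline.py, bigquery.py, cassandra_profiling.py, dremio_source.py) that only reformat chained context managers into the parenthesized `with` syntax added in Python 3.10, so each manager can sit on its own line with a trailing comma. A minimal sketch of the two equivalent forms:

    from contextlib import ExitStack

    # Single-line form: readable for two managers, unwieldy for six.
    with ExitStack() as outer, ExitStack() as inner:
        pass

    # Parenthesized form (Python 3.10+): one manager per line, trailing comma allowed.
    with (
        ExitStack() as outer,
        ExitStack() as inner,
    ):
        pass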

datahub/ingestion/source/identity/okta.py

@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     CorpGroupInfoClass,
     CorpUserInfoClass,
     GroupMembershipClass,
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
            yield MetadataWorkUnit(id=wu_id, mce=mce)

            yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
            ).as_workunit()

            yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                aspect=StatusClass(removed=False),
            ).as_workunit()

@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
            yield MetadataWorkUnit(id=wu_id, mce=mce)

            yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
            ).as_workunit()

            yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                aspect=StatusClass(removed=False),
            ).as_workunit()


datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )

+logger = logging.getLogger(__name__)
+

 @dataclass
 class ConfluentJDBCSourceConnector(BaseConnector):
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
            db_connection_url=connector_manifest.config.get("connection.uri"),
            source_platform="mongodb",
            database_name=connector_manifest.config.get("database"),
-            topic_prefix=connector_manifest.config.get("topic_prefix"),
+            topic_prefix=connector_manifest.config.get("topic.prefix"),
            transforms=(
                connector_manifest.config["transforms"].split(",")
                if "transforms" in connector_manifest.config
@@ -406,7 +408,11 @@
        lineages: List[KafkaConnectLineage] = list()
        parser = self.get_parser(self.connector_manifest)
        source_platform = parser.source_platform
-        topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+        topic_prefix = parser.topic_prefix or ""
+
+        # Escape topic_prefix to handle cases where it contains dots
+        # Some users configure topic.prefix like "my.mongodb" which breaks the regex
+        topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

        if not self.connector_manifest.topic_names:
            return lineages
@@ -429,6 +435,26 @@

 @dataclass
 class DebeziumSourceConnector(BaseConnector):
+    # Debezium topic naming patterns by connector type
+    # - MySQL: {topic.prefix}.{database}.{table}
+    # - PostgreSQL: {topic.prefix}.{schema}.{table}
+    # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
+    # - Oracle: {topic.prefix}.{schema}.{table}
+    # - DB2: {topic.prefix}.{schema}.{table}
+    # - MongoDB: {topic.prefix}.{database}.{collection}
+    # - Vitess: {topic.prefix}.{keyspace}.{table}
+
+    # Note SQL Server allows for "database.names" (multiple databases) config,
+    # and so database is in the topic naming pattern.
+    # However, others have "database.dbname" which is a single database name. For these connectors,
+    # additional databases would require a different connector instance
+
+    # Connectors with 2-level container in pattern (database + schema)
+    # Others have either database XOR schema, but not both
+    DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
+        "io.debezium.connector.sqlserver.SqlServerConnector",
+    }
+
     @dataclass
     class DebeziumParser:
         source_platform: str
@@ -514,16 +540,45 @@ class DebeziumSourceConnector(BaseConnector):
        source_platform = parser.source_platform
        server_name = parser.server_name
        database_name = parser.database_name
-        topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+        # Escape server_name to handle cases where topic.prefix contains dots
+        # Some users configure topic.prefix like "my.server" which breaks the regex
+        server_name = server_name or ""
+        # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
+        topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

        if not self.connector_manifest.topic_names:
            return lineages

+        # Handle connectors with 2-level container (database + schema) in topic pattern
+        connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
+        maybe_duplicated_database_name = (
+            connector_class
+            in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
+        )
+
        for topic in self.connector_manifest.topic_names:
            found = re.search(re.compile(topic_naming_pattern), topic)
+            logger.debug(
+                f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
+            )

            if found:
-                table_name = get_dataset_name(database_name, found.group(2))
+                # Extract the table part after server_name
+                table_part = found.group(2)
+
+                if (
+                    maybe_duplicated_database_name
+                    and database_name
+                    and table_part.startswith(f"{database_name}.")
+                ):
+                    table_part = table_part[len(database_name) + 1 :]
+
+                logger.debug(
+                    f"Extracted table part: '{table_part}' from topic '{topic}'"
+                )
+                # Apply database name to create final dataset name
+                table_name = get_dataset_name(database_name, table_part)
+                logger.debug(f"Final table name: '{table_name}'")

                lineage = KafkaConnectLineage(
                    source_dataset=table_name,
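
Both regex changes above embed the configured prefix with `re.escape`, so a `topic.prefix` containing dots is matched literally, and the Debezium pattern's optional third `\w+` group accepts SQL Server's database.schema.table topics alongside the usual two-part names. A standalone sketch of how the new pattern behaves (the prefixes and topic names are made up for illustration):

    import re
    from typing import Optional

    def table_part(topic: str, prefix: str) -> Optional[str]:
        # Mirrors the escaped Debezium pattern: a literal prefix, then a
        # two-part or optionally three-part dotted table identifier.
        pattern = rf"({re.escape(prefix)})\.(\w+\.\w+(?:\.\w+)?)"
        found = re.search(pattern, topic)
        return found.group(2) if found else None

    print(table_part("my.server.public.orders", "my.server"))  # public.orders
    print(table_part("cdc.inventory.dbo.customers", "cdc"))     # inventory.dbo.customers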

datahub/ingestion/source/mock_data/datahub_mock_data.py

@@ -21,9 +21,13 @@ from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
 )
 from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
 from datahub.metadata.schema_classes import (
+    CalendarIntervalClass,
     DatasetLineageTypeClass,
+    DatasetProfileClass,
+    DatasetUsageStatisticsClass,
     StatusClass,
     SubTypesClass,
+    TimeWindowSizeClass,
     UpstreamClass,
     UpstreamLineageClass,
 )
@@ -278,6 +282,10 @@ class DataHubMockDataSource(Source):

                yield self._get_subtypes_aspect(table_name, i, j)

+                yield self._get_profile_aspect(table_name)
+
+                yield self._get_usage_aspect(table_name)
+
                yield from self._generate_lineage_for_table(
                    table_name=table_name,
                    table_level=i,
@@ -381,5 +389,42 @@
         )
         return mcp.as_workunit()

+    def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetProfileClass(
+                timestampMillis=0,
+                rowCount=100,
+                columnCount=10,
+                sizeInBytes=1000,
+            ),
+        )
+        return mcp.as_workunit()
+
+    def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetUsageStatisticsClass(
+                timestampMillis=0,
+                eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
+                uniqueUserCount=0,
+                totalSqlQueries=0,
+                topSqlQueries=[],
+                userCounts=[],
+                fieldCounts=[],
+            ),
+        )
+        return mcp.as_workunit()
+
     def get_report(self) -> SourceReport:
         return self.report

datahub/ingestion/source/powerbi/powerbi.py

@@ -294,8 +294,6 @@ class Mapper:
        logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}")

        mcp = MetadataChangeProposalWrapper(
-            entityType=Constant.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
            entityUrn=ds_urn,
            aspect=upstream_lineage_class,
        )
@@ -538,9 +536,7 @@ class Mapper:
            profile.columnCount = table.column_count

        mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
            entityUrn=ds_urn,
-            aspectName="datasetProfile",
            aspect=profile,
        )
        dataset_mcps.append(mcp)
@@ -796,7 +792,6 @@ class Mapper:
            guid=container_key.guid(),
        )
        mcp = MetadataChangeProposalWrapper(
-            changeType=ChangeTypeClass.UPSERT,
            entityUrn=entity_urn,
            aspect=ContainerClass(container=f"{container_urn}"),
        )

datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py

@@ -673,7 +673,6 @@ class PowerBiAPI:
        fill_dashboard_tags()
        self._fill_independent_datasets(workspace=workspace)

-    # flake8: noqa: C901
    def fill_workspaces(
        self, workspaces: List[Workspace], reporter: PowerBiDashboardSourceReport
    ) -> Iterable[Workspace]: