acryl-datahub 1.1.0.5rc5__py3-none-any.whl → 1.1.0.5rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/METADATA +2603 -2603
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/RECORD +43 -39
- datahub/_version.py +1 -1
- datahub/ingestion/api/report.py +183 -35
- datahub/ingestion/autogenerated/capability_summary.json +3366 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +30 -128
- datahub/ingestion/run/pipeline.py +4 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +23 -22
- datahub/ingestion/source/bigquery_v2/queries.py +2 -2
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/data_lake_common/object_store.py +40 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/dremio/dremio_source.py +6 -3
- datahub/ingestion/source/gcs/gcs_source.py +4 -1
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/snowflake/snowflake_queries.py +47 -3
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +95 -10
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +10 -8
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +85 -4
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/schema.avsc +54 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
- datahub/sdk/lineage_client.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
- datahub/sql_parsing/sqlglot_lineage.py +2 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/top_level.txt +0 -0
@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-                f"{keyspace_name}: {PROFILING}"
-            ), ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,
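Several hunks in this release (Cassandra, Dremio, Redshift, GE profiler) make the same mechanical change: the context managers of a `with` statement are grouped inside one pair of parentheses. A minimal, self-contained sketch of the two spellings (not DataHub code; the parenthesized form is officially supported from Python 3.10):

```python
from concurrent.futures import ThreadPoolExecutor
from contextlib import nullcontext

# Old style: context managers joined by commas, wrapped by breaking inside call parentheses.
with nullcontext("stage"), ThreadPoolExecutor(max_workers=2) as executor:
    print(executor.submit(len, "abc").result())  # 3

# New style: one pair of parentheses groups all context managers (Python 3.10+).
with (
    nullcontext("stage"),
    ThreadPoolExecutor(max_workers=2) as executor,
):
    print(executor.submit(len, "abc").result())  # 3
```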
@@ -143,7 +143,7 @@ def create_source_capability_modifier_enum():
     for enum_class in source_enums:
         for member in enum_class:  # type: ignore[var-annotated]
             if member.name in all_values:
-                logger.
+                logger.debug(
                     f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
                 )
                 continue
@@ -519,6 +519,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +619,39 @@ class ObjectStoreSourceAdapter:
             return self.get_abs_external_url(table_data)
         return None
 
+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+
 
 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
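A standalone sketch of what the two new GCS helpers are expected to do; the functions below mirror the method bodies in the hunk but are plain functions with made-up inputs, not the ObjectStoreSourceAdapter itself:

```python
def normalize_gcs_uri_for_pattern_matching(uri: str) -> str:
    # Rewrite the scheme so gs:// paths can reuse the s3:// path-spec patterns.
    return uri.replace("gs://", "s3://", 1) if uri.startswith("gs://") else uri


def strip_gcs_prefix(uri: str) -> str:
    # Drop the scheme entirely, leaving "bucket/key" for schema extraction.
    return uri[len("gs://"):] if uri.startswith("gs://") else uri


assert normalize_gcs_uri_for_pattern_matching("gs://bucket/raw/data.parquet") == "s3://bucket/raw/data.parquet"
assert strip_gcs_prefix("gs://bucket/raw/data.parquet") == "bucket/raw/data.parquet"
```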
@@ -166,7 +166,6 @@ class PathSpec(ConfigModel):
         return False
 
     def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-        logger.debug(f"Checking file to inclusion: {path}")
         if self.is_path_hidden(path) and not self.include_hidden_folders:
             return False
 
@@ -174,19 +173,17 @@ class PathSpec(ConfigModel):
             self.glob_include, flags=pathlib.GLOBSTAR
         ):
             return False
-
+
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path).globmatch(
                     exclude_path, flags=pathlib.GLOBSTAR
                 ):
                     return False
-        logger.debug(f"{path} is not excluded")
 
         table_name, _ = self.extract_table_name_and_path(path)
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
 
         ext = os.path.splitext(path)[1].strip(".")
 
@@ -196,8 +193,6 @@ class PathSpec(ConfigModel):
         ):
             return False
 
-        logger.debug(f"{path} had selected extension {ext}")
-        logger.debug(f"{path} allowed for dataset creation")
         return True
 
     def dir_allowed(self, path: str) -> bool:
@@ -219,10 +214,8 @@ class PathSpec(ConfigModel):
         for _ in range(slash_to_remove_from_glob):
             glob_include = glob_include.rsplit("/", 1)[0]
 
-        logger.debug(f"Checking dir to inclusion: {path}")
         if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
             return False
-        logger.debug(f"{path} matched include ")
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -236,7 +229,7 @@ class PathSpec(ConfigModel):
         )
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
+        # logger.debug(f"{path} is passed table name check")
 
         return True
 
@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
         if parsable_include.endswith("/{table}/**"):
             # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
             parsable_include = parsable_include[:-2]
+
+        # Replace all * with {folder[i]} to make it parsable
+        for i in range(parsable_include.count("*")):
+            parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
         return parsable_include
 
     def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
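For reference, a simplified standalone version of the include-to-template rewrite above, applied to a made-up path spec (in the source this logic lives on PathSpec):

```python
def get_parsable_include(include: str) -> str:
    parsable_include = include
    if parsable_include.endswith("/{table}/**"):
        # Trim the trailing "**" that marks partition autodetection.
        parsable_include = parsable_include[:-2]
    # Replace each remaining * with a numbered {folder[i]} placeholder.
    for i in range(parsable_include.count("*")):
        parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
    return parsable_include


print(get_parsable_include("s3://bucket/*/*/{table}/**"))
# s3://bucket/{folder[0]}/{folder[1]}/{table}/
```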
@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
         if "{table}" in values["include"]:
             v = "{table}"
         else:
-            logger.debug(f"include fields: {compiled_include.named_fields}")
-            logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
             if not all(
                 x in compiled_include.named_fields
                 for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
     @cached_property
     def compiled_include(self):
         parsable_include = PathSpec.get_parsable_include(self.include)
-        logger.debug(f"parsable_include: {parsable_include}")
         compiled_include = parse.compile(parsable_include)
-        logger.debug(f"Setting compiled_include: {compiled_include}")
         return compiled_include
 
     @cached_property
@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
         parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
             "/", 1
         )[0]
-        logger.debug(f"parsable_folder_include: {parsable_folder_include}")
         compiled_folder_include = parse.compile(parsable_folder_include)
-
+
         return compiled_folder_include
 
     @cached_property
@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
         # Regular expression to find all substrings enclosed in {}
         pattern = r"\{(.*?)\}"
         # Find all matches
+        split_parts = self.include.split("{table}/")
+        matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
         return matches
 
     def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
@@ -563,7 +552,7 @@ class PathSpec(ConfigModel):
                         f"{{{template_key}}}", var[key]
                     )
                 else:
-                    partition_format.replace(f"{{{var_key}}}", var)
+                    partition_format = partition_format.replace(f"{{{var_key}}}", var)
         return datetime.datetime.strptime(partition_format, datetime_format).replace(
             tzinfo=datetime.timezone.utc
         )
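The one-line change above is a real bug fix: str.replace returns a new string and leaves the original untouched, so the old code silently discarded the substitution. A tiny illustration with a made-up partition format:

```python
partition_format = "year={year}/month={month}"

partition_format.replace("{year}", "2024")  # return value discarded - no effect
assert partition_format == "year={year}/month={month}"

partition_format = partition_format.replace("{year}", "2024")  # fixed: reassign
assert partition_format == "year=2024/month={month}"
```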
@@ -261,9 +261,12 @@ class DremioSource(StatefulIngestionSourceBase):
 
         # Profiling
         if self.config.is_profiling_enabled():
-            with self.report.new_stage(PROFILING), ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
+            with (
+                self.report.new_stage(PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(self.generate_profiles, dataset): dataset
                     for dataset in datasets
@@ -112,6 +112,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config
 
@@ -138,7 +139,9 @@ class GCSSource(StatefulIngestionSourceBase):
 
     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-        s3_source = S3Source(config, ctx)
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)
 
     def s3_source_overrides(self, source: S3Source) -> S3Source:
@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )
 
-        with PerfTimer() as timer, unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-            get_column_unique_count_dh_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-            _get_column_quantiles_bigquery_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-            _get_column_quantiles_awsathena_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-            _get_column_median_patch,
-        ), concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers
-        ) as async_executor, SQLAlchemyQueryCombiner(
-            enabled=self.config.query_combiner_enabled,
-            catch_exceptions=self.config.catch_exceptions,
-            is_single_row_query_method=_is_single_row_query_method,
-            serial_execution_fallback_enabled=True,
-        ).activate() as query_combiner:
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class ConfluentJDBCSourceConnector(BaseConnector):
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
             db_connection_url=connector_manifest.config.get("connection.uri"),
             source_platform="mongodb",
             database_name=connector_manifest.config.get("database"),
-            topic_prefix=connector_manifest.config.get("
+            topic_prefix=connector_manifest.config.get("topic.prefix"),
             transforms=(
                 connector_manifest.config["transforms"].split(",")
                 if "transforms" in connector_manifest.config
@@ -406,7 +408,11 @@ class MongoSourceConnector(BaseConnector):
         lineages: List[KafkaConnectLineage] = list()
         parser = self.get_parser(self.connector_manifest)
         source_platform = parser.source_platform
+        topic_prefix = parser.topic_prefix or ""
+
+        # Escape topic_prefix to handle cases where it contains dots
+        # Some users configure topic.prefix like "my.mongodb" which breaks the regex
+        topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"
 
         if not self.connector_manifest.topic_names:
             return lineages
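A small sketch of why the re.escape call matters when topic.prefix contains dots; the connector values below are hypothetical:

```python
import re

topic_prefix = "my.mongodb"                      # dotted prefix from connector config
own_topic = "my.mongodb.analytics.orders"
foreign_topic = "myxmongodb.analytics.orders"    # unrelated topic, "x" instead of "."

unescaped = rf"{topic_prefix}\.(\w+)\.(\w+)"
escaped = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

# Unescaped, the "." inside the prefix matches any character, so the foreign topic slips through.
assert re.search(unescaped, foreign_topic) is not None
# Escaped, only topics that literally start with "my.mongodb." match.
assert re.search(escaped, foreign_topic) is None
assert re.search(escaped, own_topic).groups() == ("analytics", "orders")  # database, collection
```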
@@ -429,6 +435,26 @@ class DebeziumSourceConnector(BaseConnector):
 
 @dataclass
 class DebeziumSourceConnector(BaseConnector):
+    # Debezium topic naming patterns by connector type
+    # - MySQL: {topic.prefix}.{database}.{table}
+    # - PostgreSQL: {topic.prefix}.{schema}.{table}
+    # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
+    # - Oracle: {topic.prefix}.{schema}.{table}
+    # - DB2: {topic.prefix}.{schema}.{table}
+    # - MongoDB: {topic.prefix}.{database}.{collection}
+    # - Vitess: {topic.prefix}.{keyspace}.{table}
+
+    # Note SQL Server allows for "database.names" (multiple databases) config,
+    # and so database is in the topic naming pattern.
+    # However, others have "database.dbname" which is a single database name. For these connectors,
+    # additional databases would require a different connector instance
+
+    # Connectors with 2-level container in pattern (database + schema)
+    # Others have either database XOR schema, but not both
+    DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
+        "io.debezium.connector.sqlserver.SqlServerConnector",
+    }
+
     @dataclass
     class DebeziumParser:
         source_platform: str
@@ -514,16 +540,45 @@ class DebeziumSourceConnector(BaseConnector):
         source_platform = parser.source_platform
         server_name = parser.server_name
         database_name = parser.database_name
+        # Escape server_name to handle cases where topic.prefix contains dots
+        # Some users configure topic.prefix like "my.server" which breaks the regex
+        server_name = server_name or ""
+        # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
+        topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"
 
         if not self.connector_manifest.topic_names:
             return lineages
 
+        # Handle connectors with 2-level container (database + schema) in topic pattern
+        connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
+        maybe_duplicated_database_name = (
+            connector_class
+            in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
+        )
+
         for topic in self.connector_manifest.topic_names:
             found = re.search(re.compile(topic_naming_pattern), topic)
+            logger.debug(
+                f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
+            )
 
             if found:
+                # Extract the table part after server_name
+                table_part = found.group(2)
+
+                if (
+                    maybe_duplicated_database_name
+                    and database_name
+                    and table_part.startswith(f"{database_name}.")
+                ):
+                    table_part = table_part[len(database_name) + 1 :]
+
+                logger.debug(
+                    f"Extracted table part: '{table_part}' from topic '{topic}'"
+                )
+                # Apply database name to create final dataset name
+                table_name = get_dataset_name(database_name, table_part)
+                logger.debug(f"Final table name: '{table_name}'")
 
                 lineage = KafkaConnectLineage(
                     source_dataset=table_name,
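A sketch of how the widened regex and the duplicate-database handling combine for a hypothetical SQL Server topic (values invented; get_dataset_name itself is not reproduced here):

```python
import re

server_name = "prod.sqlserver"                   # dotted topic.prefix, hence re.escape
database_name = "sales"
topic = "prod.sqlserver.sales.dbo.orders"        # {prefix}.{database}.{schema}.{table}

pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"
found = re.search(pattern, topic)
assert found is not None

table_part = found.group(2)                      # "sales.dbo.orders"
if table_part.startswith(f"{database_name}."):
    table_part = table_part[len(database_name) + 1:]
assert table_part == "dbo.orders"                # database no longer duplicated

# get_dataset_name(database_name, table_part) then recombines them into the final
# dataset name, e.g. "sales.dbo.orders".
```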
@@ -21,9 +21,13 @@ from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
 )
 from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
 from datahub.metadata.schema_classes import (
+    CalendarIntervalClass,
     DatasetLineageTypeClass,
+    DatasetProfileClass,
+    DatasetUsageStatisticsClass,
     StatusClass,
     SubTypesClass,
+    TimeWindowSizeClass,
     UpstreamClass,
     UpstreamLineageClass,
 )
@@ -278,6 +282,10 @@ class DataHubMockDataSource(Source):
 
                 yield self._get_subtypes_aspect(table_name, i, j)
 
+                yield self._get_profile_aspect(table_name)
+
+                yield self._get_usage_aspect(table_name)
+
                 yield from self._generate_lineage_for_table(
                     table_name=table_name,
                     table_level=i,
@@ -381,5 +389,42 @@ class DataHubMockDataSource(Source):
         )
         return mcp.as_workunit()
 
+    def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetProfileClass(
+                timestampMillis=0,
+                rowCount=100,
+                columnCount=10,
+                sizeInBytes=1000,
+            ),
+        )
+        return mcp.as_workunit()
+
+    def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetUsageStatisticsClass(
+                timestampMillis=0,
+                eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
+                uniqueUserCount=0,
+                totalSqlQueries=0,
+                topSqlQueries=[],
+                userCounts=[],
+                fieldCounts=[],
+            ),
+        )
+        return mcp.as_workunit()
+
     def get_report(self) -> SourceReport:
         return self.report
@@ -182,9 +182,10 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            with self.report.new_stage(
-                USAGE_EXTRACTION_OPERATIONAL_STATS
-            ), PerfTimer() as timer:
+            with (
+                self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
+                PerfTimer() as timer,
+            ):
                 # Generate operation aspect workunits
                 yield from self._gen_operation_aspect_workunits(
                     self.connection, all_tables
@@ -1,19 +1,21 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclasses.dataclass
 class DataLakeSourceReport(StaleEntityRemovalSourceReport):
     files_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
+    number_of_files_filtered: int = 0
 
     def report_file_scanned(self) -> None:
         self.files_scanned += 1
 
     def report_file_dropped(self, file: str) -> None:
         self.filtered.append(file)
+        self.number_of_files_filtered += 1