acryl-datahub 1.1.0.5rc5__py3-none-any.whl → 1.1.0.5rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (43)
  1. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/METADATA +2603 -2603
  2. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/RECORD +43 -39
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/api/report.py +183 -35
  5. datahub/ingestion/autogenerated/capability_summary.json +3366 -0
  6. datahub/ingestion/autogenerated/lineage.json +401 -0
  7. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  8. datahub/ingestion/run/pipeline.py +4 -1
  9. datahub/ingestion/source/bigquery_v2/bigquery.py +23 -22
  10. datahub/ingestion/source/bigquery_v2/queries.py +2 -2
  11. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  12. datahub/ingestion/source/common/subtypes.py +1 -1
  13. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  15. datahub/ingestion/source/dremio/dremio_source.py +6 -3
  16. datahub/ingestion/source/gcs/gcs_source.py +4 -1
  17. datahub/ingestion/source/ge_data_profiler.py +28 -20
  18. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  19. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  20. datahub/ingestion/source/redshift/usage.py +4 -3
  21. datahub/ingestion/source/s3/report.py +4 -2
  22. datahub/ingestion/source/s3/source.py +367 -115
  23. datahub/ingestion/source/snowflake/snowflake_queries.py +47 -3
  24. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  25. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  26. datahub/ingestion/source/sql/athena.py +95 -10
  27. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  28. datahub/ingestion/source/unity/proxy.py +4 -3
  29. datahub/ingestion/source/unity/source.py +10 -8
  30. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  31. datahub/metadata/_internal_schema_classes.py +85 -4
  32. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  33. datahub/metadata/schema.avsc +54 -1
  34. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  35. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  36. datahub/sdk/lineage_client.py +2 -0
  37. datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
  38. datahub/sql_parsing/sqlglot_lineage.py +2 -0
  39. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  40. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/WHEEL +0 -0
  41. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/entry_points.txt +0 -0
  42. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/licenses/LICENSE +0 -0
  43. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/top_level.txt +0 -0
@@ -70,11 +70,12 @@ class CassandraProfiler:
      ) -> Iterable[MetadataWorkUnit]:
          for keyspace_name in cassandra_data.keyspaces:
              tables = cassandra_data.tables.get(keyspace_name, [])
-             with self.report.new_stage(
-                 f"{keyspace_name}: {PROFILING}"
-             ), ThreadPoolExecutor(
-                 max_workers=self.config.profiling.max_workers
-             ) as executor:
+             with (
+                 self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                 ThreadPoolExecutor(
+                     max_workers=self.config.profiling.max_workers
+                 ) as executor,
+             ):
                  future_to_dataset = {
                      executor.submit(
                          self.generate_profile,
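
Several hunks in this release (CassandraProfiler above, plus DremioSource, DatahubGEProfiler, and RedshiftUsageExtractor below) rewrite multi-manager with statements into the parenthesized form. A minimal standalone sketch of that form, using nullcontext as a stand-in for report.new_stage() (everything here is illustrative, not package code; the syntax is documented for Python 3.10+):

    from concurrent.futures import ThreadPoolExecutor
    from contextlib import nullcontext

    with (
        nullcontext() as stage,  # stand-in for self.report.new_stage(...)
        ThreadPoolExecutor(max_workers=4) as executor,
    ):
        print(executor.submit(sum, [1, 2, 3]).result())  # prints 6
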
@@ -143,7 +143,7 @@ def create_source_capability_modifier_enum():
      for enum_class in source_enums:
          for member in enum_class:  # type: ignore[var-annotated]
              if member.name in all_values:
-                 logger.error(
+                 logger.debug(
                      f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
                  )
                  continue
@@ -519,6 +519,13 @@ class ObjectStoreSourceAdapter:
519
519
  "get_external_url",
520
520
  lambda table_data: self.get_gcs_external_url(table_data),
521
521
  )
522
+ # Fix URI mismatch issue in pattern matching
523
+ self.register_customization(
524
+ "_normalize_uri_for_pattern_matching",
525
+ self._normalize_gcs_uri_for_pattern_matching,
526
+ )
527
+ # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
528
+ self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
522
529
  elif platform == "s3":
523
530
  self.register_customization("is_s3_platform", lambda: True)
524
531
  self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +619,39 @@ class ObjectStoreSourceAdapter:
              return self.get_abs_external_url(table_data)
          return None

+     def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+         """
+         Normalize GCS URI for pattern matching.
+
+         This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+         fixing the URI mismatch issue in GCS ingestion.
+
+         Args:
+             uri: The URI to normalize
+
+         Returns:
+             The normalized URI for pattern matching
+         """
+         if uri.startswith("gs://"):
+             return uri.replace("gs://", "s3://", 1)
+         return uri
+
+     def _strip_gcs_prefix(self, uri: str) -> str:
+         """
+         Strip GCS prefix from URI.
+
+         This method removes the gs:// prefix from GCS URIs for path processing.
+
+         Args:
+             uri: The URI to strip the prefix from
+
+         Returns:
+             The URI without the gs:// prefix
+         """
+         if uri.startswith("gs://"):
+             return uri[5:]  # Remove "gs://" prefix
+         return uri
+

  # Factory function to create an adapter for a specific platform
  def create_object_store_adapter(
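
For illustration only, a standalone check of the behavior documented in the two new helpers above (the logic is copied from the hunk; the bucket path is made up):

    def _normalize_gcs_uri_for_pattern_matching(uri: str) -> str:
        # Same logic as the new adapter method: rewrite gs:// to s3:// so the
        # existing S3-oriented path-spec matching also applies to GCS URIs.
        return uri.replace("gs://", "s3://", 1) if uri.startswith("gs://") else uri

    def _strip_gcs_prefix(uri: str) -> str:
        # Same logic as the new adapter method: drop the gs:// scheme for path handling.
        return uri[len("gs://"):] if uri.startswith("gs://") else uri

    print(_normalize_gcs_uri_for_pattern_matching("gs://bucket/data/file.parquet"))  # s3://bucket/data/file.parquet
    print(_strip_gcs_prefix("gs://bucket/data/file.parquet"))                        # bucket/data/file.parquet
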
@@ -166,7 +166,6 @@ class PathSpec(ConfigModel):
          return False

      def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-         logger.debug(f"Checking file to inclusion: {path}")
          if self.is_path_hidden(path) and not self.include_hidden_folders:
              return False

@@ -174,19 +173,17 @@ class PathSpec(ConfigModel):
              self.glob_include, flags=pathlib.GLOBSTAR
          ):
              return False
-         logger.debug(f"{path} matched include ")
+
          if self.exclude:
              for exclude_path in self.exclude:
                  if pathlib.PurePath(path).globmatch(
                      exclude_path, flags=pathlib.GLOBSTAR
                  ):
                      return False
-             logger.debug(f"{path} is not excluded")

          table_name, _ = self.extract_table_name_and_path(path)
          if not self.tables_filter_pattern.allowed(table_name):
              return False
-         logger.debug(f"{path} is passed table name check")

          ext = os.path.splitext(path)[1].strip(".")

@@ -196,8 +193,6 @@ class PathSpec(ConfigModel):
          ):
              return False

-         logger.debug(f"{path} had selected extension {ext}")
-         logger.debug(f"{path} allowed for dataset creation")
          return True

      def dir_allowed(self, path: str) -> bool:
@@ -219,10 +214,8 @@ class PathSpec(ConfigModel):
          for _ in range(slash_to_remove_from_glob):
              glob_include = glob_include.rsplit("/", 1)[0]

-         logger.debug(f"Checking dir to inclusion: {path}")
          if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
              return False
-         logger.debug(f"{path} matched include ")
          if self.exclude:
              for exclude_path in self.exclude:
                  if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -236,7 +229,7 @@ class PathSpec(ConfigModel):
          )
          if not self.tables_filter_pattern.allowed(table_name):
              return False
-         logger.debug(f"{path} is passed table name check")
+         # logger.debug(f"{path} is passed table name check")

          return True

@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
          if parsable_include.endswith("/{table}/**"):
              # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
              parsable_include = parsable_include[:-2]
-         else:
-             # Replace all * with {folder[i]} to make it parsable
-             for i in range(parsable_include.count("*")):
-                 parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
+
+         # Replace all * with {folder[i]} to make it parsable
+         for i in range(parsable_include.count("*")):
+             parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
          return parsable_include

      def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
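
Worked example of the rewritten logic above (the include pattern is hypothetical): with the else branch removed, the * substitution now also runs for includes that end in /{table}/**.

    include = "s3://bucket/*/logs/{table}/**"        # hypothetical path spec include
    parsable = include[:-2] if include.endswith("/{table}/**") else include
    for i in range(parsable.count("*")):
        parsable = parsable.replace("*", f"{{folder[{i}]}}", 1)
    print(parsable)  # s3://bucket/{folder[0]}/logs/{table}/
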
@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
          if "{table}" in values["include"]:
              v = "{table}"
          else:
-             logger.debug(f"include fields: {compiled_include.named_fields}")
-             logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
              if not all(
                  x in compiled_include.named_fields
                  for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
      @cached_property
      def compiled_include(self):
          parsable_include = PathSpec.get_parsable_include(self.include)
-         logger.debug(f"parsable_include: {parsable_include}")
          compiled_include = parse.compile(parsable_include)
-         logger.debug(f"Setting compiled_include: {compiled_include}")
          return compiled_include

      @cached_property
@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
          parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
              "/", 1
          )[0]
-         logger.debug(f"parsable_folder_include: {parsable_folder_include}")
          compiled_folder_include = parse.compile(parsable_folder_include)
-         logger.debug(f"Setting compiled_folder_include: {compiled_folder_include}")
+
          return compiled_folder_include

      @cached_property
@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
          # Regular expression to find all substrings enclosed in {}
          pattern = r"\{(.*?)\}"
          # Find all matches
-         matches = re.findall(pattern, self.include.split("{table}/")[1])
+         split_parts = self.include.split("{table}/")
+         matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
          return matches

      def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
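
The new guard avoids an IndexError when the include has no {table} component; a small illustration with a made-up include:

    import re

    include = "s3://bucket/data/*.parquet"           # no "{table}/" in the include
    split_parts = include.split("{table}/")
    matches = re.findall(r"\{(.*?)\}", split_parts[1]) if len(split_parts) > 1 else []
    print(matches)  # [] instead of an IndexError on split_parts[1]
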
@@ -563,7 +552,7 @@ class PathSpec(ConfigModel):
                          f"{{{template_key}}}", var[key]
                      )
              else:
-                 partition_format.replace(f"{{{var_key}}}", var)
+                 partition_format = partition_format.replace(f"{{{var_key}}}", var)
          return datetime.datetime.strptime(partition_format, datetime_format).replace(
              tzinfo=datetime.timezone.utc
          )
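
This one-assignment fix matters because str.replace returns a new string rather than mutating in place; before the change the substitution result was discarded. Minimal illustration (made-up format string):

    partition_format = "year={year}/month={month}"
    partition_format.replace("{year}", "2024")                      # old code: result thrown away
    partition_format = partition_format.replace("{year}", "2024")   # fixed: keep the result
    print(partition_format)  # year=2024/month={month}
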
@@ -261,9 +261,12 @@ class DremioSource(StatefulIngestionSourceBase):

          # Profiling
          if self.config.is_profiling_enabled():
-             with self.report.new_stage(PROFILING), ThreadPoolExecutor(
-                 max_workers=self.config.profiling.max_workers
-             ) as executor:
+             with (
+                 self.report.new_stage(PROFILING),
+                 ThreadPoolExecutor(
+                     max_workers=self.config.profiling.max_workers
+                 ) as executor,
+             ):
                  future_to_dataset = {
                      executor.submit(self.generate_profiles, dataset): dataset
                      for dataset in datasets
@@ -112,6 +112,7 @@ class GCSSource(StatefulIngestionSourceBase):
              env=self.config.env,
              max_rows=self.config.max_rows,
              number_of_files_to_sample=self.config.number_of_files_to_sample,
+             platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
          )
          return s3_config

@@ -138,7 +139,9 @@ class GCSSource(StatefulIngestionSourceBase):

      def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
          config = self.create_equivalent_s3_config()
-         s3_source = S3Source(config, PipelineContext(ctx.run_id))
+         # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+         s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+         s3_source = S3Source(config, s3_ctx)
          return self.s3_source_overrides(s3_source)

      def s3_source_overrides(self, source: S3Source) -> S3Source:
@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
              f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
          )

-         with PerfTimer() as timer, unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-             get_column_unique_count_dh_patch,
-         ), unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-             _get_column_quantiles_bigquery_patch,
-         ), unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-             _get_column_quantiles_awsathena_patch,
-         ), unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-             _get_column_median_patch,
-         ), concurrent.futures.ThreadPoolExecutor(
-             max_workers=max_workers
-         ) as async_executor, SQLAlchemyQueryCombiner(
-             enabled=self.config.query_combiner_enabled,
-             catch_exceptions=self.config.catch_exceptions,
-             is_single_row_query_method=_is_single_row_query_method,
-             serial_execution_fallback_enabled=True,
-         ).activate() as query_combiner:
+         with (
+             PerfTimer() as timer,
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                 get_column_unique_count_dh_patch,
+             ),
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                 _get_column_quantiles_bigquery_patch,
+             ),
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                 _get_column_quantiles_awsathena_patch,
+             ),
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                 _get_column_median_patch,
+             ),
+             concurrent.futures.ThreadPoolExecutor(
+                 max_workers=max_workers
+             ) as async_executor,
+             SQLAlchemyQueryCombiner(
+                 enabled=self.config.query_combiner_enabled,
+                 catch_exceptions=self.config.catch_exceptions,
+                 is_single_row_query_method=_is_single_row_query_method,
+                 serial_execution_fallback_enabled=True,
+             ).activate() as query_combiner,
+         ):
              # Submit the profiling requests to the thread pool executor.
              async_profiles = collections.deque(
                  async_executor.submit(
@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
      get_platform_from_sqlalchemy_uri,
  )

+ logger = logging.getLogger(__name__)
+

  @dataclass
  class ConfluentJDBCSourceConnector(BaseConnector):
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
              db_connection_url=connector_manifest.config.get("connection.uri"),
              source_platform="mongodb",
              database_name=connector_manifest.config.get("database"),
-             topic_prefix=connector_manifest.config.get("topic_prefix"),
+             topic_prefix=connector_manifest.config.get("topic.prefix"),
              transforms=(
                  connector_manifest.config["transforms"].split(",")
                  if "transforms" in connector_manifest.config
@@ -406,7 +408,11 @@ class MongoSourceConnector(BaseConnector):
          lineages: List[KafkaConnectLineage] = list()
          parser = self.get_parser(self.connector_manifest)
          source_platform = parser.source_platform
-         topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+         topic_prefix = parser.topic_prefix or ""
+
+         # Escape topic_prefix to handle cases where it contains dots
+         # Some users configure topic.prefix like "my.mongodb" which breaks the regex
+         topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

          if not self.connector_manifest.topic_names:
              return lineages
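
A quick check (illustrative values, not from any real connector config) of why re.escape is needed when topic.prefix itself contains dots:

    import re

    topic_prefix = "my.mongodb"                       # hypothetical topic.prefix containing dots
    pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"
    print(re.search(pattern, "my.mongodb.shop.orders").groups())  # ('shop', 'orders')
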
@@ -429,6 +435,26 @@ class MongoSourceConnector(BaseConnector):

  @dataclass
  class DebeziumSourceConnector(BaseConnector):
+     # Debezium topic naming patterns by connector type
+     # - MySQL: {topic.prefix}.{database}.{table}
+     # - PostgreSQL: {topic.prefix}.{schema}.{table}
+     # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
+     # - Oracle: {topic.prefix}.{schema}.{table}
+     # - DB2: {topic.prefix}.{schema}.{table}
+     # - MongoDB: {topic.prefix}.{database}.{collection}
+     # - Vitess: {topic.prefix}.{keyspace}.{table}
+
+     # Note SQL Server allows for "database.names" (multiple databases) config,
+     # and so database is in the topic naming pattern.
+     # However, others have "database.dbname" which is a single database name. For these connectors,
+     # additional databases would require a different connector instance
+
+     # Connectors with 2-level container in pattern (database + schema)
+     # Others have either database XOR schema, but not both
+     DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
+         "io.debezium.connector.sqlserver.SqlServerConnector",
+     }
+
      @dataclass
      class DebeziumParser:
          source_platform: str
@@ -514,16 +540,45 @@ class DebeziumSourceConnector(BaseConnector):
          source_platform = parser.source_platform
          server_name = parser.server_name
          database_name = parser.database_name
-         topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+         # Escape server_name to handle cases where topic.prefix contains dots
+         # Some users configure topic.prefix like "my.server" which breaks the regex
+         server_name = server_name or ""
+         # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
+         topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

          if not self.connector_manifest.topic_names:
              return lineages

+         # Handle connectors with 2-level container (database + schema) in topic pattern
+         connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
+         maybe_duplicated_database_name = (
+             connector_class
+             in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
+         )
+
          for topic in self.connector_manifest.topic_names:
              found = re.search(re.compile(topic_naming_pattern), topic)
+             logger.debug(
+                 f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
+             )

              if found:
-                 table_name = get_dataset_name(database_name, found.group(2))
+                 # Extract the table part after server_name
+                 table_part = found.group(2)
+
+                 if (
+                     maybe_duplicated_database_name
+                     and database_name
+                     and table_part.startswith(f"{database_name}.")
+                 ):
+                     table_part = table_part[len(database_name) + 1 :]
+
+                 logger.debug(
+                     f"Extracted table part: '{table_part}' from topic '{topic}'"
+                 )
+                 # Apply database name to create final dataset name
+                 table_name = get_dataset_name(database_name, table_part)
+                 logger.debug(f"Final table name: '{table_name}'")

                  lineage = KafkaConnectLineage(
                      source_dataset=table_name,
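
Illustrative check (made-up server, database, and topic names) of the widened Debezium pattern, which now accepts both two-part (schema.table) and three-part (database.schema.table) suffixes after an escaped, possibly dotted prefix, plus the SQL Server-style database dedup:

    import re

    server_name = "my.server"                          # hypothetical topic.prefix
    database_name = "sales"                            # hypothetical database name
    pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

    table_part = re.search(pattern, "my.server.sales.dbo.orders").group(2)  # sales.dbo.orders
    # Drop the duplicated database prefix before building the dataset name
    if table_part.startswith(f"{database_name}."):
        table_part = table_part[len(database_name) + 1:]
    print(table_part)  # dbo.orders
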
@@ -21,9 +21,13 @@ from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
  )
  from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
  from datahub.metadata.schema_classes import (
+     CalendarIntervalClass,
      DatasetLineageTypeClass,
+     DatasetProfileClass,
+     DatasetUsageStatisticsClass,
      StatusClass,
      SubTypesClass,
+     TimeWindowSizeClass,
      UpstreamClass,
      UpstreamLineageClass,
  )
@@ -278,6 +282,10 @@ class DataHubMockDataSource(Source):

                  yield self._get_subtypes_aspect(table_name, i, j)

+                 yield self._get_profile_aspect(table_name)
+
+                 yield self._get_usage_aspect(table_name)
+
                  yield from self._generate_lineage_for_table(
                      table_name=table_name,
                      table_level=i,
@@ -381,5 +389,42 @@ class DataHubMockDataSource(Source):
          )
          return mcp.as_workunit()

+     def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
+         urn = make_dataset_urn(
+             platform="fake",
+             name=table,
+         )
+         mcp = MetadataChangeProposalWrapper(
+             entityUrn=urn,
+             entityType="dataset",
+             aspect=DatasetProfileClass(
+                 timestampMillis=0,
+                 rowCount=100,
+                 columnCount=10,
+                 sizeInBytes=1000,
+             ),
+         )
+         return mcp.as_workunit()
+
+     def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
+         urn = make_dataset_urn(
+             platform="fake",
+             name=table,
+         )
+         mcp = MetadataChangeProposalWrapper(
+             entityUrn=urn,
+             entityType="dataset",
+             aspect=DatasetUsageStatisticsClass(
+                 timestampMillis=0,
+                 eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
+                 uniqueUserCount=0,
+                 totalSqlQueries=0,
+                 topSqlQueries=[],
+                 userCounts=[],
+                 fieldCounts=[],
+             ),
+         )
+         return mcp.as_workunit()
+
      def get_report(self) -> SourceReport:
          return self.report
@@ -182,9 +182,10 @@ class RedshiftUsageExtractor:
          self.report.num_operational_stats_filtered = 0

          if self.config.include_operational_stats:
-             with self.report.new_stage(
-                 USAGE_EXTRACTION_OPERATIONAL_STATS
-             ), PerfTimer() as timer:
+             with (
+                 self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
+                 PerfTimer() as timer,
+             ):
                  # Generate operation aspect workunits
                  yield from self._gen_operation_aspect_workunits(
                      self.connection, all_tables
@@ -1,19 +1,21 @@
  import dataclasses
  from dataclasses import field as dataclass_field
- from typing import List

  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalSourceReport,
  )
+ from datahub.utilities.lossy_collections import LossyList


  @dataclasses.dataclass
  class DataLakeSourceReport(StaleEntityRemovalSourceReport):
      files_scanned = 0
-     filtered: List[str] = dataclass_field(default_factory=list)
+     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
+     number_of_files_filtered: int = 0

      def report_file_scanned(self) -> None:
          self.files_scanned += 1

      def report_file_dropped(self, file: str) -> None:
          self.filtered.append(file)
+         self.number_of_files_filtered += 1
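
The report now pairs the size-bounded LossyList sample with an explicit counter, since the sampled list alone no longer reflects the total number of filtered files. A usage sketch mirroring report_file_dropped (paths are made up; LossyList is the utility imported in the hunk above):

    from datahub.utilities.lossy_collections import LossyList

    filtered: LossyList[str] = LossyList()
    number_of_files_filtered = 0
    for path in ("s3://bucket/tmp/a.tmp", "s3://bucket/tmp/b.tmp"):
        filtered.append(path)          # keeps only a bounded sample for report output
        number_of_files_filtered += 1  # exact count tracked separately
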