acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (44)
  1. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2333 -2333
  2. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +44 -44
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
  5. datahub/cli/specific/structuredproperties_cli.py +84 -0
  6. datahub/ingestion/api/source.py +2 -0
  7. datahub/ingestion/graph/client.py +4 -2
  8. datahub/ingestion/source/aws/glue.py +14 -1
  9. datahub/ingestion/source/aws/s3_util.py +24 -1
  10. datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
  11. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  12. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
  13. datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
  14. datahub/ingestion/source/bigquery_v2/usage.py +57 -57
  15. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  16. datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
  17. datahub/ingestion/source/datahub/config.py +6 -0
  18. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  19. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  20. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  21. datahub/ingestion/source/gc/datahub_gc.py +10 -14
  22. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  23. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
  24. datahub/ingestion/source/looker/looker_config.py +8 -3
  25. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  26. datahub/ingestion/source/redshift/redshift.py +32 -34
  27. datahub/ingestion/source/redshift/usage.py +29 -29
  28. datahub/ingestion/source/s3/source.py +10 -14
  29. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  30. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
  31. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
  32. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
  33. datahub/ingestion/source/sql/teradata.py +2 -2
  34. datahub/ingestion/source/tableau/tableau.py +119 -31
  35. datahub/ingestion/source/unity/source.py +71 -71
  36. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  37. datahub/metadata/_schema_classes.py +2 -2
  38. datahub/metadata/_urns/urn_defs.py +15 -15
  39. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  40. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  41. datahub/utilities/perf_timer.py +11 -6
  42. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
  43. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
  44. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/redshift/redshift.py

@@ -423,10 +423,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
-        self.report.report_ingestion_stage_start(METADATA_EXTRACTION)
-        self.db_tables[database] = defaultdict()
-        self.db_views[database] = defaultdict()
-        self.db_schemas.setdefault(database, {})
+        with self.report.new_stage(METADATA_EXTRACTION):
+            self.db_tables[database] = defaultdict()
+            self.db_views[database] = defaultdict()
+            self.db_schemas.setdefault(database, {})
 
         # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
         # this fallback. For now, this gets us broad coverage quickly.
@@ -462,12 +462,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                     self.process_schemas(connection, database)
                 )
 
-                self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-                yield from self.extract_lineage_v2(
-                    connection=connection,
-                    database=database,
-                    lineage_extractor=lineage_extractor,
-                )
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage_v2(
+                        connection=connection,
+                        database=database,
+                        lineage_extractor=lineage_extractor,
+                    )
 
                 all_tables = self.get_all_tables()
         else:
@@ -480,25 +480,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 or self.config.include_view_lineage
                 or self.config.include_copy_lineage
             ):
-                self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-                yield from self.extract_lineage(
-                    connection=connection, all_tables=all_tables, database=database
-                )
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage(
+                        connection=connection, all_tables=all_tables, database=database
+                    )
 
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
         if self.config.include_usage_statistics:
-            yield from self.extract_usage(
-                connection=connection, all_tables=all_tables, database=database
-            )
+            with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+                yield from self.extract_usage(
+                    connection=connection, all_tables=all_tables, database=database
+                )
 
         if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start(PROFILING)
-            profiler = RedshiftProfiler(
-                config=self.config,
-                report=self.report,
-                state_handler=self.profiling_state_handler,
-            )
-            yield from profiler.get_workunits(self.db_tables)
+            with self.report.new_stage(PROFILING):
+                profiler = RedshiftProfiler(
+                    config=self.config,
+                    report=self.report,
+                    state_handler=self.profiling_state_handler,
+                )
+                yield from profiler.get_workunits(self.db_tables)
 
     def process_schemas(self, connection, database):
         for schema in self.data_dictionary.get_schemas(
@@ -633,8 +633,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         else:
             logger.info("View processing disabled, skipping")
 
-        self.report.metadata_extraction_sec[report_key] = round(
-            timer.elapsed_seconds(), 2
+        self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+            digits=2
         )
 
     def _process_table(
@@ -986,9 +986,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         yield from usage_extractor.get_usage_workunits(all_tables=all_tables)
 
-        self.report.usage_extraction_sec[database] = round(
-            timer.elapsed_seconds(), 2
-        )
+        self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
 
     def extract_lineage(
         self,
@@ -1011,8 +1009,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             database=database, connection=connection, all_tables=all_tables
         )
 
-        self.report.lineage_extraction_sec[f"{database}"] = round(
-            timer.elapsed_seconds(), 2
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
        )
         yield from self.generate_lineage(
             database, lineage_extractor=lineage_extractor
@@ -1042,8 +1040,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         yield from lineage_extractor.generate()
 
-        self.report.lineage_extraction_sec[f"{database}"] = round(
-            timer.elapsed_seconds(), 2
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
        )
 
         if self.redundant_lineage_run_skip_handler:
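
The recurring change in this file, and in the Snowflake and Teradata sources below, replaces self.report.report_ingestion_stage_start(...) calls with with self.report.new_stage(...): blocks, which scope each stage to a block instead of only marking where it starts. The real context manager lives in datahub/ingestion/source_report/ingestion_stage.py (+24 -20 above) and is not shown in this diff; the following is only a minimal sketch of the pattern, with a hypothetical report class and field name, not the actual implementation.

import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageReportSketch:
    # Hypothetical field: elapsed seconds per stage label.
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        # Record how long the block ran, even if it raises or returns early.
        start = time.perf_counter()
        try:
            yield
        finally:
            self.ingestion_stage_durations[stage] = round(
                time.perf_counter() - start, 2
            )


# Usage mirroring the call sites in this diff:
report = StageReportSketch()
with report.new_stage("Metadata Extraction"):
    pass  # extraction work would happen here
print(report.ingestion_stage_durations)

Unlike the old start-only calls, the block form marks both where a stage begins and where it ends.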

datahub/ingestion/source/redshift/usage.py

@@ -182,38 +182,38 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS)
-            with PerfTimer() as timer:
-                # Generate operation aspect workunits
-                yield from self._gen_operation_aspect_workunits(
-                    self.connection, all_tables
-                )
-                self.report.operational_metadata_extraction_sec[
-                    self.config.database
-                ] = round(timer.elapsed_seconds(), 2)
+            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+                with PerfTimer() as timer:
+                    # Generate operation aspect workunits
+                    yield from self._gen_operation_aspect_workunits(
+                        self.connection, all_tables
+                    )
+                    self.report.operational_metadata_extraction_sec[
+                        self.config.database
+                    ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION)
-        query: str = self.queries.usage_query(
-            start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            database=self.config.database,
-        )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection=self.connection, all_tables=all_tables
-        )
+        with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+            query: str = self.queries.usage_query(
+                start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                database=self.config.database,
+            )
+            access_events_iterable: Iterable[
+                RedshiftAccessEvent
+            ] = self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
 
-        aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
-            access_events_iterable
-        )
-        # Generate usage workunits from aggregated events.
-        for time_bucket in aggregated_events.values():
-            for aggregate in time_bucket.values():
-                wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
-                self.report.num_usage_workunits_emitted += 1
-                yield wu
+            aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+                access_events_iterable
+            )
+            # Generate usage workunits from aggregated events.
+            for time_bucket in aggregated_events.values():
+                for aggregate in time_bucket.values():
+                    wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+                    self.report.num_usage_workunits_emitted += 1
+                    yield wu
 
     def _gen_operation_aspect_workunits(
         self,
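
The other repeated change in the Redshift files moves rounding into the timer: round(timer.elapsed_seconds(), 2) becomes timer.elapsed_seconds(digits=2). The updated PerfTimer lives in datahub/utilities/perf_timer.py (+11 -6 above) and its body is not part of this diff; the following is a rough, self-contained sketch of what an elapsed_seconds(digits=...) signature could look like (the class name and default values are assumptions, not the shipped implementation).

import time
from typing import Optional


class PerfTimerSketch:
    """Illustrative stand-in for datahub.utilities.perf_timer.PerfTimer."""

    def __init__(self) -> None:
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None

    def __enter__(self) -> "PerfTimerSketch":
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.end_time = time.perf_counter()

    def elapsed_seconds(self, digits: Optional[int] = None) -> float:
        assert self.start_time is not None, "timer was never started"
        # If the timer is still running, measure against the current time.
        end = self.end_time if self.end_time is not None else time.perf_counter()
        elapsed = end - self.start_time
        # Rounding happens here, so call sites no longer wrap the result in round().
        return round(elapsed, digits) if digits is not None else elapsed


# Usage mirroring the call sites in this diff:
with PerfTimerSketch() as timer:
    time.sleep(0.01)
print(timer.elapsed_seconds(digits=2))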

datahub/ingestion/source/s3/source.py

@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket: Any,  # Todo: proper type
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """
@@ -857,22 +860,15 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (Any): The S3 bucket object.
+        bucket (Bucket): The S3 bucket object.
         prefix (str): The prefix path in the S3 bucket to list objects from.
 
         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-        for key, group in grouped_files:
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None
@@ -904,7 +900,7 @@ class S3Source(StatefulIngestionSourceBase):
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,
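
get_folder_info now delegates grouping to a new group_s3_objects_by_dirname helper imported from datahub/ingestion/source/aws/s3_util.py (+24 -1 above), whose body is not shown in this diff. Judging from the itertools.groupby call it replaces and the .items() call site, it returns a mapping from directory name to objects; the sketch below (hypothetical name suffix, simplified edge-case handling) shows one way such a helper could be written.

from collections import defaultdict
from types import SimpleNamespace
from typing import Dict, Iterable, List, Protocol


class HasKey(Protocol):
    key: str


def group_s3_objects_by_dirname_sketch(
    s3_objects: Iterable[HasKey],
) -> Dict[str, List[HasKey]]:
    """Group S3 objects by the directory portion of their key.

    Unlike itertools.groupby, this does not require the listing to be
    sorted by key first, which is why the call site no longer pre-sorts.
    """
    grouped: Dict[str, List[HasKey]] = defaultdict(list)
    for obj in s3_objects:
        # Keys without a "/" are grouped under the bucket root here; the real
        # helper may handle this case differently.
        dirname = obj.key.rsplit("/", 1)[0] if "/" in obj.key else "/"
        grouped[dirname].append(obj)
    return grouped


# Example with stand-in objects (boto3 ObjectSummary instances also expose .key):
objs = [SimpleNamespace(key="data/2024/01/a.parquet"), SimpleNamespace(key="data/2024/02/b.parquet")]
print({k: [o.key for o in v] for k, v in group_s3_objects_by_dirname_sketch(objs).items()})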

datahub/ingestion/source/snowflake/snowflake_report.py

@@ -166,6 +166,3 @@ class SnowflakeV2Report(
 
     def report_tag_processed(self, tag_name: str) -> None:
         self._processed_tags.add(tag_name)
-
-    def set_ingestion_stage(self, database: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{database}: {stage}")

datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -216,21 +216,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         try:
             for snowflake_db in self.databases:
-                self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
-                yield from self._process_database(snowflake_db)
+                with self.report.new_stage(
+                    f"{snowflake_db.name}: {METADATA_EXTRACTION}"
+                ):
+                    yield from self._process_database(snowflake_db)
 
-            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
-            discovered_tables: List[str] = [
-                self.identifiers.get_dataset_identifier(
-                    table_name, schema.name, db.name
-                )
-                for db in self.databases
-                for schema in db.schemas
-                for table_name in schema.tables
-            ]
-            if self.aggregator:
-                for entry in self._external_tables_ddl_lineage(discovered_tables):
-                    self.aggregator.add(entry)
+            with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
+                discovered_tables: List[str] = [
+                    self.identifiers.get_dataset_identifier(
+                        table_name, schema.name, db.name
+                    )
+                    for db in self.databases
+                    for schema in db.schemas
+                    for table_name in schema.tables
+                ]
+                if self.aggregator:
+                    for entry in self._external_tables_ddl_lineage(discovered_tables):
+                        self.aggregator.add(entry)
 
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
@@ -332,8 +334,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         yield from self._process_db_schemas(snowflake_db, db_tables)
 
         if self.profiler and db_tables:
-            self.report.set_ingestion_stage(snowflake_db.name, PROFILING)
-            yield from self.profiler.get_workunits(snowflake_db, db_tables)
+            with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(snowflake_db, db_tables)
 
     def _process_db_schemas(
         self,

datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         if not self._should_ingest_usage():
             return
 
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
-        if self.report.edition == SnowflakeEdition.STANDARD.value:
-            logger.info(
-                "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
-            )
-            return
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            if self.report.edition == SnowflakeEdition.STANDARD.value:
+                logger.info(
+                    "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
+                )
+                return
 
-        logger.info("Checking usage date ranges")
+            logger.info("Checking usage date ranges")
 
-        self._check_usage_date_ranges()
+            self._check_usage_date_ranges()
 
-        # If permission error, execution returns from here
-        if (
-            self.report.min_access_history_time is None
-            or self.report.max_access_history_time is None
-        ):
-            return
+            # If permission error, execution returns from here
+            if (
+                self.report.min_access_history_time is None
+                or self.report.max_access_history_time is None
+            ):
+                return
 
-        # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
-        # Now, we report the usage as well as operation metadata even if user email is absent
+            # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
+            # Now, we report the usage as well as operation metadata even if user email is absent
 
-        if self.config.include_usage_stats:
-            yield from auto_empty_dataset_usage_statistics(
-                self._get_workunits_internal(discovered_datasets),
-                config=BaseTimeWindowConfig(
-                    start_time=self.start_time,
-                    end_time=self.end_time,
-                    bucket_duration=self.config.bucket_duration,
-                ),
-                dataset_urns={
-                    self.identifiers.gen_dataset_urn(dataset_identifier)
-                    for dataset_identifier in discovered_datasets
-                },
-            )
+            if self.config.include_usage_stats:
+                yield from auto_empty_dataset_usage_statistics(
+                    self._get_workunits_internal(discovered_datasets),
+                    config=BaseTimeWindowConfig(
+                        start_time=self.start_time,
+                        end_time=self.end_time,
+                        bucket_duration=self.config.bucket_duration,
+                    ),
+                    dataset_urns={
+                        self.identifiers.gen_dataset_urn(dataset_identifier)
+                        for dataset_identifier in discovered_datasets
+                    },
+                )
 
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            if self.config.include_operational_stats:
+                # Generate the operation workunits.
+                access_events = self._get_snowflake_history()
+                for event in access_events:
+                    yield from self._get_operation_aspect_work_unit(
+                        event, discovered_datasets
+                    )
 
-        if self.config.include_operational_stats:
-            # Generate the operation workunits.
-            access_events = self._get_snowflake_history()
-            for event in access_events:
-                yield from self._get_operation_aspect_work_unit(
-                    event, discovered_datasets
+            if self.redundant_run_skip_handler:
+                # Update the checkpoint state for this run.
+                self.redundant_run_skip_handler.update_state(
+                    self.config.start_time,
+                    self.config.end_time,
+                    self.config.bucket_duration,
                 )
 
-        if self.redundant_run_skip_handler:
-            # Update the checkpoint state for this run.
-            self.redundant_run_skip_handler.update_state(
-                self.config.start_time,
-                self.config.end_time,
-                self.config.bucket_duration,
-            )
-
     def _get_workunits_internal(
         self, discovered_datasets: List[str]
     ) -> Iterable[MetadataWorkUnit]:
@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             )
             self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
             return
-        self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)
+        self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)
 
         for row in results:
             yield from self._process_snowflake_history_row(row)
@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
            self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
                tz=timezone.utc
            )
-            self.report.access_history_range_query_secs = round(
-                timer.elapsed_seconds(), 2
+            self.report.access_history_range_query_secs = timer.elapsed_seconds(
+                digits=2
            )
 
     def _get_operation_aspect_work_unit(
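
One behavioral detail in the first hunk above: the early return for Standard Edition accounts now sits inside the with self.report.new_stage(...) block. Because the surrounding method is a generator (it contains yield from), that return still runs the context manager's exit handling as the generator frame unwinds, so the stage is closed out cleanly. A small standalone demo (not DataHub code) of that interaction:

from contextlib import contextmanager
from typing import Iterator


@contextmanager
def demo_stage(name: str) -> Iterator[None]:
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")


def workunits(unsupported_edition: bool) -> Iterator[str]:
    with demo_stage("usage aggregation"):
        if unsupported_edition:
            # Early return inside the block: the finally clause above still runs
            # when the generator finishes.
            return
        yield "usage workunit"


print(list(workunits(unsupported_edition=True)))  # prints enter/exit, yields nothing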

datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -480,8 +480,8 @@ class SnowflakeV2Source(
             identifiers=self.identifiers,
         )
 
-        self.report.set_ingestion_stage("*", METADATA_EXTRACTION)
-        yield from schema_extractor.get_workunits_internal()
+        with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
+            yield from schema_extractor.get_workunits_internal()
 
         databases = schema_extractor.databases
 
@@ -513,47 +513,46 @@ class SnowflakeV2Source(
         discovered_datasets = discovered_tables + discovered_views
 
         if self.config.use_queries_v2:
-            self.report.set_ingestion_stage("*", VIEW_PARSING)
-            yield from auto_workunit(self.aggregator.gen_metadata())
-
-            self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
-            schema_resolver = self.aggregator._schema_resolver
-
-            queries_extractor = SnowflakeQueriesExtractor(
-                connection=self.connection,
-                config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
-                    temporary_tables_pattern=self.config.temporary_tables_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_stats,
-                    include_operations=self.config.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    user_email_pattern=self.config.user_email_pattern,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
-                graph=self.ctx.graph,
-            )
+            with self.report.new_stage(f"*: {VIEW_PARSING}"):
+                yield from auto_workunit(self.aggregator.gen_metadata())
 
-            # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
-            # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
-            # it should be pretty straightforward to refactor this and only initialize the aggregator once.
-            self.report.queries_extractor = queries_extractor.report
-            yield from queries_extractor.get_workunits_internal()
-            queries_extractor.close()
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                schema_resolver = self.aggregator._schema_resolver
+
+                queries_extractor = SnowflakeQueriesExtractor(
+                    connection=self.connection,
+                    config=SnowflakeQueriesExtractorConfig(
+                        window=self.config,
+                        temporary_tables_pattern=self.config.temporary_tables_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_stats,
+                        include_operations=self.config.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        user_email_pattern=self.config.user_email_pattern,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=schema_resolver,
+                    discovered_tables=discovered_datasets,
+                    graph=self.ctx.graph,
+                )
+
+                # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
+                # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
+                # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
+                queries_extractor.close()
 
         else:
             if self.lineage_extractor:
-                self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
-                self.lineage_extractor.add_time_based_lineage_to_aggregator(
-                    discovered_tables=discovered_tables,
-                    discovered_views=discovered_views,
-                )
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.lineage_extractor.add_time_based_lineage_to_aggregator(
+                        discovered_tables=discovered_tables,
+                        discovered_views=discovered_views,
+                    )
 
         # This would emit view and external table ddl lineage
         # as well as query lineage via lineage_extractor

datahub/ingestion/source/sql/teradata.py

@@ -878,7 +878,7 @@ ORDER by DataBaseName, TableName;
 
         urns = self.schema_resolver.get_urns()
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("audit log extraction")
-            yield from self.get_audit_log_mcps(urns=urns)
+            with self.report.new_stage("Audit log extraction"):
+                yield from self.get_audit_log_mcps(urns=urns)
 
         yield from self.builder.gen_workunits()