acryl-datahub 0.15.0.2rc1__py3-none-any.whl → 0.15.0.2rc3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (34)
  1. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/METADATA +2469 -2459
  2. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/RECORD +34 -34
  3. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
  6. datahub/cli/specific/structuredproperties_cli.py +84 -0
  7. datahub/ingestion/api/source.py +2 -0
  8. datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
  9. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
  11. datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
  12. datahub/ingestion/source/bigquery_v2/usage.py +57 -57
  13. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  14. datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
  15. datahub/ingestion/source/datahub/config.py +6 -0
  16. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  17. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  18. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  19. datahub/ingestion/source/gc/datahub_gc.py +10 -14
  20. datahub/ingestion/source/looker/looker_config.py +8 -3
  21. datahub/ingestion/source/redshift/redshift.py +32 -34
  22. datahub/ingestion/source/redshift/usage.py +29 -29
  23. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  24. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
  25. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
  26. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
  27. datahub/ingestion/source/sql/teradata.py +2 -2
  28. datahub/ingestion/source/tableau/tableau.py +119 -31
  29. datahub/ingestion/source/unity/source.py +71 -71
  30. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  31. datahub/utilities/file_backed_collections.py +1 -1
  32. datahub/utilities/perf_timer.py +11 -6
  33. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/entry_points.txt +0 -0
  34. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/top_level.txt +0 -0
@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
-        for audit_event in usage_state.standalone_events():
-            try:
-                operational_wu = self._create_operation_workunit(
-                    audit_event, table_refs
-                )
-                if operational_wu:
-                    yield operational_wu
-                    self.report.num_operational_stats_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate operation workunit",
-                    context=f"{audit_event}",
-                    exc=e,
-                )
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )
 
     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
-        top_n = (
-            self.config.usage.top_n_queries
-            if self.config.usage.include_top_n_queries
-            else 0
-        )
-        for entry in usage_state.usage_statistics(top_n=top_n):
-            try:
-                query_freq = [
-                    (
-                        self.uuid_to_query.get(
-                            query_hash, usage_state.queries[query_hash]
-                        ),
-                        count,
-                    )
-                    for query_hash, count in entry.query_freq
-                ]
-                yield make_usage_workunit(
-                    bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                    resource=BigQueryTableRef.from_string_name(entry.resource),
-                    query_count=entry.query_count,
-                    query_freq=query_freq,
-                    user_freq=entry.user_freq,
-                    column_freq=entry.column_freq,
-                    bucket_duration=self.config.bucket_duration,
-                    resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    format_sql_queries=self.config.usage.format_sql_queries,
-                    queries_character_limit=self.config.usage.queries_character_limit,
-                )
-                self.report.num_usage_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate usage statistics workunit",
-                    context=f"{entry.timestamp}, {entry.resource}",
-                    exc=e,
-                )
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
+                    )
 
     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:
@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
            with PerfTimer() as timer:
                try:
-                    self.report.set_ingestion_stage(
-                        project_id, USAGE_EXTRACTION_INGESTION
-                    )
-                    yield from self._get_parsed_bigquery_log_events(project_id)
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                except Exception as e:
                    self.report.usage_failed_extraction.append(project_id)
                    self.report.warning(
@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                    )
                    self.report_status(f"usage-extraction-{project_id}", False)
 
-            self.report.usage_extraction_sec[project_id] = round(
-                timer.elapsed_seconds(), 2
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
            )
 
    def _store_usage_event(
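
This release consistently replaces report.set_ingestion_stage(...) and report_ingestion_stage_start(...) calls with a `with self.report.new_stage(...):` block and indents the stage's work under it. The new_stage API comes from the IngestionStageReport changes in datahub/ingestion/source_report/ingestion_stage.py (+24 -20 above), whose hunk is not included in this diff, so the following is only a minimal sketch of how such a stage-timing context manager could behave, with illustrative names throughout:

# Minimal sketch of a stage-timing context manager in the spirit of
# IngestionStageReport.new_stage(); names and fields here are illustrative
# assumptions, not the actual DataHub implementation.
import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageTimingReport:
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # Duration is recorded when the block exits, even on error.
            self.ingestion_stage_durations[stage] = round(
                time.perf_counter() - start, 2
            )


report = StageTimingReport()
with report.new_stage("*: Usage Extraction Operational Stats"):
    ...  # emit workunits for this stage here
print(report.ingestion_stage_durations)

Because the duration is recorded in the finally block, a stage is closed even when its body raises, and no trailing sentinel stage is needed; that is why the datahub_gc.py hunk further down drops its explicit "End" stage.
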
@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.set_ingestion_stage(keyspace_name, PROFILING)
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(
-                        self.generate_profile,
-                        keyspace_name,
-                        table_name,
-                        cassandra_data.columns.get(table_name, []),
-                    ): table_name
-                    for table_name in tables
-                }
-                for future in as_completed(future_to_dataset):
-                    table_name = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[table_name] += 1
-                        self.report.failure(
-                            message="Failed to profile for table",
-                            context=f"{keyspace_name}.{table_name}",
-                            exc=exc,
-                        )
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )
 
     def generate_profile(
         self,
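
A behavioural detail of hunks like the one above: these methods are generators, so the body of `with self.report.new_stage(...):` runs lazily. The stage is entered when the first workunit is pulled and exited only once the generator is exhausted, so the recorded time also includes whatever the consumer does between pulls. The toy example below is plain Python, not DataHub code, and only demonstrates when the context manager enters and exits:

from contextlib import contextmanager
from typing import Iterator


@contextmanager
def new_stage(name: str) -> Iterator[None]:
    # Stand-in for report.new_stage(); prints instead of recording timings.
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")


def workunits() -> Iterator[int]:
    with new_stage("keyspace1: Profiling"):
        for i in range(3):
            yield i


gen = workunits()  # nothing printed yet: the generator body has not started
print(next(gen))   # prints "enter keyspace1: Profiling", then 0
print(list(gen))   # exhausts the generator: prints "exit keyspace1: Profiling", then [1, 2]
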
@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
 
-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
@@ -108,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
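
The auto_fix_duplicate_schema_field_paths processor imported above is only wired in when drop_duplicate_schema_fields is enabled; None entries in the processor list are skipped. Its implementation is not part of this diff, so the sketch below only illustrates the general idea of keeping the first occurrence of each fieldPath, using a stand-in field type rather than DataHub's generated schema classes:

# Illustrative sketch only: drop later occurrences of a repeated fieldPath,
# keeping the first one. SchemaField here is a stand-in, not the DataHub class.
from dataclasses import dataclass
from typing import List


@dataclass
class SchemaField:
    fieldPath: str
    nativeDataType: str


def drop_duplicate_field_paths(fields: List[SchemaField]) -> List[SchemaField]:
    seen = set()
    deduped: List[SchemaField] = []
    for f in fields:
        if f.fieldPath in seen:
            continue  # duplicate path: keep only the first definition
        seen.add(f.fieldPath)
        deduped.append(f)
    return deduped


fields = [
    SchemaField("id", "bigint"),
    SchemaField("name", "varchar"),
    SchemaField("id", "bigint"),  # duplicate that would trip server-side checks
]
assert [f.fieldPath for f in drop_duplicate_field_paths(fields)] == ["id", "name"]
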
@@ -45,6 +45,3 @@ class DremioSourceReport(
             self.views_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
-
-    def set_ingestion_stage(self, dataset: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{dataset}: {stage}")
@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING)
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.report_ingestion_stage_start("Expired Token Cleanup")
-                self.revoke_expired_tokens()
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.report_ingestion_stage_start("Truncate Indices")
-                self.truncate_indices()
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start(
-                    "Soft Deleted Entities Cleanup"
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Data Process Cleanup")
-                yield from self.dataprocess_cleanup.get_workunits_internal()
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Execution request Cleanup")
-                self.execution_request_cleanup.run()
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
 
     folder_path_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Allow or deny dashboards from specific folders. "
+        description="Allow or deny dashboards from specific folders using their fully qualified paths. "
        "For example: \n"
        "deny: \n"
-        "  - sales/deprecated \n"
-        "This pattern will deny the ingestion of all dashboards and looks within the sales/deprecated folder. \n"
+        "  - Shared/deprecated \n"
+        "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+        "allow: \n"
+        "  - Shared/sales \n"
+        "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+        "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+        "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
        "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
    )
 
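For context, folder_path_pattern is a standard AllowDenyPattern, so the documented paths are evaluated as regular expressions with deny rules taking precedence over allow rules. The folder names below are made-up examples that mirror the description above:

# Hedged example of the folder_path_pattern behavior described above, using
# DataHub's generic AllowDenyPattern; nothing here is specific to the Looker source.
from datahub.configuration.common import AllowDenyPattern

folder_path_pattern = AllowDenyPattern(
    allow=["Shared/sales.*"],      # only folders under Shared/sales
    deny=["Shared/deprecated.*"],  # never ingest deprecated content
)

# Fully qualified paths join the Looker folder hierarchy with "/",
# e.g. Shared -> Customer Reports -> Sales becomes "Shared/Customer Reports/Sales".
print(folder_path_pattern.allowed("Shared/sales/EMEA"))        # True
print(folder_path_pattern.allowed("Shared/deprecated/Q1"))     # False
print(folder_path_pattern.allowed("Shared/Customer Reports"))  # False (not under Shared/sales)
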
@@ -423,10 +423,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
-        self.report.report_ingestion_stage_start(METADATA_EXTRACTION)
-        self.db_tables[database] = defaultdict()
-        self.db_views[database] = defaultdict()
-        self.db_schemas.setdefault(database, {})
+        with self.report.new_stage(METADATA_EXTRACTION):
+            self.db_tables[database] = defaultdict()
+            self.db_views[database] = defaultdict()
+            self.db_schemas.setdefault(database, {})
 
         # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
         # this fallback. For now, this gets us broad coverage quickly.
@@ -462,12 +462,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                    self.process_schemas(connection, database)
                )
 
-                self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-                yield from self.extract_lineage_v2(
-                    connection=connection,
-                    database=database,
-                    lineage_extractor=lineage_extractor,
-                )
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage_v2(
+                        connection=connection,
+                        database=database,
+                        lineage_extractor=lineage_extractor,
+                    )
 
                all_tables = self.get_all_tables()
        else:
@@ -480,25 +480,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
            or self.config.include_view_lineage
            or self.config.include_copy_lineage
        ):
-            self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-            yield from self.extract_lineage(
-                connection=connection, all_tables=all_tables, database=database
-            )
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                yield from self.extract_lineage(
+                    connection=connection, all_tables=all_tables, database=database
+                )
 
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
        if self.config.include_usage_statistics:
-            yield from self.extract_usage(
-                connection=connection, all_tables=all_tables, database=database
-            )
+            with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+                yield from self.extract_usage(
+                    connection=connection, all_tables=all_tables, database=database
+                )
 
        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start(PROFILING)
-            profiler = RedshiftProfiler(
-                config=self.config,
-                report=self.report,
-                state_handler=self.profiling_state_handler,
-            )
-            yield from profiler.get_workunits(self.db_tables)
+            with self.report.new_stage(PROFILING):
+                profiler = RedshiftProfiler(
+                    config=self.config,
+                    report=self.report,
+                    state_handler=self.profiling_state_handler,
+                )
+                yield from profiler.get_workunits(self.db_tables)
 
    def process_schemas(self, connection, database):
        for schema in self.data_dictionary.get_schemas(
@@ -633,8 +633,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
        else:
            logger.info("View processing disabled, skipping")
 
-        self.report.metadata_extraction_sec[report_key] = round(
-            timer.elapsed_seconds(), 2
+        self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+            digits=2
        )
 
    def _process_table(
@@ -986,9 +986,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
            yield from usage_extractor.get_usage_workunits(all_tables=all_tables)
 
-            self.report.usage_extraction_sec[database] = round(
-                timer.elapsed_seconds(), 2
-            )
+            self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
 
    def extract_lineage(
        self,
@@ -1011,8 +1009,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                database=database, connection=connection, all_tables=all_tables
            )
 
-            self.report.lineage_extraction_sec[f"{database}"] = round(
-                timer.elapsed_seconds(), 2
+            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+                digits=2
            )
            yield from self.generate_lineage(
                database, lineage_extractor=lineage_extractor
@@ -1042,8 +1040,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
            yield from lineage_extractor.generate()
 
-            self.report.lineage_extraction_sec[f"{database}"] = round(
-                timer.elapsed_seconds(), 2
+            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+                digits=2
            )
 
            if self.redundant_lineage_run_skip_handler:
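
Several hunks in this file (and in the BigQuery and Redshift usage extractors) swap round(timer.elapsed_seconds(), 2) for timer.elapsed_seconds(digits=2). The digits parameter comes from the datahub/utilities/perf_timer.py change listed above (+11 -6), whose hunk is not shown here; the sketch below assumes it simply rounds inside the method:

# Minimal sketch of a PerfTimer-style context manager whose elapsed_seconds()
# rounds internally when given a `digits` argument. Treat the details as
# illustrative, not as the actual datahub.utilities.perf_timer implementation.
import time
from typing import Optional


class SketchPerfTimer:
    def __init__(self) -> None:
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None

    def __enter__(self) -> "SketchPerfTimer":
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.end_time = time.perf_counter()

    def elapsed_seconds(self, digits: Optional[int] = None) -> float:
        assert self.start_time is not None
        end = self.end_time if self.end_time is not None else time.perf_counter()
        elapsed = end - self.start_time
        return round(elapsed, digits) if digits is not None else elapsed


with SketchPerfTimer() as timer:
    time.sleep(0.01)
print(timer.elapsed_seconds(digits=2))  # e.g. 0.01
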
@@ -182,38 +182,38 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS)
-            with PerfTimer() as timer:
-                # Generate operation aspect workunits
-                yield from self._gen_operation_aspect_workunits(
-                    self.connection, all_tables
-                )
-                self.report.operational_metadata_extraction_sec[
-                    self.config.database
-                ] = round(timer.elapsed_seconds(), 2)
+            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+                with PerfTimer() as timer:
+                    # Generate operation aspect workunits
+                    yield from self._gen_operation_aspect_workunits(
+                        self.connection, all_tables
+                    )
+                    self.report.operational_metadata_extraction_sec[
+                        self.config.database
+                    ] = timer.elapsed_seconds(digits=2)
 
        # Generate aggregate events
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION)
-        query: str = self.queries.usage_query(
-            start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            database=self.config.database,
-        )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection=self.connection, all_tables=all_tables
-        )
+        with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+            query: str = self.queries.usage_query(
+                start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                database=self.config.database,
+            )
+            access_events_iterable: Iterable[
+                RedshiftAccessEvent
+            ] = self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
 
-        aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
-            access_events_iterable
-        )
-        # Generate usage workunits from aggregated events.
-        for time_bucket in aggregated_events.values():
-            for aggregate in time_bucket.values():
-                wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
-                self.report.num_usage_workunits_emitted += 1
-                yield wu
+            aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+                access_events_iterable
+            )
+            # Generate usage workunits from aggregated events.
+            for time_bucket in aggregated_events.values():
+                for aggregate in time_bucket.values():
+                    wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+                    self.report.num_usage_workunits_emitted += 1
+                    yield wu
 
    def _gen_operation_aspect_workunits(
        self,
@@ -166,6 +166,3 @@ class SnowflakeV2Report(
 
     def report_tag_processed(self, tag_name: str) -> None:
         self._processed_tags.add(tag_name)
-
-    def set_ingestion_stage(self, database: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{database}: {stage}")
@@ -216,21 +216,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         try:
             for snowflake_db in self.databases:
-                self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
-                yield from self._process_database(snowflake_db)
+                with self.report.new_stage(
+                    f"{snowflake_db.name}: {METADATA_EXTRACTION}"
+                ):
+                    yield from self._process_database(snowflake_db)
 
-            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
-            discovered_tables: List[str] = [
-                self.identifiers.get_dataset_identifier(
-                    table_name, schema.name, db.name
-                )
-                for db in self.databases
-                for schema in db.schemas
-                for table_name in schema.tables
-            ]
-            if self.aggregator:
-                for entry in self._external_tables_ddl_lineage(discovered_tables):
-                    self.aggregator.add(entry)
+            with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
+                discovered_tables: List[str] = [
+                    self.identifiers.get_dataset_identifier(
+                        table_name, schema.name, db.name
+                    )
+                    for db in self.databases
+                    for schema in db.schemas
+                    for table_name in schema.tables
+                ]
+                if self.aggregator:
+                    for entry in self._external_tables_ddl_lineage(discovered_tables):
+                        self.aggregator.add(entry)
 
        except SnowflakePermissionError as e:
            self.structured_reporter.failure(
@@ -332,8 +334,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
        yield from self._process_db_schemas(snowflake_db, db_tables)
 
        if self.profiler and db_tables:
-            self.report.set_ingestion_stage(snowflake_db.name, PROFILING)
-            yield from self.profiler.get_workunits(snowflake_db, db_tables)
+            with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(snowflake_db, db_tables)
 
    def _process_db_schemas(
        self,