acryl-datahub 0.15.0.2rc1__py3-none-any.whl → 0.15.0.2rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (34)
  1. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/METADATA +2469 -2459
  2. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/RECORD +34 -34
  3. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
  6. datahub/cli/specific/structuredproperties_cli.py +84 -0
  7. datahub/ingestion/api/source.py +2 -0
  8. datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
  9. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
  11. datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
  12. datahub/ingestion/source/bigquery_v2/usage.py +57 -57
  13. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  14. datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
  15. datahub/ingestion/source/datahub/config.py +6 -0
  16. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  17. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  18. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  19. datahub/ingestion/source/gc/datahub_gc.py +10 -14
  20. datahub/ingestion/source/looker/looker_config.py +8 -3
  21. datahub/ingestion/source/redshift/redshift.py +32 -34
  22. datahub/ingestion/source/redshift/usage.py +29 -29
  23. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  24. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
  25. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
  26. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
  27. datahub/ingestion/source/sql/teradata.py +2 -2
  28. datahub/ingestion/source/tableau/tableau.py +119 -31
  29. datahub/ingestion/source/unity/source.py +71 -71
  30. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  31. datahub/utilities/file_backed_collections.py +1 -1
  32. datahub/utilities/perf_timer.py +11 -6
  33. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/entry_points.txt +0 -0
  34. {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/top_level.txt +0 -0
@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         if not self._should_ingest_usage():
             return
 
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
-        if self.report.edition == SnowflakeEdition.STANDARD.value:
-            logger.info(
-                "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
-            )
-            return
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            if self.report.edition == SnowflakeEdition.STANDARD.value:
+                logger.info(
+                    "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
+                )
+                return
 
-        logger.info("Checking usage date ranges")
+            logger.info("Checking usage date ranges")
 
-        self._check_usage_date_ranges()
+            self._check_usage_date_ranges()
 
-        # If permission error, execution returns from here
-        if (
-            self.report.min_access_history_time is None
-            or self.report.max_access_history_time is None
-        ):
-            return
+            # If permission error, execution returns from here
+            if (
+                self.report.min_access_history_time is None
+                or self.report.max_access_history_time is None
+            ):
+                return
 
-        # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
-        # Now, we report the usage as well as operation metadata even if user email is absent
+            # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
+            # Now, we report the usage as well as operation metadata even if user email is absent
 
-        if self.config.include_usage_stats:
-            yield from auto_empty_dataset_usage_statistics(
-                self._get_workunits_internal(discovered_datasets),
-                config=BaseTimeWindowConfig(
-                    start_time=self.start_time,
-                    end_time=self.end_time,
-                    bucket_duration=self.config.bucket_duration,
-                ),
-                dataset_urns={
-                    self.identifiers.gen_dataset_urn(dataset_identifier)
-                    for dataset_identifier in discovered_datasets
-                },
-            )
+            if self.config.include_usage_stats:
+                yield from auto_empty_dataset_usage_statistics(
+                    self._get_workunits_internal(discovered_datasets),
+                    config=BaseTimeWindowConfig(
+                        start_time=self.start_time,
+                        end_time=self.end_time,
+                        bucket_duration=self.config.bucket_duration,
+                    ),
+                    dataset_urns={
+                        self.identifiers.gen_dataset_urn(dataset_identifier)
+                        for dataset_identifier in discovered_datasets
+                    },
+                )
 
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            if self.config.include_operational_stats:
+                # Generate the operation workunits.
+                access_events = self._get_snowflake_history()
+                for event in access_events:
+                    yield from self._get_operation_aspect_work_unit(
+                        event, discovered_datasets
+                    )
 
-        if self.config.include_operational_stats:
-            # Generate the operation workunits.
-            access_events = self._get_snowflake_history()
-            for event in access_events:
-                yield from self._get_operation_aspect_work_unit(
-                    event, discovered_datasets
+            if self.redundant_run_skip_handler:
+                # Update the checkpoint state for this run.
+                self.redundant_run_skip_handler.update_state(
+                    self.config.start_time,
+                    self.config.end_time,
+                    self.config.bucket_duration,
                 )
 
-        if self.redundant_run_skip_handler:
-            # Update the checkpoint state for this run.
-            self.redundant_run_skip_handler.update_state(
-                self.config.start_time,
-                self.config.end_time,
-                self.config.bucket_duration,
-            )
-
     def _get_workunits_internal(
         self, discovered_datasets: List[str]
     ) -> Iterable[MetadataWorkUnit]:
@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                 )
                 self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
                 return
-            self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)
+            self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)
 
         for row in results:
             yield from self._process_snowflake_history_row(row)
@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                 self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
                     tz=timezone.utc
                 )
-            self.report.access_history_range_query_secs = round(
-                timer.elapsed_seconds(), 2
+            self.report.access_history_range_query_secs = timer.elapsed_seconds(
+                digits=2
             )
 
     def _get_operation_aspect_work_unit(
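The two call sites above swap `round(timer.elapsed_seconds(), 2)` for `timer.elapsed_seconds(digits=2)`, which lines up with the `datahub/utilities/perf_timer.py` change (+11 -6) listed at the top of this diff. A minimal sketch of the idea, assuming the new keyword simply moves the rounding inside the timer; the `_SketchTimer` below is an illustrative stand-in, not the real `PerfTimer`, which also supports pausing and reuse:

import time
from typing import Optional


class _SketchTimer:
    """Illustrative stand-in for datahub.utilities.perf_timer.PerfTimer."""

    def __enter__(self) -> "_SketchTimer":
        self._start = time.perf_counter()
        self._end: Optional[float] = None
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self, digits: Optional[int] = None) -> float:
        # While the timer is still running, measure against the current clock.
        end = self._end if self._end is not None else time.perf_counter()
        elapsed = end - self._start
        # The `digits` keyword folds in the round() that callers used to apply themselves.
        return round(elapsed, digits) if digits is not None else elapsed


with _SketchTimer() as timer:
    sum(range(1_000_000))
print(timer.elapsed_seconds(digits=2))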
@@ -480,8 +480,8 @@ class SnowflakeV2Source(
             identifiers=self.identifiers,
         )
 
-        self.report.set_ingestion_stage("*", METADATA_EXTRACTION)
-        yield from schema_extractor.get_workunits_internal()
+        with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
+            yield from schema_extractor.get_workunits_internal()
 
         databases = schema_extractor.databases
 
@@ -513,47 +513,46 @@ class SnowflakeV2Source(
         discovered_datasets = discovered_tables + discovered_views
 
         if self.config.use_queries_v2:
-            self.report.set_ingestion_stage("*", VIEW_PARSING)
-            yield from auto_workunit(self.aggregator.gen_metadata())
-
-            self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
-            schema_resolver = self.aggregator._schema_resolver
-
-            queries_extractor = SnowflakeQueriesExtractor(
-                connection=self.connection,
-                config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
-                    temporary_tables_pattern=self.config.temporary_tables_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_stats,
-                    include_operations=self.config.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    user_email_pattern=self.config.user_email_pattern,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
-                graph=self.ctx.graph,
-            )
+            with self.report.new_stage(f"*: {VIEW_PARSING}"):
+                yield from auto_workunit(self.aggregator.gen_metadata())
 
-            # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
-            # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
-            # it should be pretty straightforward to refactor this and only initialize the aggregator once.
-            self.report.queries_extractor = queries_extractor.report
-            yield from queries_extractor.get_workunits_internal()
-            queries_extractor.close()
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                schema_resolver = self.aggregator._schema_resolver
+
+                queries_extractor = SnowflakeQueriesExtractor(
+                    connection=self.connection,
+                    config=SnowflakeQueriesExtractorConfig(
+                        window=self.config,
+                        temporary_tables_pattern=self.config.temporary_tables_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_stats,
+                        include_operations=self.config.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        user_email_pattern=self.config.user_email_pattern,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=schema_resolver,
+                    discovered_tables=discovered_datasets,
+                    graph=self.ctx.graph,
+                )
+
+                # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
+                # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
+                # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
+                queries_extractor.close()
 
         else:
             if self.lineage_extractor:
-                self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
-                self.lineage_extractor.add_time_based_lineage_to_aggregator(
-                    discovered_tables=discovered_tables,
-                    discovered_views=discovered_views,
-                )
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.lineage_extractor.add_time_based_lineage_to_aggregator(
+                        discovered_tables=discovered_tables,
+                        discovered_views=discovered_views,
+                    )
 
             # This would emit view and external table ddl lineage
             # as well as query lineage via lineage_extractor
@@ -878,7 +878,7 @@ ORDER by DataBaseName, TableName;
 
         urns = self.schema_resolver.get_urns()
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("audit log extraction")
-            yield from self.get_audit_log_mcps(urns=urns)
+            with self.report.new_stage("Audit log extraction"):
+                yield from self.get_audit_log_mcps(urns=urns)
 
         yield from self.builder.gen_workunits()
@@ -118,6 +118,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -170,6 +171,8 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
 try:
@@ -643,12 +646,41 @@ class SiteIdContentUrl:
 
 
 @dataclass
-class TableauSourceReport(StaleEntityRemovalSourceReport):
+class TableauSourceReport(
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # timers
+    extract_usage_stats_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    populate_projects_registry_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_embedded_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_published_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_upstream_tables_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
     # lineage
     num_tables_with_upstream_lineage: int = 0
     num_upstream_table_lineage: int = 0
@@ -660,6 +692,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
     logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+
     last_authenticated_at: Optional[datetime] = None
 
     num_expected_tableau_metadata_queries: int = 0
@@ -834,6 +867,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                     platform=self.platform,
                 )
                 yield from site_source.ingest_tableau_site()
+
         except MetadataQueryException as md_exception:
             self.report.failure(
                 title="Failed to Retrieve Tableau Metadata",
@@ -3489,33 +3523,87 @@ class TableauSiteSource:
         return {"permissions": json.dumps(groups)} if len(groups) > 0 else None
 
     def ingest_tableau_site(self):
-        # Initialise the dictionary to later look-up for chart and dashboard stat
-        if self.config.extract_usage_stats:
-            self._populate_usage_stat_registry()
-
-        if self.config.permission_ingestion:
-            self._fetch_groups()
-
-        # Populate the map of database names and database hostnames to be used later to map
-        # databases to platform instances.
-        if self.config.database_hostname_to_platform_instance_map:
-            self._populate_database_server_hostname_map()
-
-        self._populate_projects_registry()
-
-        if self.config.add_site_container:
-            yield from self.emit_site_container()
-        yield from self.emit_project_containers()
-        yield from self.emit_workbooks()
-        if self.sheet_ids:
-            yield from self.emit_sheets()
-        if self.dashboard_ids:
-            yield from self.emit_dashboards()
-        if self.embedded_datasource_ids_being_used:
-            yield from self.emit_embedded_datasources()
-        if self.datasource_ids_being_used:
-            yield from self.emit_published_datasources()
-        if self.custom_sql_ids_being_used:
-            yield from self.emit_custom_sql_datasources()
-        if self.database_tables:
-            yield from self.emit_upstream_tables()
+        with self.report.new_stage(
+            f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}"
+        ):
+            # Initialise the dictionary to later look-up for chart and dashboard stat
+            if self.config.extract_usage_stats:
+                with PerfTimer() as timer:
+                    self._populate_usage_stat_registry()
+                    self.report.extract_usage_stats_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.permission_ingestion:
+                with PerfTimer() as timer:
+                    self._fetch_groups()
+                    self.report.fetch_groups_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            # Populate the map of database names and database hostnames to be used later to map
+            # databases to platform instances.
+            if self.config.database_hostname_to_platform_instance_map:
+                with PerfTimer() as timer:
+                    self._populate_database_server_hostname_map()
+                    self.report.populate_database_server_hostname_map_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            with PerfTimer() as timer:
+                self._populate_projects_registry()
+                self.report.populate_projects_registry_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.add_site_container:
+                yield from self.emit_site_container()
+            yield from self.emit_project_containers()
+
+            with PerfTimer() as timer:
+                yield from self.emit_workbooks()
+                self.report.emit_workbooks_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.sheet_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_sheets()
+                    self.report.emit_sheets_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.dashboard_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_dashboards()
+                    self.report.emit_dashboards_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.embedded_datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_embedded_datasources()
+                    self.report.emit_embedded_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_published_datasources()
+                    self.report.emit_published_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.custom_sql_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_custom_sql_datasources()
+                    self.report.emit_custom_sql_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.database_tables:
+                with PerfTimer() as timer:
+                    yield from self.emit_upstream_tables()
+                    self.report.emit_upstream_tables_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
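Each phase of `ingest_tableau_site` is now wrapped in a `PerfTimer`, and the elapsed time is recorded per site in the new `*_timer` report fields keyed by `site_content_url`. A condensed, runnable sketch of the pattern, using a plain dict and a stand-in timer instead of the real `TopKDict`/`PerfTimer`, and a hypothetical `emit_sheets` generator in place of the real emit methods:

import time
from typing import Dict, Iterable


class _SketchTimer:
    # Stand-in for datahub.utilities.perf_timer.PerfTimer (illustration only).
    def __enter__(self) -> "_SketchTimer":
        self._start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        pass

    def elapsed_seconds(self, digits: int = 4) -> float:
        return round(time.perf_counter() - self._start, digits)


emit_sheets_timer: Dict[str, float] = {}  # the real report field is a TopKDict


def emit_sheets() -> Iterable[str]:
    yield from ("sheet-1", "sheet-2")  # hypothetical workunits


def ingest_site(site_content_url: str) -> Iterable[str]:
    # Same shape as the diff: the duration is recorded inside the `with` block,
    # after `yield from`, so it is only written once the phase is fully consumed.
    with _SketchTimer() as timer:
        yield from emit_sheets()
        emit_sheets_timer[site_content_url] = timer.elapsed_seconds(digits=2)


list(ingest_site("my-site"))
print(emit_sheets_timer)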
@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.report_ingestion_stage_start("Ingestion Setup")
-        wait_on_warehouse = None
-        if self.config.include_hive_metastore:
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Can take several minutes, so start now and wait later
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
-                )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()
 
         if self.config.include_ownership:
-            self.report.report_ingestion_stage_start("Ingest service principals")
-            self.build_service_principal_map()
-            self.build_groups_map()
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Ingest notebooks")
-            yield from self.process_notebooks()
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()
 
         yield from self.process_metastores()
 
         yield from self.get_view_lineage()
 
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Notebook lineage")
-            for notebook in self.notebooks.values():
-                wu = self._gen_notebook_lineage(notebook)
-                if wu:
-                    yield wu
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu
 
         if self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("Ingest usage")
-            usage_extractor = UnityCatalogUsageExtractor(
-                config=self.config,
-                report=self.report,
-                proxy=self.unity_catalog_api_proxy,
-                table_urn_builder=self.gen_dataset_urn,
-                user_urn_builder=self.gen_user_urn,
-            )
-            yield from usage_extractor.get_usage_workunits(
-                self.table_refs | self.view_refs
-            )
-
-        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
                 )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
 
-            self.report.report_ingestion_stage_start("Profiling")
-            if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
-                yield from UnityCatalogAnalyzeProfiler(
-                    self.config.profiling,
-                    self.report,
-                    self.unity_catalog_api_proxy,
-                    self.gen_dataset_urn,
-                ).get_workunits(self.table_refs)
-            elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
-                yield from UnityCatalogGEProfiler(
-                    sql_common_config=self.config,
-                    profiling_config=self.config.profiling,
-                    report=self.report,
-                ).get_workunits(list(self.tables.values()))
-            else:
-                raise ValueError("Unknown profiling config method")
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()
+
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")
 
     def build_service_principal_map(self) -> None:
         try:
@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.schemas.dropped(schema.id)
                 continue
 
-            self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}")
-            yield from self.gen_schema_containers(schema)
-            yield from self.process_tables(schema)
+            with self.report.new_stage(f"Ingest schema {schema.id}"):
+                yield from self.gen_schema_containers(schema)
+                yield from self.process_tables(schema)
 
-            self.report.schemas.processed(schema.id)
+                self.report.schemas.processed(schema.id)
 
     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):
@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional
 
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -22,25 +22,29 @@ PROFILING = "Profiling"
 
 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
 
-    _timer: Optional[PerfTimer] = field(
-        default=None, init=False, repr=False, compare=False
-    )
-
-    def report_ingestion_stage_start(self, stage: str) -> None:
-        if self._timer:
-            elapsed = round(self._timer.elapsed_seconds(), 2)
-            logger.info(
-                f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds",
-                stacklevel=2,
-            )
-            if self.ingestion_stage:
-                self.ingestion_stage_durations[self.ingestion_stage] = elapsed
-        else:
-            self._timer = PerfTimer()
-
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None
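Taken together, the hunks above replace the stateful `report_ingestion_stage_start()` API with a context manager: `new_stage()` returns an `IngestionStageContext` that starts a `PerfTimer` on `__enter__` and records the rounded duration in `ingestion_stage_durations` on `__exit__`, so a stage is closed even if the wrapped block raises or returns early. A small usage sketch, assuming the 0.15.0.2rc3 wheel is installed; real sources mix `IngestionStageReport` into their own report dataclass rather than instantiating it directly:

import time

from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

report = IngestionStageReport()

# Each stage is a scoped block; its duration is measured and stored on exit,
# keyed as "<stage name> at <UTC start timestamp>".
with report.new_stage("Metadata extraction"):
    time.sleep(0.2)  # stand-in for real extraction work

with report.new_stage("Profiling"):
    time.sleep(0.1)

print(dict(report.ingestion_stage_durations))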