acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc3__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/METADATA +2440 -2440
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/RECORD +33 -33
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         if not self._should_ingest_usage():
             return

-        self.report.
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            if self.report.edition == SnowflakeEdition.STANDARD.value:
+                logger.info(
+                    "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
+                )
+                return

-
+            logger.info("Checking usage date ranges")

-
+            self._check_usage_date_ranges()

-
-
-
-
-
-
+            # If permission error, execution returns from here
+            if (
+                self.report.min_access_history_time is None
+                or self.report.max_access_history_time is None
+            ):
+                return

-
-
+            # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
+            # Now, we report the usage as well as operation metadata even if user email is absent

-
-
-
-
-
-
-
-
-
-
-
-
-
+            if self.config.include_usage_stats:
+                yield from auto_empty_dataset_usage_statistics(
+                    self._get_workunits_internal(discovered_datasets),
+                    config=BaseTimeWindowConfig(
+                        start_time=self.start_time,
+                        end_time=self.end_time,
+                        bucket_duration=self.config.bucket_duration,
+                    ),
+                    dataset_urns={
+                        self.identifiers.gen_dataset_urn(dataset_identifier)
+                        for dataset_identifier in discovered_datasets
+                    },
+                )

-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            if self.config.include_operational_stats:
+                # Generate the operation workunits.
+                access_events = self._get_snowflake_history()
+                for event in access_events:
+                    yield from self._get_operation_aspect_work_unit(
+                        event, discovered_datasets
+                    )

-
-
-
-
-
-
+            if self.redundant_run_skip_handler:
+                # Update the checkpoint state for this run.
+                self.redundant_run_skip_handler.update_state(
+                    self.config.start_time,
+                    self.config.end_time,
+                    self.config.bucket_duration,
                 )

-        if self.redundant_run_skip_handler:
-            # Update the checkpoint state for this run.
-            self.redundant_run_skip_handler.update_state(
-                self.config.start_time,
-                self.config.end_time,
-                self.config.bucket_duration,
-            )
-
     def _get_workunits_internal(
         self, discovered_datasets: List[str]
     ) -> Iterable[MetadataWorkUnit]:

@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             )
             self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
             return
-        self.report.access_history_query_secs =
+        self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)

         for row in results:
             yield from self._process_snowflake_history_row(row)

@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
                 tz=timezone.utc
             )
-            self.report.access_history_range_query_secs =
-
+            self.report.access_history_range_query_secs = timer.elapsed_seconds(
+                digits=2
             )

     def _get_operation_aspect_work_unit(
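
The two hunks above switch the report timings to PerfTimer.elapsed_seconds(digits=2), the rounding keyword added in datahub/utilities/perf_timer.py in this release (see the file list). A minimal sketch of the pattern, assuming only the timer API shown in this diff; the report class and the timed work below are illustrative placeholders, not code from the package:

import time

from datahub.utilities.perf_timer import PerfTimer


class DemoReport:
    # Illustrative stand-in for a source report with a float timing field.
    access_history_query_secs: float = 0.0


report = DemoReport()
with PerfTimer() as timer:
    time.sleep(0.1)  # placeholder for the timed work, e.g. running the access-history query
    # digits=2 rounds the measured wall-clock seconds to two decimal places.
    report.access_history_query_secs = timer.elapsed_seconds(digits=2)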

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -480,8 +480,8 @@ class SnowflakeV2Source(
             identifiers=self.identifiers,
         )

-        self.report.
-
+        with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
+            yield from schema_extractor.get_workunits_internal()

         databases = schema_extractor.databases

@@ -513,47 +513,46 @@ class SnowflakeV2Source(
         discovered_datasets = discovered_tables + discovered_views

         if self.config.use_queries_v2:
-            self.report.
-
-
-            self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
-            schema_resolver = self.aggregator._schema_resolver
-
-            queries_extractor = SnowflakeQueriesExtractor(
-                connection=self.connection,
-                config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
-                    temporary_tables_pattern=self.config.temporary_tables_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_stats,
-                    include_operations=self.config.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    user_email_pattern=self.config.user_email_pattern,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
-                graph=self.ctx.graph,
-            )
+            with self.report.new_stage(f"*: {VIEW_PARSING}"):
+                yield from auto_workunit(self.aggregator.gen_metadata())

-
-
-
-
-
-
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                schema_resolver = self.aggregator._schema_resolver
+
+                queries_extractor = SnowflakeQueriesExtractor(
+                    connection=self.connection,
+                    config=SnowflakeQueriesExtractorConfig(
+                        window=self.config,
+                        temporary_tables_pattern=self.config.temporary_tables_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_stats,
+                        include_operations=self.config.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        user_email_pattern=self.config.user_email_pattern,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=schema_resolver,
+                    discovered_tables=discovered_datasets,
+                    graph=self.ctx.graph,
+                )
+
+                # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
+                # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
+                # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
+                queries_extractor.close()

         else:
             if self.lineage_extractor:
-                self.report.
-
-
-
-
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.lineage_extractor.add_time_based_lineage_to_aggregator(
+                        discovered_tables=discovered_tables,
+                        discovered_views=discovered_views,
+                    )

             # This would emit view and external table ddl lineage
             # as well as query lineage via lineage_extractor

datahub/ingestion/source/sql/teradata.py
@@ -878,7 +878,7 @@ ORDER by DataBaseName, TableName;

         urns = self.schema_resolver.get_urns()
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            self.report.
-
+            with self.report.new_stage("Audit log extraction"):
+                yield from self.get_audit_log_mcps(urns=urns)

         yield from self.builder.gen_workunits()

datahub/ingestion/source/tableau/tableau.py
@@ -118,6 +118,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,

@@ -170,6 +171,8 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn

 try:

@@ -643,12 +646,41 @@ class SiteIdContentUrl:


 @dataclass
-class TableauSourceReport(
+class TableauSourceReport(
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # timers
+    extract_usage_stats_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    populate_projects_registry_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_embedded_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_published_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_upstream_tables_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
     # lineage
     num_tables_with_upstream_lineage: int = 0
     num_upstream_table_lineage: int = 0
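
The timer fields added to TableauSourceReport above are per-site dictionaries built with dataclass_field(default_factory=TopKDict), using the TopKDict import added in the earlier hunk. A minimal sketch of the same pattern on a toy report; the class and field names here are illustrative, not part of the diff:

from dataclasses import dataclass, field as dataclass_field
from typing import Dict

from datahub.utilities.stats_collections import TopKDict


@dataclass
class DemoTimerReport:
    # Maps a site content URL to the seconds spent in that phase. TopKDict is a
    # bounded dict used throughout DataHub reports so that large maps stay small.
    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)


report = DemoTimerReport()
report.emit_workbooks_timer["my-site"] = 12.34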

@@ -660,6 +692,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
     logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+
     last_authenticated_at: Optional[datetime] = None

     num_expected_tableau_metadata_queries: int = 0

@@ -834,6 +867,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                 platform=self.platform,
             )
             yield from site_source.ingest_tableau_site()
+
         except MetadataQueryException as md_exception:
             self.report.failure(
                 title="Failed to Retrieve Tableau Metadata",

@@ -3489,33 +3523,87 @@ class TableauSiteSource:
         return {"permissions": json.dumps(groups)} if len(groups) > 0 else None

     def ingest_tableau_site(self):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(
+            f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}"
+        ):
+            # Initialise the dictionary to later look-up for chart and dashboard stat
+            if self.config.extract_usage_stats:
+                with PerfTimer() as timer:
+                    self._populate_usage_stat_registry()
+                    self.report.extract_usage_stats_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.permission_ingestion:
+                with PerfTimer() as timer:
+                    self._fetch_groups()
+                    self.report.fetch_groups_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            # Populate the map of database names and database hostnames to be used later to map
+            # databases to platform instances.
+            if self.config.database_hostname_to_platform_instance_map:
+                with PerfTimer() as timer:
+                    self._populate_database_server_hostname_map()
+                    self.report.populate_database_server_hostname_map_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            with PerfTimer() as timer:
+                self._populate_projects_registry()
+                self.report.populate_projects_registry_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.add_site_container:
+                yield from self.emit_site_container()
+            yield from self.emit_project_containers()
+
+            with PerfTimer() as timer:
+                yield from self.emit_workbooks()
+                self.report.emit_workbooks_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.sheet_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_sheets()
+                    self.report.emit_sheets_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.dashboard_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_dashboards()
+                    self.report.emit_dashboards_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.embedded_datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_embedded_datasources()
+                    self.report.emit_embedded_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_published_datasources()
+                    self.report.emit_published_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.custom_sql_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_custom_sql_datasources()
+                    self.report.emit_custom_sql_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.database_tables:
+                with PerfTimer() as timer:
+                    yield from self.emit_upstream_tables()
+                    self.report.emit_upstream_tables_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)

datahub/ingestion/source/unity/source.py
@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()

         if self.config.include_ownership:
-            self.report.
-
-
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.
-
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()

         yield from self.process_metastores()

         yield from self.get_view_lineage()

         if self.config.include_notebooks:
-            self.report.
-
-
-
-
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu

         if self.config.include_usage_statistics:
-            self.report.
-
-
-
-
-
-
-
-
-
-            )
-
-        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
                 )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()
+
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")

     def build_service_principal_map(self) -> None:
         try:

@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.schemas.dropped(schema.id)
                 continue

-            self.report.
-
-
+            with self.report.new_stage(f"Ingest schema {schema.id}"):
+                yield from self.gen_schema_containers(schema)
+                yield from self.process_tables(schema)

-
+            self.report.schemas.processed(schema.id)

     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):

datahub/ingestion/source_report/ingestion_stage.py
@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional

 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict

@@ -22,25 +22,29 @@ PROFILING = "Profiling"

 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)

-
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            self._timer = PerfTimer()
-
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None