acryl-datahub 0.15.0.2rc1__py3-none-any.whl → 0.15.0.2rc3__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/METADATA +2469 -2459
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/RECORD +34 -34
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/utilities/file_backed_collections.py +1 -1
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/usage.py
@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )
 
     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                for query_hash, count in entry.query_freq
-            ]
-            yield make_usage_workunit(
-                bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                resource=BigQueryTableRef.from_string_name(entry.resource),
-                query_count=entry.query_count,
-                query_freq=query_freq,
-                user_freq=entry.user_freq,
-                column_freq=entry.column_freq,
-                bucket_duration=self.config.bucket_duration,
-                resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                top_n_queries=self.config.usage.top_n_queries,
-                format_sql_queries=self.config.usage.format_sql_queries,
-                queries_character_limit=self.config.usage.queries_character_limit,
-            )
-            self.report.num_usage_workunits_emitted += 1
-        except Exception as e:
-            self.report.warning(
-                message="Unable to generate usage statistics workunit",
-                context=f"{entry.timestamp}, {entry.resource}",
-                exc=e,
-            )
 
     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:
@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.
-                        project_id
-                    )
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(
@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                     self.report_status(f"usage-extraction-{project_id}", False)
 
-            self.report.usage_extraction_sec[project_id] =
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )
 
     def _store_usage_event(
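The recurring change in this release replaces paired report_ingestion_stage_start(...) calls with a "with self.report.new_stage(...)" block, so a stage is closed and timed automatically when its block exits. The real implementation lives in datahub/ingestion/source_report/ingestion_stage.py and is not shown in this diff; the snippet below is only a minimal sketch of how such a context-manager-based stage reporter could work, using an invented class name (StageTimingReport) and invented field names.

import contextlib
import time
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageTimingReport:
    # Hypothetical sketch, not the DataHub implementation: records how long
    # each named stage took, keyed by stage name.
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextlib.contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # The stage is closed when the with-block exits, so callers no
            # longer need to start a sentinel "End" stage to stop the clock.
            self.ingestion_stage_durations[stage] = round(
                time.perf_counter() - start, 2
            )


report = StageTimingReport()
with report.new_stage("proj-1: Usage Extraction Ingestion"):
    pass  # run the extraction work for this stage here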
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )
 
     def generate_profile(
         self,
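The Cassandra profiler above fans per-table profiling out to a thread pool and yields workunits as futures complete. The following is a generic, self-contained illustration of that executor pattern using only the standard library; profile_table and profile_keyspace are invented stand-ins, not DataHub code.

from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from typing import Dict, Iterable, List


def profile_table(keyspace: str, table: str) -> List[str]:
    # Stand-in for the real per-table profiling work.
    return [f"profile:{keyspace}.{table}"]


def profile_keyspace(keyspace: str, tables: List[str]) -> Iterable[str]:
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future back to its table name so failures can be attributed.
        future_to_table: Dict[Future, str] = {
            executor.submit(profile_table, keyspace, t): t for t in tables
        }
        for future in as_completed(future_to_table):
            table = future_to_table[future]
            try:
                # Yield results in completion order, not submission order.
                yield from future.result()
            except Exception as exc:
                print(f"profiling failed for {keyspace}.{table}: {exc}")


if __name__ == "__main__":
    for workunit in profile_keyspace("ks1", ["users", "orders"]):
        print(workunit)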
@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport
|
|
|
54
54
|
else:
|
|
55
55
|
raise KeyError(f"Unknown entity {ent_type}.")
|
|
56
56
|
|
|
57
|
-
def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
|
|
58
|
-
self.report_ingestion_stage_start(f"{keyspace}: {stage}")
|
|
59
|
-
|
|
60
57
|
# TODO Need to create seperate common config for profiling report
|
|
61
58
|
profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
|
|
62
59
|
profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
|
|
datahub/ingestion/source/datahub/config.py
@@ -108,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
datahub/ingestion/source/datahub/datahub_source.py
@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
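The new drop_duplicate_schema_fields option wires auto_fix_duplicate_schema_field_paths into the workunit-processor list only when the flag is set; otherwise a None entry stands in for "no extra processor". The helper's exact behavior is not part of this diff, so the sketch below is a rough, hypothetical illustration of what dropping duplicate schema fields by fieldPath could look like; drop_duplicate_field_paths is an invented name.

from typing import List, Set

from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass


def drop_duplicate_field_paths(
    schema_metadata: SchemaMetadataClass,
) -> SchemaMetadataClass:
    # Hypothetical sketch: keep the first occurrence of each fieldPath and
    # drop later duplicates, mirroring what server-side duplicate checking
    # would otherwise reject.
    seen: Set[str] = set()
    deduped: List[SchemaFieldClass] = []
    for schema_field in schema_metadata.fields:
        if schema_field.fieldPath not in seen:
            seen.add(schema_field.fieldPath)
            deduped.append(schema_field)
    schema_metadata.fields = deduped
    return schema_metadata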
datahub/ingestion/source/dremio/dremio_source.py
@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
datahub/ingestion/source/gc/datahub_gc.py
@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
datahub/ingestion/source/looker/looker_config.py
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
 
     folder_path_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Allow or deny dashboards from specific folders. "
+        description="Allow or deny dashboards from specific folders using their fully qualified paths. "
         "For example: \n"
         "deny: \n"
-        " -
-        "This pattern will deny the ingestion of all dashboards and looks within the
+        " - Shared/deprecated \n"
+        "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+        "allow: \n"
+        " - Shared/sales \n"
+        "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+        "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+        "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
         "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
     )
 
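The expanded folder_path_pattern docstring describes fully qualified Looker folder paths. Assuming the field uses DataHub's standard regex-based AllowDenyPattern (as its type annotation suggests), a programmatic example might look like the following; the specific allow/deny values are illustrative only.

from datahub.configuration.common import AllowDenyPattern

# Paths are the Looker folder hierarchy joined with slashes, e.g.
# "Shared/Customer Reports/Sales".
folder_path_pattern = AllowDenyPattern(
    allow=["Shared/sales.*"],
    deny=["Shared/deprecated.*"],
)

# Only dashboards under Shared/sales pass; Shared/deprecated is rejected.
assert folder_path_pattern.allowed("Shared/sales/EMEA")
assert not folder_path_pattern.allowed("Shared/deprecated/old_dashboard")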
datahub/ingestion/source/redshift/redshift.py
@@ -423,10 +423,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
-        self.report.
+        with self.report.new_stage(METADATA_EXTRACTION):
+            self.db_tables[database] = defaultdict()
+            self.db_views[database] = defaultdict()
+            self.db_schemas.setdefault(database, {})
 
         # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
         # this fallback. For now, this gets us broad coverage quickly.
@@ -462,12 +462,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                     self.process_schemas(connection, database)
                 )
 
-                self.report.
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage_v2(
+                        connection=connection,
+                        database=database,
+                        lineage_extractor=lineage_extractor,
+                    )
 
                 all_tables = self.get_all_tables()
             else:
@@ -480,25 +480,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 or self.config.include_view_lineage
                 or self.config.include_copy_lineage
             ):
-                self.report.
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage(
+                        connection=connection, all_tables=all_tables, database=database
+                    )
 
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
         if self.config.include_usage_statistics:
+            with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+                yield from self.extract_usage(
+                    connection=connection, all_tables=all_tables, database=database
+                )
 
         if self.config.is_profiling_enabled():
-            self.report.
+            with self.report.new_stage(PROFILING):
+                profiler = RedshiftProfiler(
+                    config=self.config,
+                    report=self.report,
+                    state_handler=self.profiling_state_handler,
+                )
+                yield from profiler.get_workunits(self.db_tables)
 
     def process_schemas(self, connection, database):
         for schema in self.data_dictionary.get_schemas(
@@ -633,8 +633,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 logger.info("View processing disabled, skipping")
 
-            self.report.metadata_extraction_sec[report_key] =
+            self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+                digits=2
             )
 
     def _process_table(
@@ -986,9 +986,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
             yield from usage_extractor.get_usage_workunits(all_tables=all_tables)
 
-            self.report.usage_extraction_sec[database] =
-                timer.elapsed_seconds(), 2
-            )
+            self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
 
     def extract_lineage(
         self,
@@ -1011,8 +1009,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             database=database, connection=connection, all_tables=all_tables
         )
 
-        self.report.lineage_extraction_sec[f"{database}"] =
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
        )
        yield from self.generate_lineage(
            database, lineage_extractor=lineage_extractor
@@ -1042,8 +1040,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         yield from lineage_extractor.generate()
 
-        self.report.lineage_extraction_sec[f"{database}"] =
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
         )
 
         if self.redundant_lineage_run_skip_handler:
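Several call sites above also switch from wrapping timer.elapsed_seconds() in a separate round(..., 2) to passing timer.elapsed_seconds(digits=2). The updated PerfTimer lives in datahub/utilities/perf_timer.py and is not shown in this diff; the class below is a minimal, hypothetical stand-in (SimplePerfTimer) written on the assumption that digits simply controls rounding of the elapsed time.

import time
from typing import Optional


class SimplePerfTimer:
    # Hypothetical sketch, not DataHub's PerfTimer: a context manager that
    # measures wall-clock time and rounds the result to `digits` places.
    def __init__(self) -> None:
        self._start: Optional[float] = None
        self._end: Optional[float] = None

    def __enter__(self) -> "SimplePerfTimer":
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc_info: object) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self, digits: int = 1) -> float:
        assert self._start is not None, "timer was never started"
        # If still running, report time elapsed so far.
        end = self._end if self._end is not None else time.perf_counter()
        return round(end - self._start, digits)


with SimplePerfTimer() as timer:
    time.sleep(0.05)
print(timer.elapsed_seconds(digits=2))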
datahub/ingestion/source/redshift/usage.py
@@ -182,38 +182,38 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            self.report.
+            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+                with PerfTimer() as timer:
+                    # Generate operation aspect workunits
+                    yield from self._gen_operation_aspect_workunits(
+                        self.connection, all_tables
+                    )
+                    self.report.operational_metadata_extraction_sec[
+                        self.config.database
+                    ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
-        self.report.
+        with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+            query: str = self.queries.usage_query(
+                start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                database=self.config.database,
+            )
+            access_events_iterable: Iterable[
+                RedshiftAccessEvent
+            ] = self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
 
+            aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+                access_events_iterable
+            )
+            # Generate usage workunits from aggregated events.
+            for time_bucket in aggregated_events.values():
+                for aggregate in time_bucket.values():
+                    wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+                    self.report.num_usage_workunits_emitted += 1
+                    yield wu
 
     def _gen_operation_aspect_workunits(
         self,
datahub/ingestion/source/snowflake/snowflake_report.py
@@ -166,6 +166,3 @@ class SnowflakeV2Report(
 
     def report_tag_processed(self, tag_name: str) -> None:
         self._processed_tags.add(tag_name)
-
-    def set_ingestion_stage(self, database: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{database}: {stage}")
datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -216,21 +216,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         try:
             for snowflake_db in self.databases:
-                self.report.
+                with self.report.new_stage(
+                    f"{snowflake_db.name}: {METADATA_EXTRACTION}"
+                ):
+                    yield from self._process_database(snowflake_db)
 
-            self.report.
+            with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
+                discovered_tables: List[str] = [
+                    self.identifiers.get_dataset_identifier(
+                        table_name, schema.name, db.name
+                    )
+                    for db in self.databases
+                    for schema in db.schemas
+                    for table_name in schema.tables
+                ]
+                if self.aggregator:
+                    for entry in self._external_tables_ddl_lineage(discovered_tables):
+                        self.aggregator.add(entry)
 
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
@@ -332,8 +334,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         yield from self._process_db_schemas(snowflake_db, db_tables)
 
         if self.profiler and db_tables:
-            self.report.
+            with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(snowflake_db, db_tables)
 
     def _process_db_schemas(
         self,