acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -642,8 +642,11 @@ class TableauUpstreamReference:
 
     @classmethod
     def create(
-        cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None
+        cls, d: Dict, default_schema_map: Optional[Dict[str, str]] = None
     ) -> "TableauUpstreamReference":
+        if d is None:
+            raise ValueError("TableauUpstreamReference.create: d is None")
+
         # Values directly from `table` object from Tableau
         database_dict = (
             d.get(c.DATABASE) or {}
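
The main behavioral change in this hunk is the explicit None guard. A minimal sketch of the effect, with a standalone function standing in for the classmethod (the simplified body and dictionary keys are illustrative, not taken from the module):

from typing import Dict, Optional


def create(d: Dict, default_schema_map: Optional[Dict[str, str]] = None) -> Dict:
    # Mirrors the added guard: reject a missing dict up front with a clear
    # error instead of failing later inside d.get(...).
    if d is None:
        raise ValueError("TableauUpstreamReference.create: d is None")
    database_dict = d.get("database") or {}
    return {"database": database_dict, "schema_map": default_schema_map or {}}


try:
    create(None)  # type: ignore[arg-type]
except ValueError as err:
    print(err)  # -> TableauUpstreamReference.create: d is None
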
@@ -717,7 +720,7 @@ class TableauUpstreamReference:
         # schema
 
         # TODO: Validate the startswith check. Currently required for our integration tests
-        if full_name is None or not full_name.startswith("["):
+        if full_name is None:
             return None
 
         return full_name.replace("[", "").replace("]", "").split(".")
@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )
 
     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field(  # type: ignore
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
 
     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None
@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
-            Tuple[TableReference, str]
-        ] = FileBackedDict()
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )
 
         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.report_ingestion_stage_start("Ingestion Setup")
-        wait_on_warehouse = None
-        if self.config.include_hive_metastore:
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Can take several minutes, so start now and wait later
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
-                )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()
 
         if self.config.include_ownership:
-            self.report.report_ingestion_stage_start("Ingest service principals")
-            self.build_service_principal_map()
-            self.build_groups_map()
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Ingest notebooks")
-            yield from self.process_notebooks()
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()
 
         yield from self.process_metastores()
 
         yield from self.get_view_lineage()
 
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Notebook lineage")
-            for notebook in self.notebooks.values():
-                wu = self._gen_notebook_lineage(notebook)
-                if wu:
-                    yield wu
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu
 
         if self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("Ingest usage")
-            usage_extractor = UnityCatalogUsageExtractor(
-                config=self.config,
-                report=self.report,
-                proxy=self.unity_catalog_api_proxy,
-                table_urn_builder=self.gen_dataset_urn,
-                user_urn_builder=self.gen_user_urn,
-            )
-            yield from usage_extractor.get_usage_workunits(
-                self.table_refs | self.view_refs
-            )
-
-        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
                 )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
 
-            self.report.report_ingestion_stage_start("Profiling")
-            if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
-                yield from UnityCatalogAnalyzeProfiler(
-                    self.config.profiling,
-                    self.report,
-                    self.unity_catalog_api_proxy,
-                    self.gen_dataset_urn,
-                ).get_workunits(self.table_refs)
-            elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
-                yield from UnityCatalogGEProfiler(
-                    sql_common_config=self.config,
-                    profiling_config=self.config.profiling,
-                    report=self.report,
-                ).get_workunits(list(self.tables.values()))
-            else:
-                raise ValueError("Unknown profiling config method")
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()
+
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")
 
     def build_service_principal_map(self) -> None:
         try:
@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.schemas.dropped(schema.id)
                 continue
 
-            self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}")
-            yield from self.gen_schema_containers(schema)
-            yield from self.process_tables(schema)
+            with self.report.new_stage(f"Ingest schema {schema.id}"):
+                yield from self.gen_schema_containers(schema)
+                yield from self.process_tables(schema)
 
-            self.report.schemas.processed(schema.id)
+                self.report.schemas.processed(schema.id)
 
     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):
@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                         query, table_info
                     )
                     for source_table in table_info.source_tables:
-                        with self.report.usage_perf_report.aggregator_add_event_timer:
+                        with (
+                            self.report.usage_perf_report.aggregator_add_event_timer
+                        ):
                             self.usage_aggregator.aggregate_event(
                                 resource=source_table,
                                 start_time=query.start_time,
@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
            floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
 
            resource = (
-                f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                f"{event.database}.{event.table}"
            )
 
@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
     ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[TrinoTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
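
The defaultdict reflow in the two usage hunks above is purely cosmetic, and the ClickHouse hunk also flips the f-string quoting (double quotes outside, single quotes inside) without changing the resulting resource string. A small equivalence check, using made-up values rather than a real source config:

platform_instance = "prod"
database, table = "default", "events"

# Old style: single-quoted f-string with double quotes inside the expression.
old = f'{platform_instance + "." if platform_instance else ""}' f"{database}.{table}"
# New style: double-quoted f-string with single quotes inside the expression.
new = f"{platform_instance + '.' if platform_instance else ''}" f"{database}.{table}"

assert old == new == "prod.default.events"
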
@@ -89,7 +89,7 @@ def make_usage_workunit(
    top_sql_queries: Optional[List[str]] = None
    if query_freq is not None:
        if top_n_queries < len(query_freq):
-            logger.warn(
+            logger.warning(
                f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
            )
            query_freq = query_freq[0:top_n_queries]
@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional
 
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -22,25 +22,29 @@ PROFILING = "Profiling"
 
 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
 
-    _timer: Optional[PerfTimer] = field(
-        default=None, init=False, repr=False, compare=False
-    )
-
-    def report_ingestion_stage_start(self, stage: str) -> None:
-        if self._timer:
-            elapsed = round(self._timer.elapsed_seconds(), 2)
-            logger.info(
-                f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds",
-                stacklevel=2,
-            )
-            if self.ingestion_stage:
-                self.ingestion_stage_durations[self.ingestion_stage] = elapsed
-        else:
-            self._timer = PerfTimer()
-
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None
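
This hunk replaces the start-only report_ingestion_stage_start() call with a context manager, so a stage's elapsed time is always recorded when its block exits; the with-blocks added across the Unity Catalog hunks above are the call-site half of the same change. A brief usage sketch based on the new API (the stage names are illustrative; the durations dict is keyed by the stage label plus its start timestamp):

from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

report = IngestionStageReport()

with report.new_stage("Ingest schema demo.schema"):
    pass  # emit work units for this stage; elapsed time is recorded on exit

with report.new_stage("Profiling"):
    pass

# One entry per stage, e.g. {"Ingest schema demo.schema at 2024-01-01 00:00:00+00:00": 0.0, ...}
print(report.ingestion_stage_durations)
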
@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
                 ).add_asset(container_urn)
                 data_products_container[data_product_urn] = container_product
             else:
-                data_products_container[
-                    data_product_urn
-                ] = data_products_container[data_product_urn].add_asset(
-                    container_urn
+                data_products_container[data_product_urn] = (
+                    data_products_container[data_product_urn].add_asset(
+                        container_urn
+                    )
                 )
 
         mcps: List[
@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
    ) -> Optional[DatasetPropertiesClass]:
        assert dataset_properties_aspect
 
-        server_dataset_properties_aspect: Optional[
-            DatasetPropertiesClass
-        ] = graph.get_dataset_properties(entity_urn)
+        server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+            graph.get_dataset_properties(entity_urn)
+        )
        # No need to take any action if server properties is None or there is not customProperties in server properties
        if (
            server_dataset_properties_aspect is None
@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
        server_field_map: dict = {}
        if self.config.semantics == TransformerSemantics.PATCH:
            assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
            if server_schema_metadata_aspect is not None:
                if not schema_metadata_aspect:
                    schema_metadata_aspect = server_schema_metadata_aspect
@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
        ] = {}  # Map to cache server field objects, where fieldPath is key
        if self.config.semantics == TransformerSemantics.PATCH:
            assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
            if server_schema_metadata_aspect is not None:
                if not schema_metadata_aspect:
                    schema_metadata_aspect = server_schema_metadata_aspect
@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
            domain_aspect.domains.extend(mapped_domains.domains)
            if self.config.semantics == TransformerSemantics.PATCH:
                # Try merging with server-side domains
-                patch_domain_aspect: Optional[
-                    DomainsClass
-                ] = AddDatasetDomain._merge_with_server_domains(
-                    self.ctx.graph, entity_urn, domain_aspect
+                patch_domain_aspect: Optional[DomainsClass] = (
+                    AddDatasetDomain._merge_with_server_domains(
+                        self.ctx.graph, entity_urn, domain_aspect
+                    )
                )
                return cast(Optional[Aspect], patch_domain_aspect)
        return cast(Optional[Aspect], domain_aspect)
@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
            else:
                owner_type = get_owner_type(self.config.owner_type)
                if owner_type == OwnershipTypeClass.CUSTOM:
-                    assert (
-                        self.config.owner_type_urn is not None
-                    ), "owner_type_urn must be set if owner_type is CUSTOM"
+                    assert self.config.owner_type_urn is not None, (
+                        "owner_type_urn must be set if owner_type is CUSTOM"
+                    )
 
                owners.append(
                    OwnerClass(
@@ -92,9 +92,9 @@ class TagsToTermMapper(TagsToTermTransformer):
        in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
            entity_urn
        )
-        in_schema_metadata_aspect: Optional[
-            SchemaMetadataClass
-        ] = self.ctx.graph.get_schema_metadata(entity_urn)
+        in_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+            self.ctx.graph.get_schema_metadata(entity_urn)
+        )
 
        if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
            return cast(Aspect, in_glossary_terms)
@@ -134,10 +134,10 @@ class TagsToTermMapper(TagsToTermTransformer):
        )
 
        if self.config.semantics == TransformerSemantics.PATCH:
-            patch_glossary_terms: Optional[
-                GlossaryTermsClass
-            ] = TagsToTermMapper._merge_with_server_glossary_terms(
-                self.ctx.graph, entity_urn, out_glossary_terms
+            patch_glossary_terms: Optional[GlossaryTermsClass] = (
+                TagsToTermMapper._merge_with_server_glossary_terms(
+                    self.ctx.graph, entity_urn, out_glossary_terms
+                )
            )
            return cast(Optional[Aspect], patch_glossary_terms)
        else:
@@ -61,17 +61,17 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
    def create(
        cls, output_dir: str, extras: Dict[str, str]
    ) -> "SnowflakeAssertionCompiler":
-        assert os.path.exists(
-            output_dir
-        ), f"Specified location {output_dir} does not exist."
+        assert os.path.exists(output_dir), (
+            f"Specified location {output_dir} does not exist."
+        )
 
-        assert os.path.isdir(
-            output_dir
-        ), f"Specified location {output_dir} is not a folder."
+        assert os.path.isdir(output_dir), (
+            f"Specified location {output_dir} is not a folder."
+        )
 
-        assert any(
-            x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
-        ), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), (
+            "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        )
 
        return SnowflakeAssertionCompiler(output_dir, extras)
 
@@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str:
    elif isinstance(trigger.trigger, CronTrigger):
        return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
    elif isinstance(trigger.trigger, IntervalTrigger):
-        return f"{trigger.trigger.interval.seconds/60} MIN"
+        return f"{trigger.trigger.interval.seconds / 60} MIN"
    else:
        raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")
@@ -163,9 +163,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
 
                if "properties" not in writeable_dict["systemMetadata"]:
                    writeable_dict["systemMetadata"]["properties"] = {}
-                writeable_dict["systemMetadata"]["properties"][
-                    "sysVersion"
-                ] = new_version
+                writeable_dict["systemMetadata"]["properties"]["sysVersion"] = (
+                    new_version
+                )
            if needs_write:
                self.duckdb_client.execute(
                    query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)",
@@ -208,9 +208,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                    "lastObserved": writeable.systemMetadata.lastObserved
                }
            else:
-                system_metadata[
-                    "lastObserved"
-                ] = writeable.systemMetadata.lastObserved
+                system_metadata["lastObserved"] = (
+                    writeable.systemMetadata.lastObserved
+                )
            self.duckdb_client.execute(
                query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
                parameters=[
@@ -497,9 +497,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
            aspect_name = r[1]
            aspect_payload = json.loads(r[2])
            if typed:
-                assert (
-                    aspect_name in ASPECT_MAP
-                ), f"Missing aspect name {aspect_name} in the registry"
+                assert aspect_name in ASPECT_MAP, (
+                    f"Missing aspect name {aspect_name} in the registry"
+                )
                try:
                    aspect_payload = ASPECT_MAP[aspect_name].from_obj(
                        post_json_transform(aspect_payload)
@@ -531,7 +531,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
        for r in results.fetchall():
            urn = r[0]
            aspect_name = r[1]
-            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2])))  # type: ignore
+            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+                post_json_transform(json.loads(r[2]))
+            )  # type: ignore
            system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
            mcp = MetadataChangeProposalWrapper(
                entityUrn=urn,