acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel):
     @root_validator(skip_on_failure=True)
     def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         if values.get("client_x509_cert_url") is None:
-            values[
-                "
+            values["client_x509_cert_url"] = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            )
         return values

     def create_credential_temp_file(self) -> str:
@@ -447,6 +447,14 @@ class BigQueryV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     @property
     def have_table_data_read_permission(self) -> bool:
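The two `include_*` fields added above only apply on the queries-v2 path. A minimal sketch of setting them on the config, assuming the usual pydantic `parse_obj` entry point and a placeholder project id (`project_ids` and `use_queries_v2` are pre-existing `BigQueryV2Config` fields):

    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

    # Illustrative only; "my-gcp-project" is a placeholder project id.
    config = BigQueryV2Config.parse_obj(
        {
            "project_ids": ["my-gcp-project"],
            "use_queries_v2": True,
            # Added in this diff; both default to True and only apply with use_queries_v2.
            "include_queries": True,
            "include_query_usage_statistics": True,
        }
    )
    print(config.include_queries, config.include_query_usage_statistics)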
@@ -603,9 +611,9 @@ class BigQueryV2Config(
         cls, v: Optional[List[str]], values: Dict
     ) -> Optional[List[str]]:
         if values.get("use_exported_bigquery_audit_metadata"):
-            assert (
-            )
+            assert v and len(v) > 0, (
+                "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+            )

         return v
datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py

@@ -87,9 +87,9 @@ class BigQueryPlatformResourceHelper:
             key=platform_resource_key, graph_client=self.graph
         )
         if platform_resource:
-            self.platform_resource_cache[
+            self.platform_resource_cache[platform_resource_key.primary_key] = (
+                platform_resource
+            )
             return platform_resource
         return None

@@ -115,7 +115,11 @@ class BigQueryPlatformResourceHelper:
             and platform_resource.resource_info.value
         ):
             try:
-                existing_info: Optional[BigQueryLabelInfo] =
+                existing_info: Optional[BigQueryLabelInfo] = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
+                )
             except ValidationError as e:
                 logger.error(
                     f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change."
datahub/ingestion/source/bigquery_v2/bigquery_report.py

@@ -190,6 +190,3 @@ class BigQueryV2Report(
     num_skipped_external_table_lineage: int = 0

     queries_extractor: Optional[BigQueryQueriesExtractorReport] = None
-
-    def set_ingestion_stage(self, project_id: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{project_id}: {stage}")
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -248,9 +248,9 @@ class BigQuerySchemaGenerator:
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+            logger.info(f"Processing project: {project.id}")
+            yield from self._process_project(project)

     def get_dataplatform_instance_aspect(
         self, dataset_urn: str, project_id: str

@@ -311,8 +311,10 @@ class BigQuerySchemaGenerator:
         platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
             label, tag_urn, managed_by_datahub=False
         )
-        label_info: BigQueryLabelInfo =
+        label_info: BigQueryLabelInfo = (
+            platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                BigQueryLabelInfo
+            )
         )
         tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -405,11 +407,11 @@ class BigQuerySchemaGenerator:

         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            self.report.
+            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+                yield from self.profiler.get_workunits(
+                    project_id=project_id,
+                    tables=db_tables,
+                )

     def _process_project_datasets(
         self,

@@ -820,8 +822,10 @@ class BigQuerySchemaGenerator:
         platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
             label, tag_urn, managed_by_datahub=False
         )
-        label_info: BigQueryLabelInfo =
+        label_info: BigQueryLabelInfo = (
+            platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                BigQueryLabelInfo
+            )
         )
         tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -860,8 +864,10 @@ class BigQuerySchemaGenerator:
         platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
             label, tag_urn, managed_by_datahub=False
         )
-        label_info: BigQueryLabelInfo =
+        label_info: BigQueryLabelInfo = (
+            platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                BigQueryLabelInfo
+            )
         )
         tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -1203,8 +1209,8 @@ class BigQuerySchemaGenerator:
             report=self.report,
         )

-        self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] =
-            timer.elapsed_seconds()
+        self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = (
+            timer.elapsed_seconds(digits=2)
         )

     def get_core_table_details(
datahub/ingestion/source/bigquery_v2/lineage.py

@@ -330,11 +330,11 @@ class BigqueryLineageExtractor:
             projects = ["*"]  # project_id not used when using exported metadata

         for project in projects:
-            self.report.
+            with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+                yield from self.generate_lineage(
+                    project,
+                    table_refs,
+                )

         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.

@@ -368,8 +368,8 @@ class BigqueryLineageExtractor:
         self.report.lineage_metadata_entries[project_id] = len(lineage)
         logger.info(f"Built lineage map containing {len(lineage)} entries.")
         logger.debug(f"lineage metadata is {lineage}")
-        self.report.lineage_extraction_sec[project_id] =
+        self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+            digits=2
         )
         self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
             memory_footprint.total_size(lineage)

@@ -697,7 +697,7 @@ class BigqueryLineageExtractor:
             if parsed_queries[-1]:
                 query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS
                 (
-                    {parsed_queries[-1].sql(dialect=
+                    {parsed_queries[-1].sql(dialect="bigquery")}
                 )"""
             else:
                 query = e.query

@@ -809,11 +809,11 @@ class BigqueryLineageExtractor:
                     upstream_lineage, temp_table_upstream
                 )

-                upstreams[
+                upstreams[ref_temp_table_upstream] = (
+                    _merge_lineage_edge_columns(
+                        upstreams.get(ref_temp_table_upstream),
+                        collapsed_lineage,
+                    )
                 )
             else:
                 upstreams[upstream_table_ref] = _merge_lineage_edge_columns(

@@ -1004,9 +1004,9 @@ class BigqueryLineageExtractor:
                 dataset_urn
             )
             for gcs_dataset_urn in gcs_urns:
-                schema_metadata_for_gcs: Optional[
+                schema_metadata_for_gcs: Optional[SchemaMetadataClass] = (
+                    graph.get_schema_metadata(gcs_dataset_urn)
+                )
                 if schema_metadata and schema_metadata_for_gcs:
                     fine_grained_lineage = self.get_fine_grained_lineages_with_gcs(
                         dataset_urn,
datahub/ingestion/source/bigquery_v2/queries_extractor.py

@@ -271,9 +271,9 @@ class BigQueryQueriesExtractor(Closeable):
         # Preprocessing stage that deduplicates the queries using query hash per usage bucket
         # Note: FileBackedDict is an ordered dictionary, so the order of execution of
         # queries is inherently maintained
-        queries_deduped: FileBackedDict[
+        queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = (
+            self.deduplicate_queries(queries)
+        )
         self.report.num_unique_queries = len(queries_deduped)
         logger.info(f"Found {self.report.num_unique_queries} unique queries")
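The comment in this hunk leans on `FileBackedDict` (see `datahub/utilities/file_backed_collections.py` in the file list) being an ordered, disk-backed mapping. A tiny illustrative sketch of that behaviour, independent of the BigQuery extractor; the keys and values are made up, and the no-argument constructor with default serialization is an assumption:

    from datahub.utilities.file_backed_collections import FileBackedDict

    # Values are written to a temporary SQLite file rather than held in memory,
    # and iteration preserves insertion order.
    queries: FileBackedDict[dict] = FileBackedDict()
    queries["hash-1"] = {"query": "SELECT 1"}
    queries["hash-2"] = {"query": "SELECT 2"}

    for query_hash in queries:
        print(query_hash, queries[query_hash]["query"])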
datahub/ingestion/source/bigquery_v2/usage.py

@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )

     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                for query_hash, count in entry.query_freq
-            ]
-            yield make_usage_workunit(
-                bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                resource=BigQueryTableRef.from_string_name(entry.resource),
-                query_count=entry.query_count,
-                query_freq=query_freq,
-                user_freq=entry.user_freq,
-                column_freq=entry.column_freq,
-                bucket_duration=self.config.bucket_duration,
-                resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                top_n_queries=self.config.usage.top_n_queries,
-                format_sql_queries=self.config.usage.format_sql_queries,
-                queries_character_limit=self.config.usage.queries_character_limit,
-            )
-            self.report.num_usage_workunits_emitted += 1
-        except Exception as e:
-            self.report.warning(
-                message="Unable to generate usage statistics workunit",
-                context=f"{entry.timestamp}, {entry.resource}",
-                exc=e,
-            )

     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:

@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.
-                    project_id
-                    )
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(

@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                 self.report_status(f"usage-extraction-{project_id}", False)

-            self.report.usage_extraction_sec[project_id] =
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )

     def _store_usage_event(

@@ -763,9 +763,9 @@ class BigQueryUsageExtractor:
         )

         if event.query_event.default_dataset:
-            custom_properties[
+            custom_properties["defaultDatabase"] = (
+                event.query_event.default_dataset
+            )
         if event.read_event:
             if event.read_event.readReason:
                 custom_properties["readReason"] = event.read_event.readReason
datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
            tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )

     def generate_profile(
         self,

datahub/ingestion/source/cassandra/cassandra_utils.py

@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
         else:
             raise KeyError(f"Unknown entity {ent_type}.")

-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(

@@ -110,10 +107,10 @@ class CassandraToSchemaFieldConverter:

     @staticmethod
     def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
+        type_class: Optional[Type] = (
+            CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                cassandra_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/confluent_schema_registry.py

@@ -293,9 +293,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
     def _load_json_schema_with_resolved_references(
         self, schema: Schema, name: str, subject: str
     ) -> dict:
-        imported_json_schemas: List[
+        imported_json_schemas: List[JsonSchemaWrapper] = (
+            self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+        )
         schema_dict = json.loads(schema.schema_str)
         reference_map = {}
         for imported_schema in imported_json_schemas:

@@ -332,9 +332,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
             )

         elif schema.schema_type == "PROTOBUF":
-            imported_schemas: List[
+            imported_schemas: List[ProtobufSchema] = (
+                self.get_schemas_from_confluent_ref_protobuf(schema)
+            )
             base_name: str = topic.replace(".", "_")
             fields = protobuf_util.protobuf_schema_to_mce_fields(
                 ProtobufSchema(
datahub/ingestion/source/csv_enricher.py

@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1

@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu

-        maybe_owners_wu: Optional[
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu

-        maybe_domain_wu: Optional[
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu

-        maybe_description_wu: Optional[
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1

@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
         needs_write: bool,
     ) -> Tuple[EditableSchemaMetadataClass, bool]:
         field_path: str = sub_resource_row.field_path
-        term_associations: List[
+        term_associations: List[GlossaryTermAssociationClass] = (
+            sub_resource_row.term_associations
+        )
         tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
         description: Optional[str] = sub_resource_row.description
         has_terms: bool = len(term_associations) > 0

@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
         # Boolean field to tell whether we need to write an MCPW.
         needs_write = False

-        current_editable_schema_metadata: Optional[
+        current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+            None
+        )
         if self.ctx.graph and not self.should_overwrite:
             # Fetch the current editable schema metadata
             current_editable_schema_metadata = self.ctx.graph.get_aspect(

@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
         entity_urn = row["resource"]
         entity_type = Urn.from_string(row["resource"]).get_type()

-        term_associations: List[
+        term_associations: List[GlossaryTermAssociationClass] = (
+            self.maybe_extract_glossary_terms(row)
+        )
         tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
         owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)