acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub has been flagged as potentially problematic; consult the package registry's advisory page for details.

Files changed (211):
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -4,12 +4,14 @@ from typing import Dict, Iterable, List, Optional, Union
4
4
 
5
5
  from datahub.configuration.pattern_utils import is_schema_allowed
6
6
  from datahub.emitter.mce_builder import (
7
+ get_sys_time,
7
8
  make_data_platform_urn,
8
9
  make_dataset_urn_with_platform_instance,
9
10
  make_schema_field_urn,
10
11
  make_tag_urn,
11
12
  )
12
13
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
14
+ from datahub.emitter.mcp_builder import add_structured_properties_to_entity_wu
13
15
  from datahub.ingestion.api.source import SourceReport
14
16
  from datahub.ingestion.api.workunit import MetadataWorkUnit
15
17
  from datahub.ingestion.glossary.classification_mixin import (
@@ -72,6 +74,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
72
74
  PROFILING,
73
75
  )
74
76
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
77
+ AuditStamp,
75
78
  GlobalTags,
76
79
  Status,
77
80
  SubTypes,
@@ -98,7 +101,18 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
98
101
  StringType,
99
102
  TimeType,
100
103
  )
104
+ from datahub.metadata.com.linkedin.pegasus2avro.structured import (
105
+ StructuredPropertyDefinition,
106
+ )
101
107
  from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
108
+ from datahub.metadata.urns import (
109
+ ContainerUrn,
110
+ DatasetUrn,
111
+ DataTypeUrn,
112
+ EntityTypeUrn,
113
+ SchemaFieldUrn,
114
+ StructuredPropertyUrn,
115
+ )
102
116
  from datahub.sql_parsing.sql_parsing_aggregator import (
103
117
  KnownLineageMapping,
104
118
  SqlParsingAggregator,
@@ -180,9 +194,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
180
194
  config, self.data_dictionary, self.report
181
195
  )
182
196
  self.profiler: Optional[SnowflakeProfiler] = profiler
183
- self.snowsight_url_builder: Optional[
184
- SnowsightUrlBuilder
185
- ] = snowsight_url_builder
197
+ self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
198
+ snowsight_url_builder
199
+ )
186
200
 
187
201
  # These are populated as side-effects of get_workunits_internal.
188
202
  self.databases: List[SnowflakeDatabase] = []
@@ -216,21 +230,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
216
230
 
217
231
  try:
218
232
  for snowflake_db in self.databases:
219
- self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
220
- yield from self._process_database(snowflake_db)
233
+ with self.report.new_stage(
234
+ f"{snowflake_db.name}: {METADATA_EXTRACTION}"
235
+ ):
236
+ yield from self._process_database(snowflake_db)
221
237
 
222
- self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
223
- discovered_tables: List[str] = [
224
- self.identifiers.get_dataset_identifier(
225
- table_name, schema.name, db.name
226
- )
227
- for db in self.databases
228
- for schema in db.schemas
229
- for table_name in schema.tables
230
- ]
231
- if self.aggregator:
232
- for entry in self._external_tables_ddl_lineage(discovered_tables):
233
- self.aggregator.add(entry)
238
+ with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
239
+ discovered_tables: List[str] = [
240
+ self.identifiers.get_dataset_identifier(
241
+ table_name, schema.name, db.name
242
+ )
243
+ for db in self.databases
244
+ for schema in db.schemas
245
+ for table_name in schema.tables
246
+ ]
247
+ if self.aggregator:
248
+ for entry in self._external_tables_ddl_lineage(discovered_tables):
249
+ self.aggregator.add(entry)
234
250
 
235
251
  except SnowflakePermissionError as e:
236
252
  self.structured_reporter.failure(
@@ -251,9 +267,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
251
267
  )
252
268
  return None
253
269
  else:
254
- ischema_databases: List[
255
- SnowflakeDatabase
256
- ] = self.get_databases_from_ischema(databases)
270
+ ischema_databases: List[SnowflakeDatabase] = (
271
+ self.get_databases_from_ischema(databases)
272
+ )
257
273
 
258
274
  if len(ischema_databases) == 0:
259
275
  self.structured_reporter.failure(
@@ -332,8 +348,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
332
348
  yield from self._process_db_schemas(snowflake_db, db_tables)
333
349
 
334
350
  if self.profiler and db_tables:
335
- self.report.set_ingestion_stage(snowflake_db.name, PROFILING)
336
- yield from self.profiler.get_workunits(snowflake_db, db_tables)
351
+ with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
352
+ yield from self.profiler.get_workunits(snowflake_db, db_tables)
337
353
 
338
354
  def _process_db_schemas(
339
355
  self,
@@ -671,14 +687,31 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
671
687
  yield from self.gen_dataset_workunits(view, schema_name, db_name)
672
688
 
673
689
  def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
674
- tag_identifier = tag.identifier()
690
+ use_sp = self.config.extract_tags_as_structured_properties
691
+ identifier = (
692
+ self.snowflake_identifier(tag.structured_property_identifier())
693
+ if use_sp
694
+ else tag.tag_identifier()
695
+ )
675
696
 
676
- if self.report.is_tag_processed(tag_identifier):
697
+ if self.report.is_tag_processed(identifier):
677
698
  return
678
699
 
679
- self.report.report_tag_processed(tag_identifier)
680
-
681
- yield from self.gen_tag_workunits(tag)
700
+ self.report.report_tag_processed(identifier)
701
+ if use_sp:
702
+ yield from self.gen_tag_as_structured_property_workunits(tag)
703
+ else:
704
+ yield from self.gen_tag_workunits(tag)
705
+
706
+ def _format_tags_as_structured_properties(
707
+ self, tags: List[SnowflakeTag]
708
+ ) -> Dict[StructuredPropertyUrn, str]:
709
+ return {
710
+ StructuredPropertyUrn(
711
+ self.snowflake_identifier(tag.structured_property_identifier())
712
+ ): tag.value
713
+ for tag in tags
714
+ }
682
715
 
683
716
  def gen_dataset_workunits(
684
717
  self,
@@ -723,6 +756,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
723
756
  env=self.config.env,
724
757
  )
725
758
 
759
+ if self.config.extract_tags_as_structured_properties:
760
+ yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
761
+
726
762
  yield from add_table_to_schema_container(
727
763
  dataset_urn=dataset_urn,
728
764
  parent_container_key=schema_container_key,
@@ -756,16 +792,24 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
756
792
  )
757
793
 
758
794
  if table.tags:
759
- tag_associations = [
760
- TagAssociation(
761
- tag=make_tag_urn(self.snowflake_identifier(tag.identifier()))
795
+ if self.config.extract_tags_as_structured_properties:
796
+ yield from add_structured_properties_to_entity_wu(
797
+ dataset_urn,
798
+ self._format_tags_as_structured_properties(table.tags),
762
799
  )
763
- for tag in table.tags
764
- ]
765
- global_tags = GlobalTags(tag_associations)
766
- yield MetadataChangeProposalWrapper(
767
- entityUrn=dataset_urn, aspect=global_tags
768
- ).as_workunit()
800
+ else:
801
+ tag_associations = [
802
+ TagAssociation(
803
+ tag=make_tag_urn(
804
+ self.snowflake_identifier(tag.tag_identifier())
805
+ )
806
+ )
807
+ for tag in table.tags
808
+ ]
809
+ global_tags = GlobalTags(tag_associations)
810
+ yield MetadataChangeProposalWrapper(
811
+ entityUrn=dataset_urn, aspect=global_tags
812
+ ).as_workunit()
769
813
 
770
814
  if isinstance(table, SnowflakeView) and table.view_definition is not None:
771
815
  view_properties_aspect = ViewProperties(
@@ -838,10 +882,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
838
882
  )
839
883
 
840
884
  def gen_tag_workunits(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
841
- tag_urn = make_tag_urn(self.snowflake_identifier(tag.identifier()))
885
+ tag_urn = make_tag_urn(self.snowflake_identifier(tag.tag_identifier()))
842
886
 
843
887
  tag_properties_aspect = TagProperties(
844
- name=tag.display_name(),
888
+ name=tag.tag_display_name(),
845
889
  description=f"Represents the Snowflake tag `{tag._id_prefix_as_str()}` with value `{tag.value}`.",
846
890
  )
847
891
 
@@ -849,6 +893,41 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
849
893
  entityUrn=tag_urn, aspect=tag_properties_aspect
850
894
  ).as_workunit()
851
895
 
896
+ def gen_tag_as_structured_property_workunits(
897
+ self, tag: SnowflakeTag
898
+ ) -> Iterable[MetadataWorkUnit]:
899
+ identifier = self.snowflake_identifier(tag.structured_property_identifier())
900
+ urn = StructuredPropertyUrn(identifier).urn()
901
+ aspect = StructuredPropertyDefinition(
902
+ qualifiedName=identifier,
903
+ displayName=tag.name,
904
+ valueType=DataTypeUrn("datahub.string").urn(),
905
+ entityTypes=[
906
+ EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
907
+ EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
908
+ EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
909
+ ],
910
+ lastModified=AuditStamp(
911
+ time=get_sys_time(), actor="urn:li:corpuser:datahub"
912
+ ),
913
+ )
914
+ yield MetadataChangeProposalWrapper(
915
+ entityUrn=urn,
916
+ aspect=aspect,
917
+ ).as_workunit()
918
+
919
+ def gen_column_tags_as_structured_properties(
920
+ self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
921
+ ) -> Iterable[MetadataWorkUnit]:
922
+ for column_name in table.column_tags:
923
+ schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
924
+ yield from add_structured_properties_to_entity_wu(
925
+ schema_field_urn,
926
+ self._format_tags_as_structured_properties(
927
+ table.column_tags[column_name]
928
+ ),
929
+ )
930
+
852
931
  def gen_schema_metadata(
853
932
  self,
854
933
  table: Union[SnowflakeTable, SnowflakeView],
@@ -890,13 +969,14 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
890
969
  [
891
970
  TagAssociation(
892
971
  make_tag_urn(
893
- self.snowflake_identifier(tag.identifier())
972
+ self.snowflake_identifier(tag.tag_identifier())
894
973
  )
895
974
  )
896
975
  for tag in table.column_tags[col.name]
897
976
  ]
898
977
  )
899
978
  if col.name in table.column_tags
979
+ and not self.config.extract_tags_as_structured_properties
900
980
  else None
901
981
  ),
902
982
  )
@@ -983,8 +1063,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
983
1063
  )
984
1064
  ),
985
1065
  tags=(
986
- [self.snowflake_identifier(tag.identifier()) for tag in database.tags]
1066
+ [
1067
+ self.snowflake_identifier(tag.tag_identifier())
1068
+ for tag in database.tags
1069
+ ]
987
1070
  if database.tags
1071
+ and not self.config.extract_tags_as_structured_properties
1072
+ else None
1073
+ ),
1074
+ structured_properties=(
1075
+ self._format_tags_as_structured_properties(database.tags)
1076
+ if database.tags and self.config.extract_tags_as_structured_properties
988
1077
  else None
989
1078
  ),
990
1079
  )
@@ -1036,8 +1125,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1036
1125
  else None
1037
1126
  ),
1038
1127
  tags=(
1039
- [self.snowflake_identifier(tag.identifier()) for tag in schema.tags]
1040
- if schema.tags
1128
+ [self.snowflake_identifier(tag.tag_identifier()) for tag in schema.tags]
1129
+ if schema.tags and not self.config.extract_tags_as_structured_properties
1130
+ else None
1131
+ ),
1132
+ structured_properties=(
1133
+ self._format_tags_as_structured_properties(schema.tags)
1134
+ if schema.tags and self.config.extract_tags_as_structured_properties
1041
1135
  else None
1042
1136
  ),
1043
1137
  )
@@ -38,9 +38,9 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
38
38
  table_name: Optional[str],
39
39
  ) -> List[SnowflakeTag]:
40
40
  if db_name not in self.tag_cache:
41
- self.tag_cache[
42
- db_name
43
- ] = self.data_dictionary.get_tags_for_database_without_propagation(db_name)
41
+ self.tag_cache[db_name] = (
42
+ self.data_dictionary.get_tags_for_database_without_propagation(db_name)
43
+ )
44
44
 
45
45
  if domain == SnowflakeObjectDomain.DATABASE:
46
46
  return self.tag_cache[db_name].get_database_tags(db_name)
@@ -130,10 +130,10 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
130
130
  temp_column_tags: Dict[str, List[SnowflakeTag]] = {}
131
131
  if self.config.extract_tags == TagOption.without_lineage:
132
132
  if db_name not in self.tag_cache:
133
- self.tag_cache[
134
- db_name
135
- ] = self.data_dictionary.get_tags_for_database_without_propagation(
136
- db_name
133
+ self.tag_cache[db_name] = (
134
+ self.data_dictionary.get_tags_for_database_without_propagation(
135
+ db_name
136
+ )
137
137
  )
138
138
  temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table(
139
139
  table_name, schema_name, db_name
@@ -165,10 +165,20 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
165
165
 
166
166
  allowed_tags = []
167
167
  for tag in tags:
168
- tag_identifier = tag.identifier()
169
- self.report.report_entity_scanned(tag_identifier, "tag")
170
- if not self.config.tag_pattern.allowed(tag_identifier):
171
- self.report.report_dropped(tag_identifier)
168
+ identifier = (
169
+ tag._id_prefix_as_str()
170
+ if self.config.extract_tags_as_structured_properties
171
+ else tag.tag_identifier()
172
+ )
173
+ self.report.report_entity_scanned(identifier, "tag")
174
+
175
+ pattern = (
176
+ self.config.structured_property_pattern
177
+ if self.config.extract_tags_as_structured_properties
178
+ else self.config.tag_pattern
179
+ )
180
+ if not pattern.allowed(identifier):
181
+ self.report.report_dropped(identifier)
172
182
  else:
173
183
  allowed_tags.append(tag)
174
184
  return allowed_tags
@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
146
146
  if not self._should_ingest_usage():
147
147
  return
148
148
 
149
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
150
- if self.report.edition == SnowflakeEdition.STANDARD.value:
151
- logger.info(
152
- "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
153
- )
154
- return
149
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
150
+ if self.report.edition == SnowflakeEdition.STANDARD.value:
151
+ logger.info(
152
+ "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
153
+ )
154
+ return
155
155
 
156
- logger.info("Checking usage date ranges")
156
+ logger.info("Checking usage date ranges")
157
157
 
158
- self._check_usage_date_ranges()
158
+ self._check_usage_date_ranges()
159
159
 
160
- # If permission error, execution returns from here
161
- if (
162
- self.report.min_access_history_time is None
163
- or self.report.max_access_history_time is None
164
- ):
165
- return
160
+ # If permission error, execution returns from here
161
+ if (
162
+ self.report.min_access_history_time is None
163
+ or self.report.max_access_history_time is None
164
+ ):
165
+ return
166
166
 
167
- # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
168
- # Now, we report the usage as well as operation metadata even if user email is absent
167
+ # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
168
+ # Now, we report the usage as well as operation metadata even if user email is absent
169
169
 
170
- if self.config.include_usage_stats:
171
- yield from auto_empty_dataset_usage_statistics(
172
- self._get_workunits_internal(discovered_datasets),
173
- config=BaseTimeWindowConfig(
174
- start_time=self.start_time,
175
- end_time=self.end_time,
176
- bucket_duration=self.config.bucket_duration,
177
- ),
178
- dataset_urns={
179
- self.identifiers.gen_dataset_urn(dataset_identifier)
180
- for dataset_identifier in discovered_datasets
181
- },
182
- )
170
+ if self.config.include_usage_stats:
171
+ yield from auto_empty_dataset_usage_statistics(
172
+ self._get_workunits_internal(discovered_datasets),
173
+ config=BaseTimeWindowConfig(
174
+ start_time=self.start_time,
175
+ end_time=self.end_time,
176
+ bucket_duration=self.config.bucket_duration,
177
+ ),
178
+ dataset_urns={
179
+ self.identifiers.gen_dataset_urn(dataset_identifier)
180
+ for dataset_identifier in discovered_datasets
181
+ },
182
+ )
183
183
 
184
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
184
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
185
+ if self.config.include_operational_stats:
186
+ # Generate the operation workunits.
187
+ access_events = self._get_snowflake_history()
188
+ for event in access_events:
189
+ yield from self._get_operation_aspect_work_unit(
190
+ event, discovered_datasets
191
+ )
185
192
 
186
- if self.config.include_operational_stats:
187
- # Generate the operation workunits.
188
- access_events = self._get_snowflake_history()
189
- for event in access_events:
190
- yield from self._get_operation_aspect_work_unit(
191
- event, discovered_datasets
193
+ if self.redundant_run_skip_handler:
194
+ # Update the checkpoint state for this run.
195
+ self.redundant_run_skip_handler.update_state(
196
+ self.config.start_time,
197
+ self.config.end_time,
198
+ self.config.bucket_duration,
192
199
  )
193
200
 
194
- if self.redundant_run_skip_handler:
195
- # Update the checkpoint state for this run.
196
- self.redundant_run_skip_handler.update_state(
197
- self.config.start_time,
198
- self.config.end_time,
199
- self.config.bucket_duration,
200
- )
201
-
202
201
  def _get_workunits_internal(
203
202
  self, discovered_datasets: List[str]
204
203
  ) -> Iterable[MetadataWorkUnit]:
@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
386
385
  )
387
386
  self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
388
387
  return
389
- self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)
388
+ self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)
390
389
 
391
390
  for row in results:
392
391
  yield from self._process_snowflake_history_row(row)
@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
434
433
  self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
435
434
  tz=timezone.utc
436
435
  )
437
- self.report.access_history_range_query_secs = round(
438
- timer.elapsed_seconds(), 2
436
+ self.report.access_history_range_query_secs = timer.elapsed_seconds(
437
+ digits=2
439
438
  )
440
439
 
441
440
  def _get_operation_aspect_work_unit(
@@ -550,9 +549,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
550
549
  ):
551
550
  # NOTE: Generated emails may be incorrect, as email may be different than
552
551
  # username@email_domain
553
- event_dict[
554
- "EMAIL"
555
- ] = f'{event_dict["USER_NAME"]}@{self.config.email_domain}'.lower()
552
+ event_dict["EMAIL"] = (
553
+ f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
554
+ )
556
555
 
557
556
  if not event_dict["EMAIL"]:
558
557
  self.report.rows_missing_email += 1
@@ -21,8 +21,7 @@ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Repor
21
21
  class SnowflakeStructuredReportMixin(abc.ABC):
22
22
  @property
23
23
  @abc.abstractmethod
24
- def structured_reporter(self) -> SourceReport:
25
- ...
24
+ def structured_reporter(self) -> SourceReport: ...
26
25
 
27
26
 
28
27
  class SnowsightUrlBuilder:
@@ -23,7 +23,6 @@ from datahub.ingestion.api.incremental_properties_helper import (
23
23
  from datahub.ingestion.api.source import (
24
24
  CapabilityReport,
25
25
  MetadataWorkUnitProcessor,
26
- Source,
27
26
  SourceCapability,
28
27
  SourceReport,
29
28
  TestableSource,
@@ -212,9 +211,9 @@ class SnowflakeV2Source(
212
211
 
213
212
  self.usage_extractor: Optional[SnowflakeUsageExtractor] = None
214
213
  if self.config.include_usage_stats or self.config.include_operational_stats:
215
- redundant_usage_run_skip_handler: Optional[
216
- RedundantUsageRunSkipHandler
217
- ] = None
214
+ redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
215
+ None
216
+ )
218
217
  if self.config.enable_stateful_usage_ingestion:
219
218
  redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
220
219
  source=self,
@@ -251,11 +250,6 @@ class SnowflakeV2Source(
251
250
 
252
251
  self.add_config_to_report()
253
252
 
254
- @classmethod
255
- def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
256
- config = SnowflakeV2Config.parse_obj(config_dict)
257
- return cls(ctx, config)
258
-
259
253
  @staticmethod
260
254
  def test_connection(config_dict: dict) -> TestConnectionReport:
261
255
  test_report = TestConnectionReport()
@@ -302,7 +296,16 @@ class SnowflakeV2Source(
302
296
 
303
297
  _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict()
304
298
  privileges: List[SnowflakePrivilege] = []
305
- capabilities: List[SourceCapability] = [c.capability for c in SnowflakeV2Source.get_capabilities() if c.capability not in (SourceCapability.PLATFORM_INSTANCE, SourceCapability.DOMAINS, SourceCapability.DELETION_DETECTION)] # type: ignore
299
+ capabilities: List[SourceCapability] = [
300
+ c.capability
301
+ for c in SnowflakeV2Source.get_capabilities() # type: ignore
302
+ if c.capability
303
+ not in (
304
+ SourceCapability.PLATFORM_INSTANCE,
305
+ SourceCapability.DOMAINS,
306
+ SourceCapability.DELETION_DETECTION,
307
+ )
308
+ ]
306
309
 
307
310
  cur = conn.query("select current_role()")
308
311
  current_role = [row["CURRENT_ROLE()"] for row in cur][0]
@@ -480,8 +483,8 @@ class SnowflakeV2Source(
480
483
  identifiers=self.identifiers,
481
484
  )
482
485
 
483
- self.report.set_ingestion_stage("*", METADATA_EXTRACTION)
484
- yield from schema_extractor.get_workunits_internal()
486
+ with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
487
+ yield from schema_extractor.get_workunits_internal()
485
488
 
486
489
  databases = schema_extractor.databases
487
490
 
@@ -513,45 +516,46 @@ class SnowflakeV2Source(
513
516
  discovered_datasets = discovered_tables + discovered_views
514
517
 
515
518
  if self.config.use_queries_v2:
516
- self.report.set_ingestion_stage("*", VIEW_PARSING)
517
- yield from auto_workunit(self.aggregator.gen_metadata())
518
-
519
- self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
520
-
521
- schema_resolver = self.aggregator._schema_resolver
522
-
523
- queries_extractor = SnowflakeQueriesExtractor(
524
- connection=self.connection,
525
- config=SnowflakeQueriesExtractorConfig(
526
- window=self.config,
527
- temporary_tables_pattern=self.config.temporary_tables_pattern,
528
- include_lineage=self.config.include_table_lineage,
529
- include_usage_statistics=self.config.include_usage_stats,
530
- include_operations=self.config.include_operational_stats,
531
- user_email_pattern=self.config.user_email_pattern,
532
- ),
533
- structured_report=self.report,
534
- filters=self.filters,
535
- identifiers=self.identifiers,
536
- schema_resolver=schema_resolver,
537
- discovered_tables=discovered_datasets,
538
- graph=self.ctx.graph,
539
- )
519
+ with self.report.new_stage(f"*: {VIEW_PARSING}"):
520
+ yield from auto_workunit(self.aggregator.gen_metadata())
540
521
 
541
- # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
542
- # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
543
- # it should be pretty straightforward to refactor this and only initialize the aggregator once.
544
- self.report.queries_extractor = queries_extractor.report
545
- yield from queries_extractor.get_workunits_internal()
546
- queries_extractor.close()
522
+ with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
523
+ schema_resolver = self.aggregator._schema_resolver
524
+
525
+ queries_extractor = SnowflakeQueriesExtractor(
526
+ connection=self.connection,
527
+ config=SnowflakeQueriesExtractorConfig(
528
+ window=self.config,
529
+ temporary_tables_pattern=self.config.temporary_tables_pattern,
530
+ include_lineage=self.config.include_table_lineage,
531
+ include_usage_statistics=self.config.include_usage_stats,
532
+ include_operations=self.config.include_operational_stats,
533
+ include_queries=self.config.include_queries,
534
+ include_query_usage_statistics=self.config.include_query_usage_statistics,
535
+ user_email_pattern=self.config.user_email_pattern,
536
+ ),
537
+ structured_report=self.report,
538
+ filters=self.filters,
539
+ identifiers=self.identifiers,
540
+ schema_resolver=schema_resolver,
541
+ discovered_tables=discovered_datasets,
542
+ graph=self.ctx.graph,
543
+ )
544
+
545
+ # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
546
+ # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
547
+ # it should be pretty straightforward to refactor this and only initialize the aggregator once.
548
+ self.report.queries_extractor = queries_extractor.report
549
+ yield from queries_extractor.get_workunits_internal()
550
+ queries_extractor.close()
547
551
 
548
552
  else:
549
553
  if self.lineage_extractor:
550
- self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
551
- self.lineage_extractor.add_time_based_lineage_to_aggregator(
552
- discovered_tables=discovered_tables,
553
- discovered_views=discovered_views,
554
- )
554
+ with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
555
+ self.lineage_extractor.add_time_based_lineage_to_aggregator(
556
+ discovered_tables=discovered_tables,
557
+ discovered_views=discovered_views,
558
+ )
555
559
 
556
560
  # This would emit view and external table ddl lineage
557
561
  # as well as query lineage via lineage_extractor
@@ -104,9 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
104
104
  return "\n".join([r for r in res])
105
105
 
106
106
  @typing.no_type_check
107
- def _get_column_type(
108
- self, type_: Union[str, Dict[str, Any]]
109
- ) -> TypeEngine: # noqa: C901
107
+ def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine: # noqa: C901
110
108
  """Derives the data type of the Athena column.
111
109
 
112
110
  This method is overwritten to extend the behavior of PyAthena.