acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +126 -85
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +4 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -74
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -276,6 +276,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         "HLLSKETCH": NullType,
         "TIMETZ": TimeType,
         "VARBYTE": StringType,
+        "SUPER": NullType,
     }

     def get_platform_instance_id(self) -> str:

@@ -304,13 +305,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             test_report.capability_report = {}
             try:
                 RedshiftDataDictionary.get_schemas(connection, database=config.database)
-                test_report.capability_report[
-
-
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=True)
+                )
             except Exception as e:
-                test_report.capability_report[
-
-
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=False, failure_reason=str(e))
+                )

         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(

@@ -423,10 +424,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         database = self.config.database
         logger.info(f"Processing db {database}")
-        self.report.
-
-
-
+        with self.report.new_stage(METADATA_EXTRACTION):
+            self.db_tables[database] = defaultdict()
+            self.db_views[database] = defaultdict()
+            self.db_schemas.setdefault(database, {})

         # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
         # this fallback. For now, this gets us broad coverage quickly.

@@ -462,12 +463,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                     self.process_schemas(connection, database)
                 )

-                self.report.
-
-
-
-
-
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage_v2(
+                        connection=connection,
+                        database=database,
+                        lineage_extractor=lineage_extractor,
+                    )

                 all_tables = self.get_all_tables()
             else:

@@ -480,25 +481,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 or self.config.include_view_lineage
                 or self.config.include_copy_lineage
             ):
-                self.report.
-
-
-
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage(
+                        connection=connection, all_tables=all_tables, database=database
+                    )

-            self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
             if self.config.include_usage_statistics:
-
-
-
+                with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+                    yield from self.extract_usage(
+                        connection=connection, all_tables=all_tables, database=database
+                    )

             if self.config.is_profiling_enabled():
-                self.report.
-
-
-
-
-
-
+                with self.report.new_stage(PROFILING):
+                    profiler = RedshiftProfiler(
+                        config=self.config,
+                        report=self.report,
+                        state_handler=self.profiling_state_handler,
+                    )
+                    yield from profiler.get_workunits(self.db_tables)

     def process_schemas(self, connection, database):
         for schema in self.data_dictionary.get_schemas(

@@ -633,8 +634,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 logger.info("View processing disabled, skipping")

-            self.report.metadata_extraction_sec[report_key] =
-
+            self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+                digits=2
             )

     def _process_table(

@@ -946,9 +947,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     def get_all_tables(
         self,
     ) -> Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]]:
-        all_tables: Dict[
-
-
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] = (
+            defaultdict(dict)
+        )
         for db in set().union(self.db_tables, self.db_views):
             tables = self.db_tables.get(db, {})
             views = self.db_views.get(db, {})

@@ -966,9 +967,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
     ) -> Iterable[MetadataWorkUnit]:
         with PerfTimer() as timer:
-            redundant_usage_run_skip_handler: Optional[
-
-
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
             if self.config.enable_stateful_usage_ingestion:
                 redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                     source=self,

@@ -986,9 +987,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

             yield from usage_extractor.get_usage_workunits(all_tables=all_tables)

-            self.report.usage_extraction_sec[database] =
-                timer.elapsed_seconds(), 2
-            )
+            self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)

     def extract_lineage(
         self,

@@ -1011,8 +1010,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 database=database, connection=connection, all_tables=all_tables
             )

-            self.report.lineage_extraction_sec[f"{database}"] =
-
+            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+                digits=2
             )
             yield from self.generate_lineage(
                 database, lineage_extractor=lineage_extractor

@@ -1042,8 +1041,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         yield from lineage_extractor.generate()

-        self.report.lineage_extraction_sec[f"{database}"] =
-
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
         )

         if self.redundant_lineage_run_skip_handler:

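The hunks above replace the paired report_ingestion_stage_start(...) calls with "with self.report.new_stage(...)" blocks, so each ingestion stage is opened, timed, and closed by a context manager even if the stage body raises. A minimal sketch of how such a new_stage helper could be structured, assuming a hypothetical IngestionStageReport with a durations dict (names and fields here are illustrative, not the actual DataHub implementation):

    import time
    from contextlib import contextmanager
    from typing import Dict, Iterator

    METADATA_EXTRACTION = "Metadata Extraction"   # illustrative stage names
    LINEAGE_EXTRACTION = "Lineage Extraction"


    class IngestionStageReport:
        """Hypothetical report object: records how long each named stage took."""

        def __init__(self) -> None:
            self.stage_durations: Dict[str, float] = {}

        @contextmanager
        def new_stage(self, stage: str) -> Iterator[None]:
            start = time.perf_counter()
            try:
                yield  # the caller's stage body runs here
            finally:
                # the stage is always closed, even if the body raised
                self.stage_durations[stage] = round(time.perf_counter() - start, 2)


    report = IngestionStageReport()
    with report.new_stage(METADATA_EXTRACTION):
        pass  # extract schemas, tables, views ...
    with report.new_stage(LINEAGE_EXTRACTION):
        pass  # extract lineage ...
    print(report.stage_durations)

Compared with explicit start calls, the context manager guarantees a matching "stage end" in the presence of early returns and exceptions, which is what the rewritten Redshift code relies on.
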
@@ -182,38 +182,38 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0

         if self.config.include_operational_stats:
-            self.report.
-
-
-
-
-
-
-
-
+            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+                with PerfTimer() as timer:
+                    # Generate operation aspect workunits
+                    yield from self._gen_operation_aspect_workunits(
+                        self.connection, all_tables
+                    )
+                    self.report.operational_metadata_extraction_sec[
+                        self.config.database
+                    ] = timer.elapsed_seconds(digits=2)

         # Generate aggregate events
-        self.report.
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+            query: str = self.queries.usage_query(
+                start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                database=self.config.database,
+            )
+            access_events_iterable: Iterable[RedshiftAccessEvent] = (
+                self._gen_access_events_from_history_query(
+                    query, connection=self.connection, all_tables=all_tables
+                )
+            )

-
-
-
-
-
-
-
-
-
+            aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+                access_events_iterable
+            )
+            # Generate usage workunits from aggregated events.
+            for time_bucket in aggregated_events.values():
+                for aggregate in time_bucket.values():
+                    wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+                    self.report.num_usage_workunits_emitted += 1
+                    yield wu

     def _gen_operation_aspect_workunits(
         self,

@@ -225,10 +225,10 @@ class RedshiftUsageExtractor:
             start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
             end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
         )
-        access_events_iterable: Iterable[
-
-
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection, all_tables=all_tables
+            )
         )

         # Generate operation aspect work units from the access events

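Several of these hunks also switch elapsed-time reporting to timer.elapsed_seconds(digits=2), moving the rounding into the timer instead of a round(...) call at every call site. A minimal stand-in for such a timer, under the assumption that it is a plain context manager (illustrative only, not the actual datahub.utilities.perf_timer.PerfTimer):

    import time
    from typing import Optional


    class SimplePerfTimer:
        """Illustrative timer: measures a with-block and rounds on request."""

        def __enter__(self) -> "SimplePerfTimer":
            self._start = time.perf_counter()
            self._end: Optional[float] = None
            return self

        def __exit__(self, *exc) -> bool:
            self._end = time.perf_counter()
            return False  # never swallow exceptions

        def elapsed_seconds(self, digits: int = 2) -> float:
            # if called inside the with-block, report the running total
            end = self._end if self._end is not None else time.perf_counter()
            return round(end - self._start, digits)


    with SimplePerfTimer() as timer:
        sum(range(1_000_000))  # stand-in for the real extraction work
    print(timer.elapsed_seconds(digits=2))
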
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
             if field_level_metric.startswith("include_field_"):
                 values.setdefault(field_level_metric, False)

-        assert (
-
-        )
+        assert max_num_fields_to_profile is None, (
+            f"{max_num_fields_to_profile_key} should be set to None"
+        )

         return values

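This hunk, and many of the hunks below, only reflow assert statements so that the condition stays on the assert line and the message sits in its own parenthesized block, which is the layout newer formatters prefer. Behaviour is unchanged; the two layouts below illustrate the style difference (the variable values are made up for this sketch):

    max_num_fields_to_profile = None
    max_num_fields_to_profile_key = "max_number_of_fields_to_profile"  # placeholder value

    # old-style layout: condition and message wrapped together
    assert (
        max_num_fields_to_profile is None
    ), f"{max_num_fields_to_profile_key} should be set to None"

    # new-style layout: condition first, message in its own parentheses
    assert max_num_fields_to_profile is None, (
        f"{max_num_fields_to_profile_key} should be set to None"
    )
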
@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse

 import smart_open.compression as so_compression

@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator

@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer

+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)

@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket:
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """

@@ -857,22 +860,15 @@ class S3Source(StatefulIngestionSourceBase):

         Parameters:
             path_spec (PathSpec): The path specification used to determine partitioning.
-            bucket (
+            bucket (Bucket): The S3 bucket object.
             prefix (str): The prefix path in the S3 bucket to list objects from.

         Returns:
             List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None

@@ -904,7 +900,7 @@ class S3Source(StatefulIngestionSourceBase):
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,

@@ -1128,7 +1124,7 @@ class S3Source(StatefulIngestionSourceBase):
                     table_data.table_path
                 ].timestamp = table_data.timestamp

-        for
+        for _, table_data in table_dict.items():
             yield from self.ingest_table(table_data, path_spec)

         if not self.source_config.is_profiling_enabled():

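get_folder_info now delegates its sort-and-group logic to a shared group_s3_objects_by_dirname helper from s3_util (imported in the hunk above) instead of an inline sorted(...) plus itertools.groupby. The helper's implementation is not shown in this diff; a plausible sketch that groups objects by the directory portion of their key might look like this (illustrative only, with a stand-in object type):

    from collections import defaultdict
    from dataclasses import dataclass
    from typing import Dict, Iterable, List


    @dataclass
    class _Obj:
        """Stand-in for an S3 ObjectSummary: only the .key attribute is used here."""
        key: str


    def group_s3_objects_by_dirname(s3_objects: Iterable[_Obj]) -> Dict[str, List[_Obj]]:
        """Group objects by the 'directory' part of their key (everything before the last '/')."""
        grouped: Dict[str, List[_Obj]] = defaultdict(list)
        for obj in s3_objects:
            dirname = obj.key.rsplit("/", 1)[0] if "/" in obj.key else "/"
            grouped[dirname].append(obj)
        return grouped


    objs = [
        _Obj("data/year=2024/part-0.parquet"),
        _Obj("data/year=2024/part-1.parquet"),
        _Obj("data/year=2025/part-0.parquet"),
    ]
    print({k: len(v) for k, v in group_s3_objects_by_dirname(objs).items()})
    # {'data/year=2024': 2, 'data/year=2025': 1}

Unlike groupby, a dict-based grouping does not require the listing to be pre-sorted by key, which is consistent with the removal of the sorted(...) call in the hunk above.
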
@@ -236,12 +236,12 @@ class SalesforceSource(Source):
         try:
             if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
                 logger.debug("Access Token Provided in Config")
-                assert (
-
-                )
-                assert (
-
-                )
+                assert self.config.access_token is not None, (
+                    "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
+                )
+                assert self.config.instance_url is not None, (
+                    "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                )

                 self.sf = Salesforce(
                     instance_url=self.config.instance_url,

@@ -250,15 +250,15 @@ class SalesforceSource(Source):
                 )
             elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
                 logger.debug("Username/Password Provided in Config")
-                assert (
-
-                )
-                assert (
-
-                )
-                assert (
-
-                )
+                assert self.config.username is not None, (
+                    "Config username is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.password is not None, (
+                    "Config password is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.security_token is not None, (
+                    "Config security_token is required for USERNAME_PASSWORD auth"
+                )

                 self.sf = Salesforce(
                     username=self.config.username,

@@ -269,15 +269,15 @@ class SalesforceSource(Source):

             elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
                 logger.debug("Json Web Token provided in the config")
-                assert (
-
-                )
-                assert (
-
-                )
-                assert (
-
-                )
+                assert self.config.username is not None, (
+                    "Config username is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.consumer_key is not None, (
+                    "Config consumer_key is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.private_key is not None, (
+                    "Config private_key is required for JSON_WEB_TOKEN auth"
+                )

                 self.sf = Salesforce(
                     username=self.config.username,

@@ -439,7 +439,8 @@ class SalesforceSource(Source):
         dataPlatformInstance = DataPlatformInstanceClass(
             builder.make_data_platform_urn(self.platform),
             instance=builder.make_dataplatform_instance_urn(
-                self.platform,
+                self.platform,
+                self.config.platform_instance,  # type:ignore
             ),
         )

@@ -354,7 +354,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
         browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"

         if os.path.isdir(self.config.path):
-            for root,
+            for root, _, files in os.walk(self.config.path, topdown=False):
                 for file_name in [f for f in files if f.endswith(".json")]:
                     try:
                         yield from self._load_one_file(

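The reconstructed loop above walks a directory tree and loads every .json file it finds. For reference, the pattern is plain os.walk, which yields a (dirpath, dirnames, filenames) triple per directory; a small stand-alone helper using the same shape (the directory name is hypothetical):

    import os
    from typing import Iterator


    def iter_json_files(path: str) -> Iterator[str]:
        """Yield the full path of every .json file under `path` (illustrative helper)."""
        for root, _, files in os.walk(path, topdown=False):
            for file_name in (f for f in files if f.endswith(".json")):
                yield os.path.join(root, file_name)


    for schema_file in iter_json_files("./schemas"):  # hypothetical directory
        print(schema_file)
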
@@ -477,9 +477,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 upstream_dataset_urns
                 and dataset_urn not in self.dataset_upstream_urn_mapping
             ):
-                self.dataset_upstream_urn_mapping[
-
-
+                self.dataset_upstream_urn_mapping[dataset_urn] = (
+                    upstream_dataset_urns
+                )

             element_input_fields = [
                 InputFieldClass(

@@ -126,9 +126,9 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for workspace_dict in response_dict[Constant.ENTRIES]:
-                self.workspaces[
-                    workspace_dict
-
+                self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
+                    Workspace.parse_obj(workspace_dict)
+                )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
             else:

@@ -147,9 +147,9 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for user_dict in response_dict[Constant.ENTRIES]:
-                users[
-                    user_dict[Constant.
-
+                users[user_dict[Constant.MEMBERID]] = (
+                    f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+                )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{members_url}&page={response_dict[Constant.NEXTPAGE]}"
             else:

@@ -327,10 +327,12 @@ class SigmaAPI:
             response.raise_for_status()
             for i, element_dict in enumerate(response.json()[Constant.ENTRIES]):
                 if not element_dict.get(Constant.NAME):
-                    element_dict[Constant.NAME] =
-
-
-
+                    element_dict[Constant.NAME] = (
+                        f"Element {i + 1} of Page '{page.name}'"
+                    )
+                element_dict[Constant.URL] = (
+                    f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                )
                 element = Element.parse_obj(element_dict)
                 if (
                     self.config.extract_lineage

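The SigmaAPI hunks above all follow the same cursor-style pagination contract: each response carries a list of entries plus a next-page token, and the client keeps requesting the same endpoint with &page=<token> until the token is empty. A generic sketch of that loop, with a hypothetical endpoint and assumed response keys (not the actual SigmaAPI client):

    from typing import Any, Dict, Iterator

    import requests


    def iter_paginated_entries(session: requests.Session, base_url: str) -> Iterator[Dict[str, Any]]:
        """Yield entries from a cursor-paginated endpoint; 'entries'/'nextPage' keys are assumed."""
        url = f"{base_url}?limit=50"
        while True:
            response = session.get(url)
            response.raise_for_status()
            payload = response.json()
            yield from payload["entries"]
            next_page = payload.get("nextPage")
            if not next_page:
                break
            url = f"{base_url}?limit=50&page={next_page}"
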
@@ -221,6 +221,14 @@ class SnowflakeV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     lazy_schema_resolver: bool = Field(
         default=True,

@@ -236,6 +244,11 @@ class SnowflakeV2Config(
         description="""Optional. Allowed values are `without_lineage`, `with_lineage`, and `skip` (default). `without_lineage` only extracts tags that have been applied directly to the given entity. `with_lineage` extracts both directly applied and propagated tags, but will be significantly slower. See the [Snowflake documentation](https://docs.snowflake.com/en/user-guide/object-tagging.html#tag-lineage) for information about tag lineage/propagation. """,
     )

+    extract_tags_as_structured_properties: bool = Field(
+        default=False,
+        description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",

@@ -255,6 +268,14 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )

+    structured_property_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description=(
+            "List of regex patterns for structured properties to include in ingestion."
+            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
+        ),
+    )
+
     # This is required since access_history table does not capture whether the table was temporary table.
     temporary_tables_pattern: List[str] = Field(
         default=DEFAULT_TEMP_TABLES_PATTERNS,

@@ -363,18 +384,20 @@ class SnowflakeV2Config(
             assert all(
                 consumer.platform_instance != share_details.platform_instance
                 for consumer in share_details.consumers
-            ),
+            ), (
+                "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            )

             databases_included_in_share.append(shared_db)
             databases_created_from_share.extend(share_details.consumers)

         for db_from_share in databases_created_from_share:
-            assert (
-
-            )
-            assert (
-
-            )
+            assert db_from_share not in databases_included_in_share, (
+                "Database included in a share can not be present as consumer in any share."
+            )
+            assert databases_created_from_share.count(db_from_share) == 1, (
+                "Same database can not be present as consumer in more than one share."
+            )

         return shares

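The new Snowflake options (include_queries, include_query_usage_statistics, extract_tags_as_structured_properties, structured_property_pattern) are declared with the same pydantic Field(default=..., description=...) pattern used throughout the config class. A minimal self-contained sketch of that pattern (a toy model for illustration, not the real SnowflakeV2Config, which has many more fields):

    from pydantic import BaseModel, Field


    class ToySnowflakeConfig(BaseModel):
        """Toy config illustrating the flag declarations added in this release."""

        use_queries_v2: bool = Field(
            default=False,
            description="If enabled, uses the new queries extractor.",
        )
        include_queries: bool = Field(
            default=True,
            description="Generate query entities for lineage edges (only with use_queries_v2).",
        )
        include_query_usage_statistics: bool = Field(
            default=True,
            description="Generate query popularity statistics (only with use_queries_v2).",
        )
        extract_tags_as_structured_properties: bool = Field(
            default=False,
            description="Emit Snowflake key-value tags as structured properties instead of tags.",
        )


    config = ToySnowflakeConfig(use_queries_v2=True, include_query_usage_statistics=False)
    print(config.dict())
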
@@ -250,9 +250,9 @@ class SnowflakeConnectionConfig(ConfigModel):
         if self.private_key is not None:
             pkey_bytes = self.private_key.replace("\\n", "\n").encode()
         else:
-            assert (
-
-            )
+            assert self.private_key_path, (
+                "missing required private key path to read key from"
+            )
             with open(self.private_key_path, "rb") as key:
                 pkey_bytes = key.read()

@@ -284,9 +284,9 @@ class SnowflakeConnectionConfig(ConfigModel):
         return self.options

     def get_oauth_connection(self) -> NativeSnowflakeConnection:
-        assert (
-
-        )
+        assert self.oauth_config, (
+            "oauth_config should be provided if using oauth based authentication"
+        )
         generator = OAuthTokenGenerator(
             client_id=self.oauth_config.client_id,
             authority_url=self.oauth_config.authority_url,

@@ -623,7 +623,7 @@ fingerprinted_queries as (
         query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
         AND execution_status = 'SUCCESS'
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
     )
 , deduplicated_queries as (
     SELECT

@@ -651,7 +651,7 @@ fingerprinted_queries as (
     WHERE
         query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )

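The two SQL hunks insert {users_filter or "TRUE"} into the query template, so an empty or missing user filter degrades to a predicate that matches every row instead of producing an incomplete WHERE clause. A tiny illustration of that substitution (the surrounding clause and filter value are made up for the sketch):

    def build_where_clause(users_filter: str = "") -> str:
        # falls back to the always-true predicate when no filter is configured
        return f"execution_status = 'SUCCESS' AND {users_filter or 'TRUE'}"


    print(build_where_clause())
    # execution_status = 'SUCCESS' AND TRUE
    print(build_where_clause("user_name NOT IN ('SYSTEM')"))
    # execution_status = 'SUCCESS' AND user_name NOT IN ('SYSTEM')
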
@@ -166,6 +166,3 @@ class SnowflakeV2Report(

     def report_tag_processed(self, tag_name: str) -> None:
         self._processed_tags.add(tag_name)
-
-    def set_ingestion_stage(self, database: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{database}: {stage}")

@@ -45,15 +45,18 @@ class SnowflakeTag:
     name: str
     value: str

-    def
+    def tag_display_name(self) -> str:
         return f"{self.name}: {self.value}"

-    def
+    def tag_identifier(self) -> str:
         return f"{self._id_prefix_as_str()}:{self.value}"

     def _id_prefix_as_str(self) -> str:
         return f"{self.database}.{self.schema}.{self.name}"

+    def structured_property_identifier(self) -> str:
+        return f"snowflake.{self.database}.{self.schema}.{self.name}"
+

 @dataclass
 class SnowflakeColumn(BaseColumn):

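The SnowflakeTag hunk adds a structured_property_identifier alongside the existing display-name and identifier helpers, matching the new extract_tags_as_structured_properties option. A self-contained rendering of the three formats, using a minimal dataclass copied from the shapes shown above (the real class carries more fields):

    from dataclasses import dataclass


    @dataclass
    class SnowflakeTagExample:
        database: str
        schema: str
        name: str
        value: str

        def tag_display_name(self) -> str:
            return f"{self.name}: {self.value}"

        def _id_prefix_as_str(self) -> str:
            return f"{self.database}.{self.schema}.{self.name}"

        def tag_identifier(self) -> str:
            return f"{self._id_prefix_as_str()}:{self.value}"

        def structured_property_identifier(self) -> str:
            return f"snowflake.{self.database}.{self.schema}.{self.name}"


    tag = SnowflakeTagExample("ANALYTICS", "PUBLIC", "pii_level", "high")
    print(tag.tag_display_name())                # pii_level: high
    print(tag.tag_identifier())                  # ANALYTICS.PUBLIC.pii_level:high
    print(tag.structured_property_identifier())  # snowflake.ANALYTICS.PUBLIC.pii_level
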
@@ -139,9 +142,9 @@ class _SnowflakeTagCache:
         )

         # self._table_tags[<database_name>][<schema_name>][<table_name>] = list of tags applied to table
-        self._table_tags: Dict[
-
-
+        self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+        )

         # self._column_tags[<database_name>][<schema_name>][<table_name>][<column_name>] = list of tags applied to column
         self._column_tags: Dict[