acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +126 -85
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +4 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -74
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/run/pipeline.py
@@ -76,8 +76,9 @@ class LoggingCallback(WriteCallback):
         failure_metadata: dict,
     ) -> None:
         logger.error(
-            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}"
-
+            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
+            extra={"failure_metadata": failure_metadata},
+            exc_info=failure_exception,
         )
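The `LoggingCallback` change above stops folding the failure details into the message string and instead passes them to the logging call itself: with the standard-library logger, `exc_info` accepts an exception instance and attaches its traceback, while `extra` exposes structured fields to handlers and formatters. A minimal standalone sketch of the same call pattern (the function and names here are illustrative, not DataHub APIs):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def report_write_failure(workunit_id: str, exc: Exception, metadata: dict) -> None:
    # exc_info=exc attaches the exception's traceback to the log record;
    # extra={...} makes failure_metadata available as an attribute on the record.
    logger.error(
        f"failed to write record with workunit {workunit_id}",
        extra={"failure_metadata": metadata},
        exc_info=exc,
    )


report_write_failure("wu-1", ValueError("boom"), {"attempt": 1})
```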
datahub/ingestion/run/pipeline.py
@@ -108,9 +109,9 @@ class DeadLetterQueueCallback(WriteCallback):
                 mcp.systemMetadata.properties = {}
             if "workunit_id" not in mcp.systemMetadata.properties:
                 # update the workunit id
-                mcp.systemMetadata.properties[
-                    "workunit_id"
-                ] = record_envelope.metadata["workunit_id"]
+                mcp.systemMetadata.properties["workunit_id"] = (
+                    record_envelope.metadata["workunit_id"]
+                )
             record_envelope.record = mcp
         self.file_sink.write_record_async(record_envelope, self.logging_callback)

@@ -700,7 +701,7 @@ class Pipeline:
         num_failures_sink = len(self.sink.get_report().failures)
         click.secho(
             message_template.format(
-                status=f"with at least {num_failures_source+num_failures_sink} failures"
+                status=f"with at least {num_failures_source + num_failures_sink} failures"
             ),
             fg=self._get_text_color(
                 running=currently_running, failures=True, warnings=False

@@ -718,7 +719,7 @@ class Pipeline:
         num_warn_global = len(global_warnings)
         click.secho(
             message_template.format(
-                status=f"with at least {num_warn_source+num_warn_sink+num_warn_global} warnings"
+                status=f"with at least {num_warn_source + num_warn_sink + num_warn_global} warnings"
             ),
             fg=self._get_text_color(
                 running=currently_running, failures=False, warnings=True
datahub/ingestion/run/pipeline_config.py
@@ -92,9 +92,9 @@ class PipelineConfig(ConfigModel):
     pipeline_name: Optional[str] = None
     failure_log: FailureLoggingConfig = FailureLoggingConfig()

-    _raw_dict: Optional[
-        dict
-    ] = None  # the raw dict that was parsed to construct this config
+    _raw_dict: Optional[dict] = (
+        None  # the raw dict that was parsed to construct this config
+    )

     @validator("run_id", pre=True, always=True)
     def run_id_should_be_semantic(
datahub/ingestion/source/abs/datalake_profiler_config.py
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
             if field_level_metric.startswith("include_field_"):
                 values.setdefault(field_level_metric, False)

-        assert (
-            max_num_fields_to_profile is None
-        )
+        assert max_num_fields_to_profile is None, (
+            f"{max_num_fields_to_profile_key} should be set to None"
+        )

         return values
datahub/ingestion/source/abs/source.py
@@ -508,7 +508,12 @@ class ABSSource(StatefulIngestionSourceBase):
                 ):
                     abs_path = self.create_abs_path(obj.name)
                     logger.debug(f"Sampling file: {abs_path}")
-                    yield
+                    yield (
+                        abs_path,
+                        obj.name,
+                        obj.last_modified,
+                        obj.size,
+                    )
         except Exception as e:
             # This odd check if being done because boto does not have a proper exception to catch
             # The exception that appears in stacktrace cannot actually be caught without a lot more work

@@ -552,9 +557,12 @@ class ABSSource(StatefulIngestionSourceBase):
         if os.path.isfile(prefix):
             logger.debug(f"Scanning single local file: {prefix}")
             file_name = prefix
-            yield
-
-
+            yield (
+                prefix,
+                file_name,
+                datetime.utcfromtimestamp(os.path.getmtime(prefix)),
+                os.path.getsize(prefix),
+            )
         else:
             logger.debug(f"Scanning files under local folder: {prefix}")
             for root, dirs, files in os.walk(prefix):

@@ -565,9 +573,12 @@ class ABSSource(StatefulIngestionSourceBase):
                     full_path = PurePath(
                         os.path.normpath(os.path.join(root, file))
                     ).as_posix()
-                    yield
-
-
+                    yield (
+                        full_path,
+                        file,
+                        datetime.utcfromtimestamp(os.path.getmtime(full_path)),
+                        os.path.getsize(full_path),
+                    )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(

@@ -613,7 +624,7 @@ class ABSSource(StatefulIngestionSourceBase):
                     table_data.table_path
                 ].timestamp = table_data.timestamp

-            for
+            for _, table_data in table_dict.items():
                 yield from self.ingest_table(table_data, path_spec)

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
datahub/ingestion/source/aws/glue.py
@@ -248,6 +248,9 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
     "Enabled by default when stateful ingestion is turned on.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(
+    SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
+)
 class GlueSource(StatefulIngestionSourceBase):
     """
     Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.

@@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase):
             "Action": [
                 "glue:GetDataflowGraph",
                 "glue:GetJobs",
+                "s3:GetObject",
             ],
             "Resource": "*"
         }
     ```

-
+    For profiling datasets, the following additional permissions are required:
+    ```json
+    {
+        "Effect": "Allow",
+        "Action": [
+            "glue:GetPartitions",
+        ],
+        "Resource": "*"
+    }
+    ```

     """
@@ -508,7 +521,7 @@ class GlueSource(StatefulIngestionSourceBase):
         # otherwise, a node represents a transformation
         else:
             node_urn = mce_builder.make_data_job_urn_with_flow(
-                flow_urn, job_id=f
+                flow_urn, job_id=f"{node['NodeType']}-{node['Id']}"
             )

         return {

@@ -666,7 +679,7 @@ class GlueSource(StatefulIngestionSourceBase):
             )
         )

-        return MetadataWorkUnit(id=f
+        return MetadataWorkUnit(id=f"{job_name}-{node['Id']}", mce=mce)

     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
         logger.debug("Getting all databases")

@@ -737,13 +750,13 @@ class GlueSource(StatefulIngestionSourceBase):
     ) -> Optional[MetadataWorkUnit]:
         if self.source_config.emit_s3_lineage:
             # extract dataset properties aspect
-            dataset_properties: Optional[
-                DatasetPropertiesClass
-            ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+            dataset_properties: Optional[DatasetPropertiesClass] = (
+                mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+            )
             # extract dataset schema aspect
-            schema_metadata: Optional[
-                SchemaMetadataClass
-            ] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+            schema_metadata: Optional[SchemaMetadataClass] = (
+                mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+            )

             if dataset_properties and "Location" in dataset_properties.customProperties:
                 location = dataset_properties.customProperties["Location"]

@@ -752,9 +765,9 @@ class GlueSource(StatefulIngestionSourceBase):
                     location, self.source_config.env
                 )
                 assert self.ctx.graph
-                schema_metadata_for_s3: Optional[
-                    SchemaMetadataClass
-                ] = self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+                schema_metadata_for_s3: Optional[SchemaMetadataClass] = (
+                    self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+                )

                 if self.source_config.glue_s3_lineage_direction == "upstream":
                     fine_grained_lineages = None
@@ -1054,49 +1067,66 @@ class GlueSource(StatefulIngestionSourceBase):
             yield from self.gen_database_containers(database)

         for table in tables:
-            database_name = table["DatabaseName"]
             table_name = table["Name"]
-
-
-
-
-
-
-
+            try:
+                yield from self._gen_table_wu(table=table)
+            except KeyError as e:
+                self.report.report_failure(
+                    message="Failed to extract workunit for table",
+                    context=f"Table: {table_name}",
+                    exc=e,
+                )
+        if self.extract_transforms:
+            yield from self._transform_extraction()

-
-
-
-
-
-
+    def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+        database_name = table["DatabaseName"]
+        table_name = table["Name"]
+        full_table_name = f"{database_name}.{table_name}"
+        self.report.report_table_scanned()
+        if not self.source_config.database_pattern.allowed(
+            database_name
+        ) or not self.source_config.table_pattern.allowed(full_table_name):
+            self.report.report_table_dropped(full_table_name)
+            return
+
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=full_table_name,
+            env=self.env,
+            platform_instance=self.source_config.platform_instance,
+        )

-
-
+        mce = self._extract_record(dataset_urn, table, full_table_name)
+        yield MetadataWorkUnit(full_table_name, mce=mce)

-
-
-
-
-
-
+        # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+        # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+        ).as_workunit()

-
-
-
-
-
-
-
+        yield from self._get_domain_wu(
+            dataset_name=full_table_name,
+            entity_urn=dataset_urn,
+        )
+        yield from self.add_table_to_database_container(
+            dataset_urn=dataset_urn, db_name=database_name
+        )

-
-
-
+        wu = self.get_lineage_if_enabled(mce)
+        if wu:
+            yield wu

+        try:
             yield from self.get_profile_if_enabled(mce, database_name, table_name)
-
-
-
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
+            )

     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
datahub/ingestion/source/aws/s3_boto_utils.py
@@ -40,7 +40,7 @@ def get_s3_tags(
             ]
         )
     except s3.meta.client.exceptions.ClientError:
-        logger.
+        logger.warning(f"No tags found for bucket={bucket_name}")

     if use_s3_object_tags and key_name is not None:
         s3_client = aws_config.get_s3_client()

@@ -53,7 +53,7 @@ def get_s3_tags(
         else:
             # Unlike bucket tags, if an object does not have tags, it will just return an empty array
             # as opposed to an exception.
-            logger.
+            logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
     if len(tags_to_add) == 0:
         return None
     if ctx.graph is not None:

@@ -65,7 +65,7 @@ def get_s3_tags(
         if current_tags:
             tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
         else:
-            logger.
+            logger.warning("Could not connect to DatahubApi. No current tags to maintain")
     # Remove duplicate tags
     tags_to_add = sorted(list(set(tags_to_add)))
     new_tags = GlobalTagsClass(
datahub/ingestion/source/aws/s3_util.py
@@ -1,6 +1,11 @@
 import logging
 import os
-from
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
+
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+

 S3_PREFIXES = ["s3://", "s3n://", "s3a://"]

@@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str:
             f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
         )
     return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
+
+
+def group_s3_objects_by_dirname(
+    s3_objects: Iterable["ObjectSummary"],
+) -> Dict[str, List["ObjectSummary"]]:
+    """
+    Groups S3 objects by their directory name.
+
+    If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'.
+    """
+    grouped_s3_objs = defaultdict(list)
+    for obj in s3_objects:
+        if "/" in obj.key:
+            dirname = obj.key.rsplit("/", 1)[0]
+        else:
+            dirname = "/"
+        grouped_s3_objs[dirname].append(obj)
+    return grouped_s3_objs
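Because the new `group_s3_objects_by_dirname` helper above is self-contained, a brief usage sketch may help; the bucket name and prefix are placeholders, and the import path follows the file list at the top of this diff:

```python
import boto3

from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname

# List objects lazily and bucket each ObjectSummary by its "directory" part.
s3 = boto3.resource("s3")
objects = s3.Bucket("my-bucket").objects.filter(Prefix="data/")

for dirname, objs in group_s3_objects_by_dirname(objects).items():
    print(f"{dirname}: {len(objs)} objects")
```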
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -257,7 +257,7 @@ class FeatureGroupProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

         return MetadataWorkUnit(
-            id=f
+            id=f"{feature_group_details['FeatureGroupName']}-{feature['FeatureName']}",
             mce=mce,
         )

datahub/ingestion/source/aws/sagemaker_processors/models.py
@@ -212,7 +212,7 @@ class ModelProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=endpoint_snapshot)

         return MetadataWorkUnit(
-            id=f
+            id=f"{endpoint_details['EndpointName']}",
             mce=mce,
         )

@@ -503,7 +503,7 @@ class ModelProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=model_snapshot)

         return MetadataWorkUnit(
-            id=f
+            id=f"{model_details['ModelName']}",
             mce=mce,
         )
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -132,9 +132,9 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         self.filters = BigQueryFilter(self.config, self.report)
         self.identifiers = BigQueryIdentifierBuilder(self.config, self.report)

-        redundant_lineage_run_skip_handler: Optional[
-            RedundantLineageRunSkipHandler
-        ] = None
+        redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
+            None
+        )
         if self.config.enable_stateful_lineage_ingestion:
             redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
                 source=self,
@@ -253,14 +253,14 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         for project in projects:
             yield from self.bq_schema_extractor.get_project_workunits(project)

-        self.report.
-
-
-
-
-
-
+        with self.report.new_stage("*: View and Snapshot Lineage"):
+            yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+                [p.id for p in projects],
+                self.bq_schema_extractor.view_refs_by_project,
+                self.bq_schema_extractor.view_definitions,
+                self.bq_schema_extractor.snapshot_refs_by_project,
+                self.bq_schema_extractor.snapshots_by_ref,
+            )

         if self.config.use_queries_v2:
             # if both usage and lineage are disabled then skip queries extractor piece
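The hunk above swaps the old `self.report.set_ingestion_stage(...)` call for a `with self.report.new_stage(...):` block (the old helper is removed from `BigQueryV2Report` at the end of this diff), so a stage is always closed even if the wrapped extraction raises. A minimal sketch of that context-manager pattern, assuming a stage only needs start/finish bookkeeping; this is an illustration, not the actual `ingestion_stage.py` implementation:

```python
import time
from contextlib import contextmanager


@contextmanager
def new_stage(stage: str):
    # Record the stage start, then always report its duration on exit,
    # even if the body raises.
    start = time.perf_counter()
    print(f"Stage started: {stage}")
    try:
        yield
    finally:
        print(f"Stage finished: {stage} ({time.perf_counter() - start:.2f}s)")


# Usage mirroring the call site above:
with new_stage("*: View and Snapshot Lineage"):
    pass  # run the lineage extraction here
```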
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -270,29 +270,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return

-            self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                with BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor:
+                    self.report.queries_extractor = queries_extractor.report
+                    yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
datahub/ingestion/source/bigquery_v2/bigquery_audit.py
@@ -37,9 +37,9 @@ class BigqueryTableIdentifier:

     # Note: this regex may get overwritten by the sharded_table_pattern config.
     # The class-level constant, however, will not be overwritten.
-    _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[
-        str
-    ] = _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+    _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = (
+        _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+    )
     _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$"
     _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd"
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel):
     @root_validator(skip_on_failure=True)
     def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         if values.get("client_x509_cert_url") is None:
-            values[
-                "client_x509_cert_url"
-            ] = f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            values["client_x509_cert_url"] = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            )
         return values

     def create_credential_temp_file(self) -> str:
@@ -447,6 +447,14 @@ class BigQueryV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     @property
     def have_table_data_read_permission(self) -> bool:
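Both new flags default to `True` and only take effect when `use_queries_v2` is enabled; they are forwarded to `BigQueryQueriesExtractorConfig` in the `bigquery.py` hunk above. A hypothetical source-config fragment toggling them, written as the Python dict an ingestion recipe would deserialize into (the `project_ids` value and the surrounding recipe are placeholders):

```python
# Field names come from the diff above; the values are illustrative only.
bigquery_source_config = {
    "project_ids": ["my-gcp-project"],        # placeholder project
    "use_queries_v2": True,                   # required for the two flags below to matter
    "include_queries": True,                  # emit query entities on lineage edges
    "include_query_usage_statistics": False,  # skip query popularity statistics
}
```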
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -603,9 +611,9 @@ class BigQueryV2Config(
         cls, v: Optional[List[str]], values: Dict
     ) -> Optional[List[str]]:
         if values.get("use_exported_bigquery_audit_metadata"):
-            assert (
-                v and len(v) > 0
-            )
+            assert v and len(v) > 0, (
+                "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+            )

         return v
datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py
@@ -87,9 +87,9 @@ class BigQueryPlatformResourceHelper:
             key=platform_resource_key, graph_client=self.graph
         )
         if platform_resource:
-            self.platform_resource_cache[
-                platform_resource_key.primary_key
-            ] = platform_resource
+            self.platform_resource_cache[platform_resource_key.primary_key] = (
+                platform_resource
+            )
             return platform_resource
         return None

@@ -115,7 +115,11 @@ class BigQueryPlatformResourceHelper:
             and platform_resource.resource_info.value
         ):
             try:
-                existing_info: Optional[BigQueryLabelInfo] =
+                existing_info: Optional[BigQueryLabelInfo] = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
+                )
             except ValidationError as e:
                 logger.error(
                     f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change."
datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -190,6 +190,3 @@ class BigQueryV2Report(
     num_skipped_external_table_lineage: int = 0

     queries_extractor: Optional[BigQueryQueriesExtractorReport] = None
-
-    def set_ingestion_stage(self, project_id: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{project_id}: {stage}")