acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff shows the changes between these two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (214)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
  3. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +141 -93
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  30. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  31. datahub/ingestion/api/report.py +1 -2
  32. datahub/ingestion/api/source.py +8 -2
  33. datahub/ingestion/api/source_helpers.py +1 -1
  34. datahub/ingestion/extractor/json_schema_util.py +3 -3
  35. datahub/ingestion/extractor/schema_util.py +3 -5
  36. datahub/ingestion/fs/s3_fs.py +3 -3
  37. datahub/ingestion/glossary/classifier.py +2 -3
  38. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  39. datahub/ingestion/graph/client.py +22 -19
  40. datahub/ingestion/graph/config.py +1 -1
  41. datahub/ingestion/run/pipeline.py +8 -7
  42. datahub/ingestion/run/pipeline_config.py +3 -3
  43. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  44. datahub/ingestion/source/abs/source.py +19 -8
  45. datahub/ingestion/source/aws/glue.py +77 -47
  46. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  47. datahub/ingestion/source/aws/s3_util.py +24 -1
  48. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  49. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  50. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  51. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  53. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  54. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  55. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  56. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  57. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  58. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  59. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  60. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  61. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  62. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  63. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  64. datahub/ingestion/source/csv_enricher.py +29 -29
  65. datahub/ingestion/source/datahub/config.py +20 -0
  66. datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
  67. datahub/ingestion/source/datahub/datahub_source.py +13 -3
  68. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  69. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  70. datahub/ingestion/source/delta_lake/source.py +0 -5
  71. datahub/ingestion/source/demo_data.py +1 -1
  72. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  73. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  74. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  75. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  76. datahub/ingestion/source/elastic_search.py +4 -4
  77. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  78. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  79. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  80. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  81. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  82. datahub/ingestion/source/ge_data_profiler.py +2 -5
  83. datahub/ingestion/source/ge_profiling_config.py +3 -3
  84. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  85. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  86. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +3 -3
  88. datahub/ingestion/source/identity/okta.py +3 -3
  89. datahub/ingestion/source/kafka/kafka.py +11 -9
  90. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  93. datahub/ingestion/source/looker/looker_common.py +19 -19
  94. datahub/ingestion/source/looker/looker_config.py +11 -6
  95. datahub/ingestion/source/looker/looker_source.py +25 -25
  96. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  97. datahub/ingestion/source/looker/looker_usage.py +5 -7
  98. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  99. datahub/ingestion/source/looker/lookml_source.py +13 -15
  100. datahub/ingestion/source/looker/view_upstream.py +5 -5
  101. datahub/ingestion/source/metabase.py +1 -6
  102. datahub/ingestion/source/mlflow.py +4 -9
  103. datahub/ingestion/source/mode.py +5 -5
  104. datahub/ingestion/source/mongodb.py +6 -4
  105. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  106. datahub/ingestion/source/nifi.py +24 -31
  107. datahub/ingestion/source/openapi.py +9 -9
  108. datahub/ingestion/source/powerbi/config.py +12 -12
  109. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  110. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  111. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  112. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  113. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  114. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  115. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  116. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  117. datahub/ingestion/source/redash.py +0 -5
  118. datahub/ingestion/source/redshift/config.py +3 -3
  119. datahub/ingestion/source/redshift/redshift.py +45 -46
  120. datahub/ingestion/source/redshift/usage.py +33 -33
  121. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  122. datahub/ingestion/source/s3/source.py +11 -15
  123. datahub/ingestion/source/salesforce.py +26 -25
  124. datahub/ingestion/source/schema/json_schema.py +1 -1
  125. datahub/ingestion/source/sigma/sigma.py +3 -3
  126. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  127. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  128. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  129. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  130. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  131. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  132. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  133. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  134. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  135. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  136. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  137. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  138. datahub/ingestion/source/sql/athena.py +1 -3
  139. datahub/ingestion/source/sql/clickhouse.py +8 -14
  140. datahub/ingestion/source/sql/oracle.py +1 -3
  141. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  142. datahub/ingestion/source/sql/sql_types.py +1 -2
  143. datahub/ingestion/source/sql/sql_utils.py +5 -0
  144. datahub/ingestion/source/sql/teradata.py +18 -5
  145. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  146. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  147. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  148. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  149. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/superset.py +1 -6
  151. datahub/ingestion/source/tableau/tableau.py +343 -117
  152. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  153. datahub/ingestion/source/unity/config.py +3 -1
  154. datahub/ingestion/source/unity/proxy.py +1 -1
  155. datahub/ingestion/source/unity/source.py +74 -78
  156. datahub/ingestion/source/unity/usage.py +3 -1
  157. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  159. datahub/ingestion/source/usage/usage_common.py +1 -1
  160. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  161. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  162. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  163. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  164. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  165. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  166. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  167. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  168. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  169. datahub/lite/duckdb_lite.py +12 -10
  170. datahub/metadata/_schema_classes.py +317 -44
  171. datahub/metadata/_urns/urn_defs.py +69 -15
  172. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  173. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  174. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  175. datahub/metadata/schema.avsc +302 -89
  176. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  177. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  179. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  180. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  181. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  182. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  183. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  184. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  185. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  186. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  187. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  188. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  189. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  190. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  191. datahub/secret/datahub_secrets_client.py +12 -21
  192. datahub/secret/secret_common.py +14 -8
  193. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  194. datahub/sql_parsing/schema_resolver.py +5 -10
  195. datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
  196. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  197. datahub/sql_parsing/sqlglot_utils.py +1 -1
  198. datahub/telemetry/stats.py +1 -2
  199. datahub/testing/mcp_diff.py +1 -1
  200. datahub/utilities/file_backed_collections.py +11 -11
  201. datahub/utilities/hive_schema_to_avro.py +2 -2
  202. datahub/utilities/logging_manager.py +2 -2
  203. datahub/utilities/lossy_collections.py +3 -3
  204. datahub/utilities/mapping.py +3 -3
  205. datahub/utilities/memory_footprint.py +3 -2
  206. datahub/utilities/perf_timer.py +11 -6
  207. datahub/utilities/serialized_lru_cache.py +3 -1
  208. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  209. datahub/utilities/sqllineage_patch.py +1 -1
  210. datahub/utilities/stats_collections.py +3 -1
  211. datahub/utilities/urns/_urn_base.py +28 -5
  212. datahub/utilities/urns/urn_iter.py +2 -2
  213. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  214. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/datahub/config.py

@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set
 
+import pydantic
 from pydantic import Field, root_validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -24,6 +25,10 @@ DEFAULT_EXCLUDE_ASPECTS = {
     "globalSettingsKey",
     "globalSettingsInfo",
     "testResults",
+    "dataHubExecutionRequestKey",
+    "dataHubExecutionRequestInput",
+    "dataHubExecutionRequestSignal",
+    "dataHubExecutionRequestResult",
 }
 
 
@@ -107,6 +112,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
@@ -119,3 +130,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
             " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
         )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
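
For orientation, the new config surface above can be exercised from a Python ingestion recipe. A hedged sketch, assuming the standard Pipeline.create entry point; hosts, credentials, and the sink block are illustrative placeholders, while drop_duplicate_schema_fields and the mysql+pymysql scheme requirement come from this diff:

from datahub.ingestion.run.pipeline import Pipeline

# Sketch of a DataHub-to-DataHub recipe; connection details are placeholders.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "datahub",
            "config": {
                "database_connection": {
                    # The new validator rejects MySQL schemes other than mysql+pymysql.
                    "scheme": "mysql+pymysql",
                    "host_port": "localhost:3306",
                    "username": "datahub",
                    "password": "datahub",
                    "database": "datahub",
                },
                # New option: drop duplicate field paths in schemaMetadata aspects.
                "drop_duplicate_schema_fields": True,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()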
datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -151,8 +151,12 @@ class DataHubDatabaseReader:
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name == "postgresql":
-                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +164,6 @@ class DataHubDatabaseReader:
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
@@ -236,7 +224,7 @@ class DataHubDatabaseReader:
            )
        except Exception as e:
            logger.warning(
-                f'Failed to parse metadata for {row["urn"]}: {e}', exc_info=True
+                f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
            )
            self.report.num_database_parse_errors += 1
            self.report.database_parse_errors.setdefault(
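
The streaming branch above leans on SQLAlchemy's stream_results/yield_per execution options (server-side cursors) instead of the removed hand-rolled MySQLdb SSCursor path. A rough standalone sketch of that pattern, assuming SQLAlchemy 1.4+; the URL and query are placeholders:

from sqlalchemy import create_engine, text

# Placeholder URL; for MySQL the DataHub source now insists on mysql+pymysql.
engine = create_engine("mysql+pymysql://user:pass@localhost/datahub")

with engine.connect() as conn:
    with conn.begin():  # transaction keeps the server-side cursor open
        conn = conn.execution_options(stream_results=True, yield_per=2000)
        result = conn.execute(text("SELECT 1 AS value"))  # placeholder query
        for row in result:
            # Rows arrive in batches from the server instead of all at once.
            print(dict(row._mapping))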
datahub/ingestion/source/datahub/datahub_source.py

@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
@@ -130,7 +140,7 @@ class DataHubSource(StatefulIngestionSourceBase):
             self._commit_progress(i)
 
     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """
 
 _DBT_FIELDS_BY_TYPE = {
     "models": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         dependsOn
         materializedType
     """,
     "seeds": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
     """,
     "sources": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
         identifier
         sourceName
         sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
         loader
     """,
     "snapshots": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         parentsSources {{
           uniqueId
         }}
@@ -229,7 +229,7 @@
         }}
     """,
     "tests": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
         state
         columnName
         status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             res = response.json()
             if "errors" in res:
                 raise ValueError(
-                    f'Unable to fetch metadata from dbt Cloud: {res["errors"]}'
+                    f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
                 )
             data = res["data"]
         except JSONDecodeError as e:
datahub/ingestion/source/dbt/dbt_common.py

@@ -506,16 +506,18 @@ class DBTNode:
     materialization: Optional[str]  # table, view, ephemeral, incremental, snapshot
     # see https://docs.getdbt.com/reference/artifacts/manifest-json
     catalog_type: Optional[str]
-    missing_from_catalog: bool  # indicates if the node was missing from the catalog.json
+    missing_from_catalog: (
+        bool  # indicates if the node was missing from the catalog.json
+    )
 
     owner: Optional[str]
 
     columns: List[DBTColumn] = field(default_factory=list)
     upstream_nodes: List[str] = field(default_factory=list)  # list of upstream dbt_name
     upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
-    raw_sql_parsing_result: Optional[
-        SqlParsingResult
-    ] = None  # only set for nodes that don't depend on ephemeral models
+    raw_sql_parsing_result: Optional[SqlParsingResult] = (
+        None  # only set for nodes that don't depend on ephemeral models
+    )
     cll_debug_info: Optional[SqlParsingDebugInfo] = None
 
     meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             "platform": DBT_PLATFORM,
             "name": node.dbt_name,
             "instance": self.config.platform_instance,
+            # Ideally we'd include the env unconditionally. However, we started out
+            # not including env in the guid, so we need to maintain backwards compatibility
+            # with existing PROD assertions.
             **(
-                # Ideally we'd include the env unconditionally. However, we started out
-                # not including env in the guid, so we need to maintain backwards compatibility
-                # with existing PROD assertions.
                 {"env": self.config.env}
                 if self.config.env != mce_builder.DEFAULT_ENV
                 and self.config.include_env_in_assertion_guid
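
The relocated comment documents why env is only mixed into the assertion guid when it differs from the default. A minimal illustration of that guid construction, assuming mce_builder.datahub_guid and DEFAULT_ENV; the node name and env value are hypothetical:

from datahub.emitter import mce_builder

env = "DEV"  # hypothetical; PROD (the default) would be left out of the dict
guid_dict = {
    "platform": "dbt",
    "name": "model.jaffle_shop.orders",  # hypothetical dbt node name
    "instance": None,
    # Only non-default envs are added, so guids of existing PROD assertions stay stable.
    **({"env": env} if env != mce_builder.DEFAULT_ENV else {}),
}
assertion_guid = mce_builder.datahub_guid(guid_dict)
print(assertion_guid)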
datahub/ingestion/source/delta_lake/source.py

@@ -122,11 +122,6 @@ class DeltaLakeSource(Source):
             config_report,
         )
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = DeltaLakeSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def _parse_datatype(self, raw_field_json_str: str) -> List[SchemaFieldClass]:
         raw_field_json = json.loads(raw_field_json_str)
 
datahub/ingestion/source/demo_data.py

@@ -29,7 +29,7 @@ class DemoDataSource(Source):
 
     def __init__(self, ctx: PipelineContext, config: DemoDataConfig):
         file_config = FileSourceConfig(path=str(download_sample_data()))
-        self.file_source = GenericFileSource(ctx, file_config)
+        self.file_source: GenericFileSource = GenericFileSource(ctx, file_config)
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         yield from self.file_source.get_workunits()
datahub/ingestion/source/dremio/dremio_api.py

@@ -181,7 +181,7 @@ class DremioAPIOperations:
            return
 
        # On-prem Dremio authentication (PAT or Basic Auth)
-        for retry in range(1, self._retry_count + 1):
+        for _ in range(1, self._retry_count + 1):
            try:
                if connection_args.authentication_method == "PAT":
                    self.session.headers.update(
@@ -191,9 +191,9 @@
                    )
                    return
                else:
-                    assert (
-                        connection_args.username and connection_args.password
-                    ), "Username and password are required for authentication"
+                    assert connection_args.username and connection_args.password, (
+                        "Username and password are required for authentication"
+                    )
                    host = connection_args.hostname
                    port = connection_args.port
                    protocol = "https" if connection_args.tls else "http"
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
         Add a new source type if not in the map (e.g., Dremio ARP).
         """
         dremio_source_type = dremio_source_type.upper()
-        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
-            dremio_source_type
-        ] = datahub_source_type
+        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+            datahub_source_type
+        )
 
         if category:
             if category.lower() == "file_object_storage":
datahub/ingestion/source/dremio/dremio_reporting.py

@@ -45,6 +45,3 @@ class DremioSourceReport(
             self.views_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
-
-    def set_ingestion_stage(self, dataset: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{dataset}: {stage}")
datahub/ingestion/source/dremio/dremio_source.py

@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING)
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
datahub/ingestion/source/elastic_search.py

@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:
 
     @staticmethod
     def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-            Type
-        ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
-            elastic_column_type
+        type_class: Optional[Type] = (
+            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                elastic_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/fivetran/fivetran.py

@@ -16,7 +16,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
@@ -291,11 +291,6 @@ class FivetranSource(StatefulIngestionSourceBase):
             dpi = self._generate_dpi_from_job(job, datajob)
             yield from self._get_dpi_workunits(job, dpi)
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = FivetranSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
datahub/ingestion/source/gc/datahub_gc.py

@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.report_ingestion_stage_start("Expired Token Cleanup")
-                self.revoke_expired_tokens()
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.report_ingestion_stage_start("Truncate Indices")
-                self.truncate_indices()
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start(
-                    "Soft Deleted Entities Cleanup"
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Data Process Cleanup")
-                yield from self.dataprocess_cleanup.get_workunits_internal()
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Execution request Cleanup")
-                self.execution_request_cleanup.run()
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -296,6 +292,7 @@ class DataHubGcSource(Source):
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
             if tokens == []:
+                # Due to a server bug we cannot rely on just total
                 break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
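
The report_ingestion_stage_start calls above were replaced with "with report.new_stage(...)" blocks, so a stage's duration is finalized when the block exits and the trailing synthetic "End" stage is no longer needed. A toy sketch of the idea, not the actual implementation (see datahub/ingestion/source_report/ingestion_stage.py in the file list):

import time
from contextlib import contextmanager


class StageTimer:
    # Toy stand-in for a report object exposing a new_stage() context manager.
    def __init__(self) -> None:
        self.durations: dict = {}

    @contextmanager
    def new_stage(self, name: str):
        start = time.perf_counter()
        try:
            yield
        finally:
            # Recorded even if the stage body raises, which is why no explicit
            # "End" marker stage is required anymore.
            self.durations[name] = time.perf_counter() - start


report = StageTimer()
with report.new_stage("Expired Token Cleanup"):
    time.sleep(0.01)  # stand-in for revoke_expired_tokens()
print(report.durations)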
datahub/ingestion/source/gc/execution_request_cleanup.py

@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-        30,
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@
                        f"record timestamp: {entry.requested_at}."
                    )
                )
-                self.report.ergc_records_deleted += 1
                yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
             logger.info(f"ergc({self.instance_id}): max runtime reached.")
             return True
         return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
@@ -274,7 +299,7 @@
         )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
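
The cleanup limits above are plain pydantic fields, so they can be set directly when constructing the config (or through the corresponding keys of a datahub-gc recipe). A short sketch using the class and module path shown in the file list; the values are illustrative:

from datahub.ingestion.source.gc.execution_request_cleanup import (
    DatahubExecutionRequestCleanupConfig,
)

config = DatahubExecutionRequestCleanupConfig(
    keep_history_max_days=90,      # default raised from 30 to 90 in this release
    limit_entities_delete=10_000,  # new cap on hard deletes per run
    runtime_limit_seconds=3600,    # illustrative runtime budget
)
print(config.dict())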
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -19,8 +19,8 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
-QUERY_QUERY_ENTITY = """
-query listQueries($input: ScrollAcrossEntitiesInput!) {
+QUERY_ENTITIES = """
+query listEntities($input: ScrollAcrossEntitiesInput!) {
   scrollAcrossEntities(input: $input) {
     nextScrollId
     count
@@ -29,6 +29,9 @@ query listQueries($input: ScrollAcrossEntitiesInput!) {
        ... on QueryEntity {
          urn
        }
+        ... on DataProcessInstance {
+          urn
+        }
      }
    }
  }
@@ -96,7 +99,8 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
 
 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-    num_queries_found: int = 0
+    num_calls_made: Dict[str, int] = field(default_factory=dict)
+    num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
     num_soft_deleted_entity_removal_started: int = 0
@@ -151,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
         current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
         self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
         if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
-            self.report.sample_hard_deleted_aspects_by_type[
-                entity_type
-            ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+                LossyList()
+            )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
     def delete_entity(self, urn: str) -> None:
@@ -225,19 +229,33 @@
                 time.sleep(self.config.delay)
         return futures
 
-    def _get_soft_deleted_queries(self) -> Iterable[str]:
+    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
+                if entity_type not in self.report.num_calls_made:
+                    self.report.num_calls_made[entity_type] = 1
+                else:
+                    self.report.num_calls_made[entity_type] += 1
+                self._print_report()
                 result = self.ctx.graph.execute_graphql(
-                    QUERY_QUERY_ENTITY,
+                    graphql_query,
                     {
                         "input": {
-                            "types": ["QUERY"],
+                            "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
-                            "count": self.config.batch_size,
+                            "count": batch_size,
                             "orFilters": [
                                 {
                                     "and": [
@@ -254,15 +272,29 @@
                 )
             except Exception as e:
                 self.report.failure(
-                    f"While trying to get queries with {scroll_id}", exc=e
+                    f"While trying to get {entity_type} with {scroll_id}", exc=e
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities or not scroll_across_entities.get("count"):
+            if not scroll_across_entities:
                 break
+            search_results = scroll_across_entities.get("searchResults")
+            count = scroll_across_entities.get("count")
+            if not count or not search_results:
+                # Due to a server bug we cannot rely on just count as it was returning response like this
+                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
+                break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
-            self.report.num_queries_found += scroll_across_entities.get("count")
-            for query in scroll_across_entities.get("searchResults"):
+            if entity_type not in self.report.num_entities_found:
+                self.report.num_entities_found[entity_type] = 0
+            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
+                "count"
+            )
+            for query in search_results:
                 yield query["entity"]["urn"]
 
     def _get_urns(self) -> Iterable[str]:
@@ -275,7 +307,8 @@
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted_queries()
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
     def _times_up(self) -> bool:
         if (
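
For reference, the generalized _get_soft_deleted helper is just a scroll loop over the GraphQL scrollAcrossEntities API. A rough standalone sketch of the same paging pattern, assuming a connected DataHubGraph client; the soft-delete filter is omitted and the query text would be the QUERY_ENTITIES document above:

from typing import Iterable, Optional

from datahub.ingestion.graph.client import DataHubGraph


def scroll_urns(
    graph: DataHubGraph, graphql_query: str, entity_type: str, batch_size: int = 500
) -> Iterable[str]:
    # Page through scrollAcrossEntities until no further results come back.
    scroll_id: Optional[str] = None
    while True:
        result = graph.execute_graphql(
            graphql_query,
            {
                "input": {
                    "types": [entity_type],
                    "query": "*",
                    "scrollId": scroll_id,
                    "count": batch_size,
                    # "orFilters": [...]  # soft-deleted filter omitted in this sketch
                }
            },
        )
        scroll = result.get("scrollAcrossEntities") or {}
        results = scroll.get("searchResults") or []
        if not results:
            break
        for hit in results:
            yield hit["entity"]["urn"]
        scroll_id = scroll.get("nextScrollId")
        if not scroll_id:
            break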
datahub/ingestion/source/gcs/gcs_source.py

@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
         source.source_config.platform = PLATFORM_GCS
 
         source.is_s3_platform = lambda: True  # type: ignore
-        source.create_s3_path = lambda bucket_name, key: unquote(f"s3://{bucket_name}/{key}")  # type: ignore
-
+        source.create_s3_path = lambda bucket_name, key: unquote(  # type: ignore
+            f"s3://{bucket_name}/{key}"
+        )
         return source
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: