acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_core.py

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator

 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)


+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json Note "
-        "this can be a local file or a URI."
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json Note this "
-        "can be a local file or a URI."
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. If not "
-        "specified, last-modified fields will not be populated. Note this can be a local file or a URI.",
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
@@ -161,7 +170,7 @@ def get_columns(

 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@
         ):
             name = manifest_node["alias"]

-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +204,9 @@
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]

-        # It's a source
-        catalog_node = all_catalog_entities.get(key)
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None

@@ -214,16 +215,23 @@
                 # Test and ephemeral nodes will never show up in the catalog.
                 missing_from_catalog = False
             else:
-                if not only_include_if_in_catalog:
+                if all_catalog_entities is not None and not only_include_if_in_catalog:
+                    # If the catalog file is missing, we have already generated a general message.
                     report.warning(
                         title="Node missing from catalog",
                         message="Found a node in the manifest file but not in the catalog. "
                         "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                        "Some metadata, such as column types and descriptions, will be impacted.",
+                        "Some metadata, particularly schema information, will be impacted.",
                         context=key,
                     )
         else:
-            catalog_type = all_catalog_entities[key]["metadata"]["type"]
+            catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]

         query_tag_props = manifest_node.get("query_tag", {})

@@ -231,12 +239,15 @@

         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}

         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})

         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None
@@ -453,15 +464,18 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-        dbt_catalog_json = self.load_file_as_json(
-            self.config.catalog_path, self.config.aws_connection
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )

+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")

-        catalog_schema = dbt_catalog_json.get("metadata", {}).get("dbt_schema_version")
-        catalog_version = dbt_catalog_json.get("metadata", {}).get("dbt_version")
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")

         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]

         all_manifest_entities = {**manifest_nodes, **manifest_sources}

-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]

-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="Dbt Catalog Version",
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )
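Taken together, the dbt_core.py changes make the catalog file optional and surface manifest/catalog metadata in a new DBTCoreReport. A minimal sketch of a config without a catalog, under the assumption that DBTCommonConfig still requires target_platform (paths and platform are placeholder values):

from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig

# catalog_path may now be omitted; the source emits a "No catalog file configured"
# warning at load time and schema details will be incomplete.
config = DBTCoreConfig.parse_obj(
    {
        "manifest_path": "target/manifest.json",  # placeholder path
        "target_platform": "snowflake",           # assumed still required by DBTCommonConfig
    }
)
assert config.catalog_path is None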
datahub/ingestion/source/dynamodb/dynamodb.py

@@ -474,6 +474,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             dataset_properties.customProperties["schema.downsampled"] = "True"
             dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
         # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
+        primary_keys = []
         for schema_field in sorted(
             table_fields,
             key=lambda x: (
@@ -484,22 +485,23 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             field_path = schema_field["delimited_name"]
             native_data_type = self.get_native_type(schema_field["type"], table_name)
             type = self.get_field_type(schema_field["type"], table_name)
-            description = None
             nullable = True
             if field_path in primary_key_dict:
-                description = (
+                # primary key should not be nullable
+                type_key = (
                     "Partition Key"
                     if primary_key_dict.get(field_path) == "HASH"
                     else "Sort Key"
                 )
-                # primary key should not be nullable
+                dataset_properties.customProperties[type_key] = field_path
                 nullable = False
+                primary_keys.append(field_path)

             field = SchemaField(
                 fieldPath=field_path,
                 nativeDataType=native_data_type,
                 type=type,
-                description=description,
+                description=None,
                 nullable=nullable,
                 recursive=False,
             )
@@ -513,6 +515,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             hash="",
             platformSchema=SchemalessClass(),
             fields=canonical_schema,
+            primaryKeys=primary_keys,
         )
         return schema_metadata

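The DynamoDB change records partition and sort keys both as dataset custom properties and in the schema aspect's primaryKeys field. A rough sketch of the resulting aspect shape, with hypothetical key names "pk" and "sk" and the field list elided:

from datahub.metadata.schema_classes import SchemalessClass, SchemaMetadataClass

# Hypothetical table whose partition key is "pk" and sort key is "sk".
schema_metadata = SchemaMetadataClass(
    schemaName="my_table",                       # placeholder table name
    platform="urn:li:dataPlatform:dynamodb",
    version=0,
    hash="",
    platformSchema=SchemalessClass(),
    fields=[],                                   # per-field SchemaField entries omitted
    primaryKeys=["pk", "sk"],
)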
datahub/ingestion/source/feast.py

@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

-    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
-    - Fields as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
-    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """

datahub/ingestion/source/fivetran/config.py

@@ -16,7 +16,7 @@ from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.source.bigquery_v2.bigquery_config import (
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
     BigQueryConnectionConfig,
 )
 from datahub.ingestion.source.snowflake.snowflake_connection import (
datahub/ingestion/source/fivetran/fivetran_log_api.py

@@ -54,7 +54,7 @@ class FivetranLogAPI:
                         snowflake_destination_config.database,
                     )
                 )
-                fivetran_log_query.set_db(
+                fivetran_log_query.set_schema(
                     snowflake_destination_config.log_schema,
                 )
                 fivetran_log_database = snowflake_destination_config.database
@@ -66,8 +66,12 @@ class FivetranLogAPI:
                 engine = create_engine(
                     bigquery_destination_config.get_sql_alchemy_url(),
                 )
-                fivetran_log_query.set_db(bigquery_destination_config.dataset)
-                fivetran_log_database = bigquery_destination_config.dataset
+                fivetran_log_query.set_schema(bigquery_destination_config.dataset)
+
+                # The "database" should be the BigQuery project name.
+                fivetran_log_database = engine.execute(
+                    "SELECT @@project_id"
+                ).fetchone()[0]
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."
datahub/ingestion/source/fivetran/fivetran_query.py

@@ -12,14 +12,14 @@ class FivetranLogQuery:

     def __init__(self) -> None:
         # Select query db clause
-        self.db_clause: str = ""
-
-    def set_db(self, db_name: str) -> None:
-        self.db_clause = f"{db_name}."
+        self.schema_clause: str = ""

     def use_database(self, db_name: str) -> str:
         return f"use database {db_name}"

+    def set_schema(self, schema_name: str) -> None:
+        self.schema_clause = f"{schema_name}."
+
     def get_connectors_query(self) -> str:
         return f"""\
 SELECT
@@ -30,7 +30,7 @@ SELECT
     paused,
     sync_frequency,
     destination_id
-FROM {self.db_clause}connector
+FROM {self.schema_clause}connector
 WHERE
     _fivetran_deleted = FALSE
 QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY _fivetran_synced DESC) = 1
@@ -42,7 +42,7 @@ SELECT id as user_id,
     given_name,
     family_name,
     email
-FROM {self.db_clause}user
+FROM {self.schema_clause}user
 """

     def get_sync_logs_query(
@@ -62,7 +62,7 @@ WITH ranked_syncs AS (
         MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
         MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
         ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn
-    FROM {self.db_clause}log
+    FROM {self.schema_clause}log
     WHERE message_event in ('sync_start', 'sync_end')
     AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
     AND connector_id IN ({formatted_connector_ids})
@@ -99,11 +99,11 @@ FROM (
         dsm.name as destination_schema_name,
         tl.created_at as created_at,
         ROW_NUMBER() OVER (PARTITION BY stm.connector_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
-    FROM {self.db_clause}table_lineage as tl
-    JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id
-    JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
-    JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
-    JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
+    FROM {self.schema_clause}table_lineage as tl
+    JOIN {self.schema_clause}source_table_metadata as stm on tl.source_table_id = stm.id
+    JOIN {self.schema_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
+    JOIN {self.schema_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
+    JOIN {self.schema_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
     WHERE stm.connector_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per source and destination pair.
@@ -131,13 +131,13 @@ FROM (
         dcm.name as destination_column_name,
         cl.created_at as created_at,
         ROW_NUMBER() OVER (PARTITION BY stm.connector_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
-    FROM {self.db_clause}column_lineage as cl
-    JOIN {self.db_clause}source_column_metadata as scm
+    FROM {self.schema_clause}column_lineage as cl
+    JOIN {self.schema_clause}source_column_metadata as scm
         ON cl.source_column_id = scm.id
-    JOIN {self.db_clause}destination_column_metadata as dcm
+    JOIN {self.schema_clause}destination_column_metadata as dcm
         ON cl.destination_column_id = dcm.id
     -- Only joining source_table_metadata to get the connector_id.
-    JOIN {self.db_clause}source_table_metadata as stm
+    JOIN {self.schema_clause}source_table_metadata as stm
         ON scm.table_id = stm.id
     WHERE stm.connector_id IN ({formatted_connector_ids})
 )
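The rename from set_db to set_schema reflects that the clause prefixes a schema (for BigQuery, a dataset) rather than a database; the database is selected separately (use_database for Snowflake, the project-id lookup above for BigQuery). A small usage sketch with a placeholder schema name:

from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery

query = FivetranLogQuery()
query.set_schema("fivetran_log")  # placeholder schema / dataset name

# Table references are now schema-qualified rather than database-qualified.
assert "FROM fivetran_log.connector" in query.get_connectors_query()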
datahub/ingestion/source/ge_data_profiler.py

@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -51,6 +52,7 @@ from typing_extensions import Concatenate, ParamSpec
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.profiling.common import (
     Cardinality,
@@ -83,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor

 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)

 _original_get_column_median = SqlAlchemyDataset.get_column_median
@@ -1569,7 +1595,7 @@ def _get_columns_to_ignore_sampling(
         name=dataset_name, platform=platform, env=env
     )

-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.INGESTION)

     dataset_tags = datahub_graph.get_tags(dataset_urn)
     if dataset_tags:
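The new module-level guard refuses to run when upstream great-expectations is installed alongside (or instead of) the acryl fork. The same detection can be reproduced in isolation; this is only a sketch of the importlib.metadata behavior the guard relies on, not part of the package:

import importlib.metadata

def is_installed(dist_name: str) -> bool:
    # distribution() raises PackageNotFoundError when the package is absent.
    try:
        importlib.metadata.distribution(dist_name)
        return True
    except importlib.metadata.PackageNotFoundError:
        return False

print(is_installed("great-expectations"), is_installed("acryl-great-expectations"))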
datahub/ingestion/source/hex/api.py

@@ -27,6 +27,7 @@ logger = logging.getLogger(__name__)

 # The following models were Claude-generated from Hex API OpenAPI definition https://static.hex.site/openapi.json
 # To be exclusively used internally for the deserialization of the API response
+# Model is incomplete and fields may have not been mapped if not used in the ingestion


 class HexApiAppViewStats(BaseModel):
@@ -83,20 +84,10 @@ class HexApiUser(BaseModel):
     email: str


-class HexApiAccessType(StrEnum):
-    """Access type enum."""
-
-    NONE = "NONE"
-    VIEW = "VIEW"
-    EDIT = "EDIT"
-    FULL_ACCESS = "FULL_ACCESS"
-
-
 class HexApiUserAccess(BaseModel):
     """User access model."""

     user: HexApiUser
-    access: Optional[HexApiAccessType] = None


 class HexApiCollectionData(BaseModel):
@@ -109,13 +100,6 @@ class HexApiCollectionAccess(BaseModel):
     """Collection access model."""

     collection: HexApiCollectionData
-    access: Optional[HexApiAccessType] = None
-
-
-class HexApiAccessSettings(BaseModel):
-    """Access settings model."""
-
-    access: Optional[HexApiAccessType] = None


 class HexApiWeeklySchedule(BaseModel):
@@ -145,9 +129,6 @@ class HexApiSharing(BaseModel):
     users: Optional[List[HexApiUserAccess]] = []
     collections: Optional[List[HexApiCollectionAccess]] = []
     groups: Optional[List[Any]] = []
-    workspace: Optional[HexApiAccessSettings] = None
-    public_web: Optional[HexApiAccessSettings] = Field(default=None, alias="publicWeb")
-    support: Optional[HexApiAccessSettings] = None

     class Config:
         extra = "ignore"  # Allow extra fields in the JSON
datahub/ingestion/source/hex/query_fetcher.py

@@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
 logger = logging.getLogger(__name__)

 # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
-HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'


 @dataclass
@@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
     fetched_query_objects: int = 0
     filtered_out_queries_missing_metadata: int = 0
     filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
     filtered_out_queries_no_subjects: int = 0
     total_queries: int = 0
     total_dataset_subjects: int = 0
@@ -210,6 +212,7 @@ class HexQueryFetcher:
         match = re.search(HEX_METADATA_PATTERN, sql_statement)

         if not match:
+            self.report.filtered_out_queries_no_match += 1
             return None

         try:
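The tightened HEX_METADATA_PATTERN only matches Hex metadata comments whose JSON carries "context": "SCHEDULED_RUN", and any non-matching query now increments filtered_out_queries_no_match. A quick sketch with hypothetical comment values (the non-scheduled context string below is invented for illustration):

import re

from datahub.ingestion.source.hex.query_fetcher import HEX_METADATA_PATTERN

scheduled = (
    '-- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "abc123", '
    '"project_url": "https://app.hex.tech/my-workspace/hex/abc123"}'
)
adhoc = scheduled.replace("SCHEDULED_RUN", "MANUAL_RUN")  # hypothetical context value

assert re.search(HEX_METADATA_PATTERN, scheduled) is not None
assert re.search(HEX_METADATA_PATTERN, adhoc) is None  # counted in filtered_out_queries_no_match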
datahub/ingestion/source/iceberg/iceberg.py

@@ -16,7 +16,7 @@ from pyiceberg.exceptions import (
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
-from pyiceberg.typedef import Identifier
+from pyiceberg.typedef import Identifier, Properties
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -387,8 +387,13 @@ class IcebergSource(StatefulIngestionSourceBase):
                         env=self.config.env,
                     )
                 )
+                namespace_properties: Properties = catalog.load_namespace_properties(
+                    namespace
+                )
                 namespaces.append((namespace, namespace_urn))
-                for aspect in self._create_iceberg_namespace_aspects(namespace):
+                for aspect in self._create_iceberg_namespace_aspects(
+                    namespace, namespace_properties
+                ):
                     yield stamping_processor.stamp_wu(
                         MetadataChangeProposalWrapper(
                             entityUrn=namespace_urn, aspect=aspect
@@ -608,12 +613,23 @@ class IcebergSource(StatefulIngestionSourceBase):
         return self.report

     def _create_iceberg_namespace_aspects(
-        self, namespace: Identifier
+        self, namespace: Identifier, properties: Properties
     ) -> Iterable[_Aspect]:
         namespace_repr = ".".join(namespace)
+        custom_properties: Dict[str, str] = {}
+        for k, v in properties.items():
+            try:
+                custom_properties[str(k)] = str(v)
+            except Exception as e:
+                LOGGER.warning(
+                    f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+                )
         yield Status(removed=False)
         yield ContainerProperties(
-            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+            name=namespace_repr,
+            qualifiedName=namespace_repr,
+            env=self.config.env,
+            customProperties=custom_properties,
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
         dpi = self._get_dataplatform_instance_aspect()
datahub/ingestion/source/iceberg/iceberg_common.py

@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)

-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)


 class IcebergProfilingConfig(ConfigModel):
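The send() fix forwards positional arguments so the adapter keeps working when requests calls send() positionally. A usage sketch, assuming the adapter's __init__ accepts a timeout keyword as the constructor above suggests:

import requests

from datahub.ingestion.source.iceberg.iceberg_common import TimeoutHTTPAdapter

session = requests.Session()
# Every request through this session gets the default timeout unless one is passed explicitly.
adapter = TimeoutHTTPAdapter(timeout=30)  # assumption: timeout kwarg handled in __init__
session.mount("https://", adapter)
session.mount("http://", adapter)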
datahub/ingestion/source/ldap.py

@@ -515,5 +515,5 @@ def parse_ldap_dn(input_clean: bytes) -> str:

 def get_attr_or_none(
     attrs: Dict[str, Any], key: str, default: Optional[str] = None
-) -> str:
+) -> Optional[str]:
     return attrs[key][0].decode() if attrs.get(key) else default