acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

This version of acryl-datahub might be problematic.
Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/aws/sagemaker_processors/lineage.py

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Set
@@ -6,6 +7,8 @@ from datahub.ingestion.source.aws.sagemaker_processors.common import (
     SagemakerSourceReport,
 )
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
     from mypy_boto3_sagemaker.type_defs import (
@@ -88,7 +91,6 @@ class LineageProcessor:
         paginator = self.sagemaker_client.get_paginator("list_contexts")
         for page in paginator.paginate():
             contexts += page["ContextSummaries"]
-
         return contexts
 
     def get_incoming_edges(self, node_arn: str) -> List["AssociationSummaryTypeDef"]:
@@ -225,27 +227,32 @@ class LineageProcessor:
         """
         Get the lineage of all artifacts in SageMaker.
         """
-
+        logger.info("Getting lineage for SageMaker artifacts...")
+        logger.info("Getting all actions")
         for action in self.get_all_actions():
             self.nodes[action["ActionArn"]] = {**action, "node_type": "action"}
+        logger.info("Getting all artifacts")
         for artifact in self.get_all_artifacts():
             self.nodes[artifact["ArtifactArn"]] = {**artifact, "node_type": "artifact"}
+        logger.info("Getting all contexts")
         for context in self.get_all_contexts():
             self.nodes[context["ContextArn"]] = {**context, "node_type": "context"}
 
+        logger.info("Getting lineage for model deployments and model groups")
         for node_arn, node in self.nodes.items():
+            logger.debug(f"Getting lineage for node {node_arn}")
             # get model-endpoint lineage
             if (
                 node["node_type"] == "action"
                 and node.get("ActionType") == "ModelDeployment"
             ):
                 self.get_model_deployment_lineage(node_arn)
-
+                self.report.model_endpoint_lineage += 1
             # get model-group lineage
             if (
                 node["node_type"] == "context"
                 and node.get("ContextType") == "ModelGroup"
             ):
                 self.get_model_group_lineage(node_arn, node)
-
+                self.report.model_group_lineage += 1
         return self.lineage_info

datahub/ingestion/source/aws/sagemaker_processors/models.py

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -65,6 +66,8 @@ ENDPOINT_STATUS_MAP: Dict[str, str] = {
     "Unknown": DeploymentStatusClass.UNKNOWN,
 }
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class ModelProcessor:
@@ -385,6 +388,26 @@ class ModelProcessor:
             model_metrics,
         )
 
+    @staticmethod
+    def get_group_name_from_arn(arn: str) -> str:
+        """
+        Extract model package group name from a SageMaker ARN.
+
+        Args:
+            arn (str): Full ARN of the model package group
+
+        Returns:
+            str: Name of the model package group
+
+        Example:
+            >>> ModelProcessor.get_group_name_from_arn("arn:aws:sagemaker:eu-west-1:123456789:model-package-group/my-model-group")
+            'my-model-group'
+        """
+        logger.debug(
+            f"Extracting group name from ARN: {arn} because group was not seen before"
+        )
+        return arn.split("/")[-1]
+
     def get_model_wu(
         self,
         model_details: "DescribeModelOutputTypeDef",
@@ -425,8 +448,14 @@ class ModelProcessor:
         model_group_arns = model_uri_groups | model_image_groups
 
         model_group_names = sorted(
-            [self.group_arn_to_name[x] for x in model_group_arns]
+            [
+                self.group_arn_to_name[x]
+                if x in self.group_arn_to_name
+                else self.get_group_name_from_arn(x)
+                for x in model_group_arns
+            ]
         )
+
         model_group_urns = [
             builder.make_ml_model_group_urn("sagemaker", x, self.env)
             for x in model_group_names
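
The new list comprehension above avoids a KeyError when a model references a package-group ARN that was never recorded in group_arn_to_name, deriving the name from the ARN instead. A standalone sketch of the same fallback, using made-up ARNs:

def get_group_name_from_arn(arn: str) -> str:
    # Same rule as ModelProcessor.get_group_name_from_arn: the name is the last ARN segment.
    return arn.split("/")[-1]

# Hypothetical cache of previously seen groups.
group_arn_to_name = {
    "arn:aws:sagemaker:eu-west-1:123456789:model-package-group/seen-group": "seen-group",
}
model_group_arns = {
    "arn:aws:sagemaker:eu-west-1:123456789:model-package-group/seen-group",
    "arn:aws:sagemaker:eu-west-1:123456789:model-package-group/unseen-group",
}

model_group_names = sorted(
    group_arn_to_name[x] if x in group_arn_to_name else get_group_name_from_arn(x)
    for x in model_group_arns
)
print(model_group_names)  # ['seen-group', 'unseen-group']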

datahub/ingestion/source/bigquery_v2/bigquery_audit.py

@@ -190,7 +190,7 @@ class BigQueryTableRef:
     @classmethod
     def from_urn(cls, urn: str) -> "BigQueryTableRef":
         """Raises: ValueError if urn is not a valid BigQuery table URN."""
-        dataset_urn = DatasetUrn.create_from_string(urn)
+        dataset_urn = DatasetUrn.from_string(urn)
         split = dataset_urn.name.rsplit(".", 3)
         if len(split) == 3:
             project, dataset, table = split

datahub/ingestion/source/common/subtypes.py

@@ -22,6 +22,8 @@ class DatasetSubTypes(StrEnum):
     SAC_MODEL = "Model"
     SAC_IMPORT_DATA_MODEL = "Import Data Model"
     SAC_LIVE_DATA_MODEL = "Live Data Model"
+    NEO4J_NODE = "Neo4j Node"
+    NEO4J_RELATIONSHIP = "Neo4j Relationship"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"

datahub/ingestion/source/csv_enricher.py

@@ -653,7 +653,7 @@ class CSVEnricherSource(Source):
 
         is_resource_row: bool = not row["subresource"]
         entity_urn = row["resource"]
-        entity_type = Urn.create_from_string(row["resource"]).get_type()
+        entity_type = Urn.from_string(row["resource"]).get_type()
 
         term_associations: List[
             GlossaryTermAssociationClass
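
This hunk and the bigquery_audit.py hunk above make the same one-line change: the deprecated create_from_string classmethod is replaced with from_string. A quick illustration of the call, assuming the datahub.metadata.urns import path and an illustrative URN:

from datahub.metadata.urns import DatasetUrn, Urn

urn_str = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"

# Generic parse, as in csv_enricher.py: yields the entity type ("dataset").
print(Urn.from_string(urn_str).get_type())

# Typed parse, as in bigquery_audit.py: exposes the dataset name for further splitting.
print(DatasetUrn.from_string(urn_str).name)  # db.table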

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -147,6 +147,47 @@ class DataHubDatabaseReader:
         version
         """
 
+    def execute_server_cursor(
+        self, query: str, params: Dict[str, Any]
+    ) -> Iterable[Dict[str, Any]]:
+        with self.engine.connect() as conn:
+            if self.engine.dialect.name == "postgresql":
+                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    conn = conn.execution_options(
+                        stream_results=True,
+                        yield_per=self.config.database_query_batch_size,
+                    )
+                    result = conn.execute(query, params)
+                    for row in result:
+                        yield dict(row)
+            elif self.engine.dialect.name == "mysql":  # MySQL
+                import MySQLdb
+
+                with contextlib.closing(
+                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
+                ) as cursor:
+                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
+                    cursor.execute(query, params)
+
+                    columns = [desc[0] for desc in cursor.description]
+                    while True:
+                        rows = cursor.fetchmany(self.config.database_query_batch_size)
+                        if not rows:
+                            break  # Use break instead of return in generator
+                        for row in rows:
+                            yield dict(zip(columns, row))
+            else:
+                raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
+
+    def _get_rows(
+        self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Dict[str, Any]]:
+        params = {
+            "exclude_aspects": list(self.config.exclude_aspects),
+            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
+        }
+        yield from self.execute_server_cursor(self.query, params)
+
     def get_aspects(
         self, from_createdon: datetime, stop_time: datetime
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
@@ -159,27 +200,6 @@ class DataHubDatabaseReader:
             if mcp:
                 yield mcp, row["createdon"]
 
-    def _get_rows(
-        self, from_createdon: datetime, stop_time: datetime
-    ) -> Iterable[Dict[str, Any]]:
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                cursor.execute(
-                    self.query,
-                    {
-                        "exclude_aspects": list(self.config.exclude_aspects),
-                        "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-                    },
-                )
-
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
-
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
         Fetches all soft-deleted entities from the database.
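
_get_rows now delegates to execute_server_cursor, which streams rows in batches instead of materializing the full result set. A minimal standalone sketch of the PostgreSQL branch, not the DataHub source itself: connection URL, query, and batch size are placeholders, and text()/row._mapping are used here for SQLAlchemy 1.4.40+/2.0 portability where the hunk above passes a raw SQL string and dict(row).

from typing import Any, Dict, Iterable

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost:5432/datahub")  # placeholder URL

def stream_rows(query: str, batch_size: int = 2000) -> Iterable[Dict[str, Any]]:
    with engine.connect() as conn:
        with conn.begin():  # PostgreSQL server-side cursors need an open transaction
            # stream_results + yield_per keep a server-side cursor open and fetch in batches.
            conn = conn.execution_options(stream_results=True, yield_per=batch_size)
            for row in conn.execute(text(query)):
                yield dict(row._mapping)

for row in stream_rows("SELECT urn, aspect, createdon FROM metadata_aspect_v2"):
    print(row["urn"])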

datahub/ingestion/source/datahub/datahub_source.py

@@ -1,5 +1,5 @@
 import logging
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from functools import partial
 from typing import Dict, Iterable, List, Optional
 
@@ -26,6 +26,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.schema_classes import ChangeTypeClass
+from datahub.utilities.progress_timer import ProgressTimer
 
 logger = logging.getLogger(__name__)
 
@@ -105,11 +106,17 @@ class DataHubSource(StatefulIngestionSourceBase):
         self, from_createdon: datetime, reader: DataHubDatabaseReader
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
+        progress = ProgressTimer(report_every=timedelta(seconds=60))
        mcps = reader.get_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
             if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                 continue
 
+            if progress.should_report():
+                logger.info(
+                    f"Ingested {i} database aspects so far, currently at {createdon}"
+                )
+
             yield mcp.as_workunit()
             self.report.num_database_aspects_ingested += 1
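
ProgressTimer (interface inferred from the hunk above: a report_every interval plus a should_report() check) lets the aspect loop log progress at most once per minute. A minimal sketch of the same throttled-logging pattern:

import logging
from datetime import timedelta

from datahub.utilities.progress_timer import ProgressTimer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

progress = ProgressTimer(report_every=timedelta(seconds=5))
for i in range(10_000_000):
    if progress.should_report():  # True at most once per report_every window
        logger.info(f"Processed {i} items so far")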
 

datahub/ingestion/source/dbt/dbt_common.py

@@ -53,19 +53,7 @@ from datahub.ingestion.source.dbt.dbt_tests import (
     make_assertion_from_test,
     make_assertion_result_from_test,
 )
-from datahub.ingestion.source.sql.sql_types import (
-    ATHENA_SQL_TYPES_MAP,
-    BIGQUERY_TYPES_MAP,
-    POSTGRES_TYPES_MAP,
-    SNOWFLAKE_TYPES_MAP,
-    SPARK_SQL_TYPES_MAP,
-    TRINO_SQL_TYPES_MAP,
-    VERTICA_SQL_TYPES_MAP,
-    resolve_athena_modified_type,
-    resolve_postgres_modified_type,
-    resolve_trino_modified_type,
-    resolve_vertica_modified_type,
-)
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -89,17 +77,11 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    BooleanTypeClass,
-    DateTypeClass,
     MySqlDDL,
     NullTypeClass,
-    NumberTypeClass,
-    RecordType,
     SchemaField,
     SchemaFieldDataType,
     SchemaMetadata,
-    StringTypeClass,
-    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -804,28 +786,6 @@ def make_mapping_upstream_lineage(
     )
 
 
-# See https://github.com/fishtown-analytics/dbt/blob/master/core/dbt/adapters/sql/impl.py
-_field_type_mapping = {
-    "boolean": BooleanTypeClass,
-    "date": DateTypeClass,
-    "time": TimeTypeClass,
-    "numeric": NumberTypeClass,
-    "text": StringTypeClass,
-    "timestamp with time zone": DateTypeClass,
-    "timestamp without time zone": DateTypeClass,
-    "integer": NumberTypeClass,
-    "float8": NumberTypeClass,
-    "struct": RecordType,
-    **POSTGRES_TYPES_MAP,
-    **SNOWFLAKE_TYPES_MAP,
-    **BIGQUERY_TYPES_MAP,
-    **SPARK_SQL_TYPES_MAP,
-    **TRINO_SQL_TYPES_MAP,
-    **ATHENA_SQL_TYPES_MAP,
-    **VERTICA_SQL_TYPES_MAP,
-}
-
-
 def get_column_type(
     report: DBTSourceReport,
     dataset_name: str,
@@ -835,24 +795,10 @@ def get_column_type(
     """
     Maps known DBT types to datahub types
     """
-    TypeClass: Any = _field_type_mapping.get(column_type) if column_type else None
-
-    if TypeClass is None and column_type:
-        # resolve a modified type
-        if dbt_adapter == "trino":
-            TypeClass = resolve_trino_modified_type(column_type)
-        elif dbt_adapter == "athena":
-            TypeClass = resolve_athena_modified_type(column_type)
-        elif dbt_adapter == "postgres" or dbt_adapter == "redshift":
-            # Redshift uses a variant of Postgres, so we can use the same logic.
-            TypeClass = resolve_postgres_modified_type(column_type)
-        elif dbt_adapter == "vertica":
-            TypeClass = resolve_vertica_modified_type(column_type)
-        elif dbt_adapter == "snowflake":
-            # Snowflake types are uppercase, so we check that.
-            TypeClass = _field_type_mapping.get(column_type.upper())
-
-    # if still not found, report the warning
+
+    TypeClass = resolve_sql_type(column_type, dbt_adapter)
+
+    # if still not found, report a warning
     if TypeClass is None:
         if column_type:
             report.info(
@@ -861,9 +807,9 @@
                 context=f"{dataset_name} - {column_type}",
                 log=False,
             )
-        TypeClass = NullTypeClass
+        TypeClass = NullTypeClass()
 
-    return SchemaFieldDataType(type=TypeClass())
+    return SchemaFieldDataType(type=TypeClass)

 @platform_name("dbt")
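
The per-adapter mapping tables and resolve_*_modified_type branches are replaced by the shared resolve_sql_type helper. As the hunk shows, it returns a type-class instance (or None), so the caller now falls back to NullTypeClass() and passes the instance straight into SchemaFieldDataType. A hedged sketch of that call pattern; the column types and platforms are examples only, and what each one resolves to depends on the helper's mapping tables:

from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.metadata.schema_classes import NullTypeClass, SchemaFieldDataTypeClass

for column_type, platform in [("varchar(16)", "postgres"), ("NUMBER(38,0)", "snowflake")]:
    resolved = resolve_sql_type(column_type, platform)
    # Fall back to NullTypeClass() when the type cannot be resolved, as get_column_type does.
    field_type = SchemaFieldDataTypeClass(type=resolved or NullTypeClass())
    print(f"{column_type} ({platform}) -> {type(field_type.type).__name__}")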