acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_schema_gen.py (+41 -2)

@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,
@@ -423,6 +424,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         view_identifier = self.identifiers.get_dataset_identifier(
             view.name, schema_name, db_name
         )
+        if view.is_secure and not view.view_definition:
+            view.view_definition = self.fetch_secure_view_definition(
+                view.name, schema_name, db_name
+            )
         if view.view_definition:
             self.aggregator.add_view_definition(
                 view_urn=self.identifiers.gen_dataset_urn(view_identifier),
@@ -430,6 +435,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 default_db=db_name,
                 default_schema=schema_name,
             )
+        elif view.is_secure:
+            self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:
@@ -446,6 +453,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 context=f"{db_name}.{schema_name}",
             )
 
+    def fetch_secure_view_definition(
+        self, table_name: str, schema_name: str, db_name: str
+    ) -> Optional[str]:
+        try:
+            view_definitions = self.data_dictionary.get_secure_view_definitions()
+            return view_definitions[db_name][schema_name][table_name]
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = (
+                    "Failed to get secure views definitions. Please check permissions."
+                )
+            else:
+                error_msg = "Failed to get secure views definitions"
+            self.structured_reporter.warning(
+                error_msg,
+                exc=e,
+            )
+            return None
+
     def fetch_views_for_schema(
         self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
     ) -> List[SnowflakeView]:
@@ -748,8 +774,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable)
-
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
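
A minimal sketch of the nested mapping that fetch_secure_view_definition indexes above; the assumed shape is db name -> schema name -> view name -> view DDL, and the names and DDL below are illustrative, not from the package:

from typing import Dict

# Assumed shape of get_secure_view_definitions(): db -> schema -> view -> definition text.
SecureViewDefinitions = Dict[str, Dict[str, Dict[str, str]]]

definitions: SecureViewDefinitions = {
    "ANALYTICS_DB": {
        "PUBLIC": {
            "SECURE_SALES_V": "CREATE SECURE VIEW SECURE_SALES_V AS SELECT ...",
        },
    },
}
print(definitions["ANALYTICS_DB"]["PUBLIC"]["SECURE_SALES_V"])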
datahub/ingestion/source/snowflake/snowflake_utils.py (+46 -6)

@@ -1,6 +1,6 @@
 import abc
 from functools import cached_property
-from typing import ClassVar, Literal, Optional, Tuple
+from typing import ClassVar, List, Literal, Optional, Tuple
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -119,7 +119,6 @@ class SnowflakeFilter:
     ) -> bool:
         if not dataset_type or not dataset_name:
             return True
-        dataset_params = dataset_name.split(".")
         if dataset_type.lower() not in (
             SnowflakeObjectDomain.TABLE,
             SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ class SnowflakeFilter:
         if _is_sys_table(dataset_name):
             return False
 
+        dataset_params = _split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
+def _split_qualified_name(qualified_name: str) -> List[str]:
+    """
+    Split a qualified name into its constituent parts.
+
+    >>> _split_qualified_name("db.my_schema.my_table")
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+    """
+
+    # Fast path - no quotes.
+    if '"' not in qualified_name:
+        return qualified_name.split(".")
+
+    # First pass - split on dots that are not inside quotes.
+    in_quote = False
+    parts: List[List[str]] = [[]]
+    for char in qualified_name:
+        if char == '"':
+            in_quote = not in_quote
+        elif char == "." and not in_quote:
+            parts.append([])
+        else:
+            parts[-1].append(char)
+
+    # Second pass - remove outer pairs of quotes.
+    result = []
+    for part in parts:
+        if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+            part = part[1:-1]
+
+        result.append("".join(part))
+
+    return result
+
+
 # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = qualified_name
+    name_parts = _split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(
@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
             )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
-        db_name=name_parts[0]
-        schema_name=name_parts[1]
-        table_name=name_parts[2]
+        db_name=name_parts[0],
+        schema_name=name_parts[1],
+        table_name=name_parts[2],
     )
 
 
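
A minimal usage sketch of the quote-aware splitter added above; the import path follows the file listing, and _split_qualified_name is a private helper, so calling it directly is for illustration only:

from datahub.ingestion.source.snowflake.snowflake_utils import _split_qualified_name

# Dots inside quoted identifiers are preserved, matching the doctests in the diff.
print(_split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE'))
# ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']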
datahub/ingestion/source/snowflake/snowflake_v2.py (+6 -0)

@@ -17,6 +17,9 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
+from datahub.ingestion.api.incremental_properties_helper import (
+    auto_incremental_properties,
+)
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
@@ -446,6 +449,9 @@ class SnowflakeV2Source(
             functools.partial(
                 auto_incremental_lineage, self.config.incremental_lineage
            ),
+            functools.partial(
+                auto_incremental_properties, self.config.incremental_properties
+            ),
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
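
The new auto_incremental_properties processor is gated by the source's incremental_properties setting (visible above as self.config.incremental_properties). A hedged sketch of the corresponding recipe fragment, written as a Python dict; everything except the incremental_properties key is an illustrative placeholder:

# Sketch only: not a complete or validated Snowflake recipe.
snowflake_source = {
    "type": "snowflake",
    "config": {
        "account_id": "example_account",  # placeholder
        "incremental_properties": True,   # field name taken from the diff above
    },
}
print(snowflake_source)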
datahub/ingestion/source/sql/athena.py (+46 -22)

@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )
 
+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F:  # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)
 
 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
     - Profiling when enabled.
     """
 
-
+    config: AthenaConfig
+    report: SQLSourceReport
 
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "athena")
         self.cursor: Optional[BaseCursor] = None
 
+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )
 
     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
-
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None
 
         if not self.cursor:
-            return
+            return None
 
         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )
 
-
-
-
-
-
-
-        return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []
 
-
-
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artifical concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
             max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
             ret = self.cursor.execute(max_partition_query)
             max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                 partitions=partitions,
                 max_partition=max_partition,
             )
-            return partitions
 
-        return
+        return partitions
 
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
             max_partition_filters = []
             for key, value in partition.max_partition.items():
-                max_partition_filters.append(
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
             max_partition = str(partition.max_partition)
         return (
             max_partition,
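
A small standalone illustration (not package code) of the concatenated partition key that _casted_partition_key and get_partitions build above; the schema, table, and partition column names are made up:

def casted_partition_key(key: str) -> str:
    # Mirrors _casted_partition_key in the diff: cast each key to VARCHAR before concatenation.
    return f"CAST({key} as VARCHAR)"

partitions = ["year", "month"]   # illustrative partition columns
schema, table = "web", "events"  # illustrative names
part_concat = " || '-' || ".join(casted_partition_key(k) for k in partitions)
max_partition_query = (
    f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
    f'where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
)
print(max_partition_query)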
datahub/ingestion/source/sql/sql_common.py (+34 -21)

@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import pydantic
 import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
@@ -582,6 +582,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             generate_operations=False,
         )
         for dataset_name in self._view_definition_cache.keys():
+            # TODO: Ensure that the lineage generated from the view definition
+            # matches the dataset_name.
             view_definition = self._view_definition_cache[dataset_name]
             result = self._run_sql_parser(
                 dataset_name,
@@ -1059,6 +1061,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 exc=e,
             )
 
+    def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        return view_definition
+
     def _process_view(
         self,
         dataset_name: str,
@@ -1077,7 +1093,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns = inspector.get_columns(view, schema)
         except KeyError:
             # For certain types of views, we are unable to fetch the list of columns.
-            self.
+            self.report.warning(
+                message="Unable to get schema for a view",
+                context=f"{dataset_name}",
+            )
             schema_metadata = None
         else:
             schema_fields = self.get_schema_fields(dataset_name, columns, inspector)
@@ -1091,19 +1110,12 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
+
         description, properties, _ = self.get_table_properties(inspector, schema, view)
-        try:
-            view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
-        except NotImplementedError:
-            view_definition = ""
-        properties["view_definition"] = view_definition
         properties["is_view"] = "True"
+
+        view_definition = self._get_view_definition(inspector, schema, view)
+        properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
             self._view_definition_cache[dataset_name] = view_definition
 
@@ -1135,15 +1147,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             entityUrn=dataset_urn,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
         ).as_workunit()
-
-
-
-
-
-
-
-
-        ).as_workunit()
+
+        view_properties_aspect = ViewPropertiesClass(
+            materialized=False, viewLanguage="SQL", viewLogic=view_definition
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=view_properties_aspect,
+        ).as_workunit()
 
         if self.config.domain and self.domain_registry:
             yield from get_domain_wu(
@@ -1197,6 +1208,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
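
A minimal sketch of emitting the view-properties aspect the way the rewritten block in _process_view does above; the dataset URN and SQL text are placeholders, and the imports follow the standard DataHub SDK paths:

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ViewPropertiesClass

dataset_urn = make_dataset_urn(platform="postgres", name="db.schema.my_view")  # placeholder
view_properties_aspect = ViewPropertiesClass(
    materialized=False, viewLanguage="SQL", viewLogic="select 1 as x"  # placeholder SQL
)
workunit = MetadataChangeProposalWrapper(
    entityUrn=dataset_urn,
    aspect=view_properties_aspect,
).as_workunit()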
datahub/ingestion/source/sql/sql_report.py (+1 -0)

@@ -48,6 +48,7 @@ class SQLSourceReport(
     query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
 
     num_view_definitions_parsed: int = 0
+    num_view_definitions_view_urn_mismatch: int = 0
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
datahub/ingestion/source/sql/sql_types.py (+85 -8)

@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
 
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )
 
-
-
-
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]
 
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
 
-#
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -262,7 +276,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
     return VERTICA_SQL_TYPES_MAP[type_string]
 
 
-# see https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
 SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "NUMBER": NumberType,
     "DECIMAL": NumberType,
@@ -298,6 +311,18 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "GEOGRAPHY": None,
 }
 
+
+def resolve_snowflake_modified_type(type_string: str) -> Any:
+    # Match types with precision and scale, e.g., 'DECIMAL(38,0)'
+    match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
+    if match:
+        modified_type_base = match.group(1)  # Extract the base type
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+
+    # Fallback for types without precision/scale
+    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+
+
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
 BIGQUERY_TYPES_MAP: Dict[str, Any] = {
     "STRING": StringType,
@@ -366,6 +391,7 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
     "row": RecordType,
     "map": MapType,
     "array": ArrayType,
+    "json": RecordType,
 }
 
 # https://docs.aws.amazon.com/athena/latest/ug/data-types.html
@@ -430,3 +456,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = resolve_snowflake_modified_type(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
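
A minimal usage sketch of the new resolve_sql_type helper; the import path follows the file listing, and the sample inputs are illustrative:

from datahub.ingestion.source.sql.sql_types import resolve_sql_type

# "boolean" is in the merged mapping, so this returns a BooleanType instance.
print(resolve_sql_type("boolean"))

# Parametrized types fall back to the per-platform resolvers, e.g. the
# resolve_snowflake_modified_type added above (expected: a NumberType instance).
print(resolve_sql_type("DECIMAL(38, 0)", platform="snowflake"))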
datahub/ingestion/source/state/redundant_run_skip_handler.py (+1 -1)

@@ -69,7 +69,7 @@ class RedundantRunSkipHandler(
         platform: Optional[str] = None
         source_class = type(self.source)
         if hasattr(source_class, "get_platform_name"):
-            platform = source_class.get_platform_name()
+            platform = source_class.get_platform_name()
 
         # Default name for everything else
         job_name_suffix = self.get_job_name_suffix()