acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (63)
  1. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/METADATA +2693 -2630
  2. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/RECORD +63 -55
  3. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +45 -5
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/common/subtypes.py +3 -0
  11. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  12. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  13. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  14. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  15. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  16. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  17. datahub/ingestion/source/excel/__init__.py +0 -0
  18. datahub/ingestion/source/excel/config.py +92 -0
  19. datahub/ingestion/source/excel/excel_file.py +539 -0
  20. datahub/ingestion/source/excel/profiling.py +308 -0
  21. datahub/ingestion/source/excel/report.py +49 -0
  22. datahub/ingestion/source/excel/source.py +662 -0
  23. datahub/ingestion/source/excel/util.py +18 -0
  24. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  25. datahub/ingestion/source/openapi.py +1 -1
  26. datahub/ingestion/source/powerbi/config.py +33 -0
  27. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  28. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  29. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  30. datahub/ingestion/source/s3/source.py +65 -59
  31. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  32. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  33. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  34. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  35. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  36. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
  37. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  38. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  39. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  40. datahub/ingestion/source/sql_queries.py +24 -2
  41. datahub/ingestion/source/state/checkpoint.py +3 -28
  42. datahub/metadata/_internal_schema_classes.py +568 -512
  43. datahub/metadata/_urns/urn_defs.py +1748 -1748
  44. datahub/metadata/schema.avsc +18242 -18168
  45. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  46. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  47. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  48. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  49. datahub/metadata/schemas/Ownership.avsc +69 -0
  50. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  51. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  52. datahub/metadata/schemas/__init__.py +3 -3
  53. datahub/sdk/lineage_client.py +6 -26
  54. datahub/sdk/main_client.py +7 -3
  55. datahub/sdk/search_filters.py +16 -0
  56. datahub/specific/aspect_helpers/siblings.py +73 -0
  57. datahub/specific/dataset.py +2 -0
  58. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  59. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  60. datahub/upgrade/upgrade.py +14 -2
  61. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/WHEEL +0 -0
  62. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/licenses/LICENSE +0 -0
  63. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -3,7 +3,7 @@ import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple
 
 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
@@ -239,12 +239,16 @@ class _SnowflakeTagCache:
 
 class SnowflakeDataDictionary(SupportsAsObj):
     def __init__(
-        self, connection: SnowflakeConnection, report: SnowflakeV2Report
+        self,
+        connection: SnowflakeConnection,
+        report: SnowflakeV2Report,
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.connection = connection
         self.report = report
+        self._fetch_views_from_information_schema = fetch_views_from_information_schema
 
-    def as_obj(self) -> Dict[str, Dict[str, int]]:
+    def as_obj(self) -> Dict[str, Any]:
         # TODO: Move this into a proper report type that gets computed.
 
         # Reports how many times we reset in-memory `functools.lru_cache` caches of data,
@@ -260,7 +264,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
             self.get_fk_constraints_for_schema,
         ]
 
-        report = {}
+        report: Dict[str, Any] = {
+            "fetch_views_from_information_schema": self._fetch_views_from_information_schema,
+        }
         for func in lru_cache_functions:
             report[func.__name__] = func.cache_info()._asdict()  # type: ignore
         return report
@@ -430,7 +436,17 @@ class SnowflakeDataDictionary(SupportsAsObj):
         return tables
 
     @serialized_lru_cache(maxsize=1)
-    def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]]:
+    def get_views_for_database(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        if self._fetch_views_from_information_schema:
+            return self._get_views_for_database_using_information_schema(db_name)
+        else:
+            return self._get_views_for_database_using_show(db_name)
+
+    def _get_views_for_database_using_show(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeView]]:
         page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
 
         views: Dict[str, List[SnowflakeView]] = {}
@@ -461,10 +477,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
                SnowflakeView(
                    name=view_name,
                    created=view["created_on"],
-                   # last_altered=table["last_altered"],
                    comment=view["comment"],
                    view_definition=view["text"],
-                   last_altered=view["created_on"],
+                   last_altered=view["created_on"],  # TODO: This is not correct.
                    materialized=(
                        view.get("is_materialized", "false").lower() == "true"
                    ),
@@ -479,6 +494,163 @@ class SnowflakeDataDictionary(SupportsAsObj):
                )
                view_pagination_marker = view_name
 
+        # Because this is in a cached function, this will only log once per database.
+        view_counts = {schema_name: len(views[schema_name]) for schema_name in views}
+        logger.info(
+            f"Finished fetching views in {db_name}; counts by schema {view_counts}"
+        )
+        return views
+
+    def _map_view(self, db_name: str, row: Dict[str, Any]) -> Tuple[str, SnowflakeView]:
+        schema_name = row["VIEW_SCHEMA"]
+        view_definition = row.get("VIEW_DEFINITION")
+        fragment_view_definition = (
+            view_definition[:50].strip() if view_definition else None
+        )
+        logger.info(
+            f"Mapping view {db_name}.{schema_name}.{row['VIEW_NAME']} with view definition: {fragment_view_definition}..."
+        )
+
+        return schema_name, SnowflakeView(
+            name=row["VIEW_NAME"],
+            created=row["CREATED"],
+            comment=row["COMMENT"],
+            view_definition=view_definition,
+            last_altered=row["LAST_ALTERED"],
+            is_secure=(row.get("IS_SECURE", "false").lower() == "true"),
+            # TODO: This doesn't work for materialized views.
+            materialized=False,
+        )
+
+    def _maybe_populate_empty_view_definitions(
+        self,
+        db_name: str,
+        schema_name: str,
+        views_with_empty_definition: List[SnowflakeView],
+    ) -> List[SnowflakeView]:
+        if not views_with_empty_definition:
+            return []
+
+        view_names = [view.name for view in views_with_empty_definition]
+        batches = [
+            batch[0]
+            for batch in build_prefix_batches(
+                view_names, max_batch_size=1000, max_groups_in_batch=1
+            )
+            if batch
+            # Skip empty batch if so, also max_groups_in_batch=1 makes it safe to access batch[0]
+        ]
+
+        view_map: Dict[str, SnowflakeView] = {
+            view.name: view for view in views_with_empty_definition
+        }
+        views_found_count = 0
+
+        logger.info(
+            f"Fetching definitions for {len(view_map)} views in {db_name}.{schema_name} "
+            f"using batched 'SHOW VIEWS ... LIKE ...' queries. Found {len(batches)} batch(es)."
+        )
+
+        for batch_index, prefix_group in enumerate(batches):
+            query = f'SHOW VIEWS LIKE \'{prefix_group.prefix}%\' IN SCHEMA "{db_name}"."{schema_name}"'
+            logger.info(f"Processing batch {batch_index + 1}/{len(batches)}: {query}")
+
+            try:
+                cur = self.connection.query(query)
+                for row in cur:
+                    view_name = row["name"]
+                    if view_name in view_map:
+                        view_definition = row.get("text")
+                        if view_definition:  # Ensure definition is not None or empty
+                            view_map[view_name].view_definition = view_definition
+                            views_found_count += 1
+                            logger.debug(
+                                f"Fetched view definition for {db_name}.{schema_name}.{view_name}"
+                            )
+                            # If all targeted views are found, we could theoretically break early,
+                            # but SHOW VIEWS doesn't guarantee order, so we must process all results.
+                        else:
+                            logger.warning(
+                                f"'text' field missing or empty in SHOW VIEWS result for {db_name}.{schema_name}.{view_name}"
+                            )
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to execute query for batch {batch_index + 1} ('{query}') for {db_name}.{schema_name} or process its results.",
+                    exc_info=e,
+                )
+                # Returning the original list; some views might still be missing definitions.
+                # This also means subsequent batches for this schema (in this call) are skipped.
+                return views_with_empty_definition
+
+        logger.info(
+            f"Finished processing 'SHOW VIEWS' batches for {db_name}.{schema_name}. "
+            f"Fetched definitions for {views_found_count} out of {len(view_map)} targeted views."
+        )
+
+        if views_found_count < len(view_map):
+            missing_count = len(view_map) - views_found_count
+            logger.warning(
+                f"Could not fetch definitions for {missing_count} views in {db_name}.{schema_name} after processing all batches."
+            )
+        # The SnowflakeView objects in the original list were modified in place via view_map
+        return views_with_empty_definition
+
+    def _get_views_for_database_using_information_schema(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        try:
+            cur = self.connection.query(
+                SnowflakeQuery.get_views_for_database(db_name),
+            )
+        except Exception as e:
+            logger.debug(f"Failed to get all views for database {db_name}", exc_info=e)
+            # Error - Information schema query returned too much data. Please repeat query with more selective predicates.
+            return None
+
+        views: Dict[str, List[SnowflakeView]] = {}
+        views_with_empty_definition: Dict[str, List[SnowflakeView]] = {}
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.setdefault(schema_name, []).append(view)
+            else:
+                views.setdefault(schema_name, []).append(view)
+
+        for schema_name, empty_views in views_with_empty_definition.items():
+            updated_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, empty_views
+            )
+            views.setdefault(schema_name, []).extend(updated_views)
+
+        return views
+
+    def get_views_for_schema_using_information_schema(
+        self, *, schema_name: str, db_name: str
+    ) -> List[SnowflakeView]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_views_for_schema(
+                db_name=db_name, schema_name=schema_name
+            ),
+        )
+
+        views: List[SnowflakeView] = []
+        views_with_empty_definition: List[SnowflakeView] = []
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.append(view)
+            else:
+                views.append(view)
+
+        if views_with_empty_definition:
+            updated_empty_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, views_with_empty_definition
+            )
+            views.extend(updated_empty_views)
+
         return views
 
     @serialized_lru_cache(maxsize=SCHEMA_PARALLELISM)

datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -166,8 +166,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def __init__(
         self,
-        config: SnowflakeV2Config,
-        report: SnowflakeV2Report,
+        config: SnowflakeV2Config,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryConfig
+        report: SnowflakeV2Report,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryReport
         connection: SnowflakeConnection,
         filters: SnowflakeFilter,
         identifiers: SnowflakeIdentifierBuilder,
@@ -175,6 +175,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         profiler: Optional[SnowflakeProfiler],
         aggregator: Optional[SqlParsingAggregator],
         snowsight_url_builder: Optional[SnowsightUrlBuilder],
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.config: SnowflakeV2Config = config
         self.report: SnowflakeV2Report = report
@@ -183,7 +184,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.identifiers: SnowflakeIdentifierBuilder = identifiers
 
         self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
-            connection=self.connection, report=self.report
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=fetch_views_from_information_schema,
         )
         self.report.data_dictionary_cache = self.data_dictionary
 
@@ -1241,7 +1244,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         # falling back to get tables for schema
         if tables is None:
            self.report.num_get_tables_for_schema_queries += 1
-            return self.data_dictionary.get_tables_for_schema(schema_name, db_name)
+            return self.data_dictionary.get_tables_for_schema(
+                db_name=db_name,
+                schema_name=schema_name,
+            )
 
         # Some schema may not have any table
         return tables.get(schema_name, [])
@@ -1251,8 +1257,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> List[SnowflakeView]:
         views = self.data_dictionary.get_views_for_database(db_name)
 
-        # Some schema may not have any table
-        return views.get(schema_name, [])
+        if views is not None:
+            # Some schemas may not have any views
+            return views.get(schema_name, [])
+
+        # Usually this fails when there are too many views in the schema.
+        # Fall back to per-schema queries.
+        self.report.num_get_views_for_schema_queries += 1
+        return self.data_dictionary.get_views_for_schema_using_information_schema(
+            db_name=db_name,
+            schema_name=schema_name,
+        )
 
     def get_columns_for_table(
         self, table_name: str, snowflake_schema: SnowflakeSchema, db_name: str

datahub/ingestion/source/snowflake/snowflake_summary.py

@@ -86,6 +86,7 @@ class SnowflakeSummarySource(Source):
                 filter_config=self.config,
                 structured_reporter=self.report,
             ),
+            fetch_views_from_information_schema=False,  # we haven't enabled this config for SnowflakeSummarySource
         )
 
         # Databases.

datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -172,7 +172,9 @@ class SnowflakeV2Source(
 
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(
-            connection=self.connection, report=self.report
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
 
@@ -528,6 +530,7 @@ class SnowflakeV2Source(
             snowsight_url_builder=snowsight_url_builder,
             filters=self.filters,
             identifiers=self.identifiers,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )
 
         with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
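
Note on the new Snowflake option: the `fetch_views_from_information_schema` flag (added to the config in `snowflake_config.py`) is threaded through `SnowflakeV2Source` and `SnowflakeSchemaGenerator` into `SnowflakeDataDictionary`, where it switches view enumeration from paginated `SHOW VIEWS` commands to a single `information_schema` query per database, with a per-schema fallback when that query returns too much data. A minimal sketch of enabling it from Python follows, assuming the flag is exposed on the Snowflake recipe exactly as added in this release; the connection values are placeholders.

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: enables information_schema-based view fetching on the snowflake source.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "account_id": "example_account",  # placeholder
                "username": "example_user",  # placeholder
                "password": "example_password",  # placeholder
                "warehouse": "COMPUTE_WH",  # placeholder
                # New in 1.2.0.7rc1: fetch views via information_schema
                # instead of paginated SHOW VIEWS commands.
                "fetch_views_from_information_schema": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()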

datahub/ingestion/source/sql/hive_metastore.py

@@ -174,6 +174,7 @@ class HiveMetastore(BasicSQLAlchemyConfig):
     "Enabled by default",
     subtype_modifier=[
         SourceCapabilityModifier.CATALOG,
+        SourceCapabilityModifier.SCHEMA,
     ],
 )
 class HiveMetastoreSource(SQLAlchemySource):

datahub/ingestion/source/sql_queries.py

@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+    auto_incremental_lineage,
+)
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     Source,
@@ -48,7 +52,9 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)
 
 
-class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class SqlQueriesSourceConfig(
+    PlatformInstanceConfigMixin, EnvConfigMixin, IncrementalLineageConfigMixin
+):
     query_file: str = Field(description="Path to file to ingest")
 
     platform: str = Field(
@@ -109,6 +115,16 @@ class SqlQueriesSource(Source):
         used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
         used if the query can't be parsed.
+
+    ### Incremental Lineage
+    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
+    This allows you to add lineage edges without removing existing ones, which is useful for:
+    - Gradually building up lineage from multiple sources
+    - Preserving manually curated lineage
+    - Avoiding conflicts when multiple ingestion processes target the same datasets
+
+    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
+    statistics will still be emitted normally.
     """
 
     schema_resolver: Optional[SchemaResolver]
@@ -165,7 +181,13 @@ class SqlQueriesSource(Source):
         return self.report
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+            partial(
+                auto_incremental_lineage,
+                self.config.incremental_lineage,
+            ),
+        ]
 
     def get_workunits_internal(
         self,
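
The sql_queries changes above wire the generic incremental-lineage helper into the source: the config gains `incremental_lineage` via `IncrementalLineageConfigMixin`, and `auto_incremental_lineage` is registered as a workunit processor so UpstreamLineage aspects are emitted as patches. Below is a hedged sketch of how this might be used, assuming the source is registered under the `sql-queries` entry point and that the query file is newline-delimited JSON as described in the docstring; paths, platform, and query text are placeholders.

import json

from datahub.ingestion.run.pipeline import Pipeline

# Write a one-line query file; per the docstring above, fields other than `query`
# (timestamp, user, downstream_tables, upstream_tables) are optional.
with open("/tmp/queries.json", "w") as f:
    f.write(
        json.dumps({"query": "INSERT INTO sales.summary SELECT * FROM sales.raw"}) + "\n"
    )

pipeline = Pipeline.create(
    {
        "source": {
            "type": "sql-queries",
            "config": {
                "query_file": "/tmp/queries.json",
                "platform": "snowflake",  # dialect for the SQL parser; placeholder
                # New: emit UpstreamLineage as patches instead of full overwrites.
                "incremental_lineage": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()

In practice this source is usually pointed at a live DataHub instance so the SQL parser can resolve schemas; the console sink here only keeps the sketch self-contained.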

datahub/ingestion/source/state/checkpoint.py

@@ -1,10 +1,8 @@
 import base64
 import bz2
-import contextlib
 import functools
 import json
 import logging
-import pickle
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import Callable, Generic, Optional, Type, TypeVar
@@ -117,10 +115,9 @@ class Checkpoint(Generic[StateType]):
                 checkpoint_aspect, state_class
             )
         elif checkpoint_aspect.state.serde == "base85":
-            state_obj = Checkpoint._from_base85_bytes(
-                checkpoint_aspect,
-                functools.partial(bz2.decompress),
-                state_class,
+            raise ValueError(
+                "The base85 encoding for stateful ingestion has been removed for security reasons. "
+                "You may need to temporarily set `ignore_previous_checkpoint` to true to ignore the outdated checkpoint object."
             )
         elif checkpoint_aspect.state.serde == "base85-bz2-json":
             state_obj = Checkpoint._from_base85_json_bytes(
@@ -164,28 +161,6 @@ class Checkpoint(Generic[StateType]):
         state_as_dict["serde"] = checkpoint_aspect.state.serde
         return state_class.parse_obj(state_as_dict)
 
-    @staticmethod
-    def _from_base85_bytes(
-        checkpoint_aspect: DatahubIngestionCheckpointClass,
-        decompressor: Callable[[bytes], bytes],
-        state_class: Type[StateType],
-    ) -> StateType:
-        state: StateType = pickle.loads(
-            decompressor(base64.b85decode(checkpoint_aspect.state.payload))  # type: ignore
-        )
-
-        with contextlib.suppress(Exception):
-            # When loading from pickle, the pydantic validators don't run.
-            # By re-serializing and re-parsing, we ensure that the state is valid.
-            # However, we also suppress any exceptions to make sure this doesn't blow up.
-            state = state_class.parse_obj(state.dict())
-
-        # Because the base85 method is deprecated in favor of base85-bz2-json,
-        # we will automatically switch the serde.
-        state.serde = "base85-bz2-json"
-
-        return state
-
     @staticmethod
     def _from_base85_json_bytes(
         checkpoint_aspect: DatahubIngestionCheckpointClass,
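
The checkpoint change above removes the pickle-based `base85` serde entirely; loading an old checkpoint written with it now raises a `ValueError` that points at `ignore_previous_checkpoint`. A hedged sketch of the relevant recipe fragment for sources that support stateful ingestion (the source type and connection details are placeholders):

# Sketch only: skip an outdated base85 checkpoint once, then drop the flag so the
# newly written base85-bz2-json checkpoints are picked up on subsequent runs.
recipe_fragment = {
    "source": {
        "type": "snowflake",  # placeholder: any source with stateful ingestion support
        "config": {
            # ... connection settings elided ...
            "stateful_ingestion": {
                "enabled": True,
                "ignore_previous_checkpoint": True,
            },
        },
    },
}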