acryl-datahub 1.2.0.7rc2__py3-none-any.whl → 1.2.0.7rc4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (31)
  1. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/METADATA +2754 -2749
  2. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/RECORD +30 -30
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/autogenerated/capability_summary.json +1 -1
  5. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  6. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  7. datahub/ingestion/source/redshift/config.py +9 -6
  8. datahub/ingestion/source/redshift/lineage.py +386 -687
  9. datahub/ingestion/source/redshift/redshift.py +19 -106
  10. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +4 -1
  11. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  12. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  13. datahub/ingestion/source/sql/mssql/source.py +62 -3
  14. datahub/ingestion/source/unity/config.py +74 -9
  15. datahub/ingestion/source/unity/proxy.py +167 -5
  16. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  17. datahub/ingestion/source/unity/proxy_types.py +24 -0
  18. datahub/ingestion/source/unity/report.py +5 -0
  19. datahub/ingestion/source/unity/source.py +111 -1
  20. datahub/ingestion/source/usage/usage_common.py +1 -0
  21. datahub/metadata/_internal_schema_classes.py +5 -5
  22. datahub/metadata/schema.avsc +66 -60
  23. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  24. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  25. datahub/sdk/chart.py +36 -22
  26. datahub/sdk/dashboard.py +38 -62
  27. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  28. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/WHEEL +0 -0
  29. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/entry_points.txt +0 -0
  30. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/licenses/LICENSE +0 -0
  31. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/redshift.py

@@ -1,5 +1,4 @@
 import functools
-import itertools
 import logging
 from collections import defaultdict
 from typing import Dict, Iterable, List, Optional, Type, Union
@@ -52,8 +51,7 @@ from datahub.ingestion.source.common.subtypes import (
 from datahub.ingestion.source.redshift.config import RedshiftConfig
 from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
 from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
-from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
-from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
+from datahub.ingestion.source.redshift.lineage import RedshiftSqlLineage
 from datahub.ingestion.source.redshift.profile import RedshiftProfiler
 from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
 from datahub.ingestion.source.redshift.redshift_schema import (
@@ -72,7 +70,6 @@ from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
-    gen_lineage,
     gen_schema_container,
     gen_schema_key,
     get_dataplatform_instance_aspect,
@@ -116,7 +113,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
 from datahub.utilities import memory_footprint
-from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.mapping import Constants
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -423,40 +419,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 memory_footprint.total_size(self.db_views)
             )
 
-        if self.config.use_lineage_v2:
-            with RedshiftSqlLineageV2(
-                config=self.config,
-                report=self.report,
-                context=self.ctx,
-                database=database,
-                redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
-            ) as lineage_extractor:
-                yield from lineage_extractor.aggregator.register_schemas_from_stream(
-                    self.process_schemas(connection, database)
-                )
-
-                with self.report.new_stage(LINEAGE_EXTRACTION):
-                    yield from self.extract_lineage_v2(
-                        connection=connection,
-                        database=database,
-                        lineage_extractor=lineage_extractor,
-                    )
-
-                all_tables = self.get_all_tables()
-        else:
-            yield from self.process_schemas(connection, database)
+        with RedshiftSqlLineage(
+            config=self.config,
+            report=self.report,
+            context=self.ctx,
+            database=database,
+            redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
+        ) as lineage_extractor:
+            yield from lineage_extractor.aggregator.register_schemas_from_stream(
+                self.process_schemas(connection, database)
+            )
 
-            all_tables = self.get_all_tables()
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                yield from self.extract_lineage_v2(
+                    connection=connection,
+                    database=database,
+                    lineage_extractor=lineage_extractor,
+                )
 
-            if (
-                self.config.include_table_lineage
-                or self.config.include_view_lineage
-                or self.config.include_copy_lineage
-            ):
-                with self.report.new_stage(LINEAGE_EXTRACTION):
-                    yield from self.extract_lineage(
-                        connection=connection, all_tables=all_tables, database=database
-                    )
+            all_tables = self.get_all_tables()
 
         if self.config.include_usage_statistics:
             with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
@@ -968,45 +949,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
 
-    def extract_lineage(
-        self,
-        connection: redshift_connector.Connection,
-        database: str,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-    ) -> Iterable[MetadataWorkUnit]:
-        if not self._should_ingest_lineage():
-            return
-
-        lineage_extractor = RedshiftLineageExtractor(
-            config=self.config,
-            report=self.report,
-            context=self.ctx,
-            redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
-        )
-
-        with PerfTimer() as timer:
-            lineage_extractor.populate_lineage(
-                database=database, connection=connection, all_tables=all_tables
-            )
-
-            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
-                digits=2
-            )
-            yield from self.generate_lineage(
-                database, lineage_extractor=lineage_extractor
-            )
-
-            if self.redundant_lineage_run_skip_handler:
-                # Update the checkpoint state for this run.
-                self.redundant_lineage_run_skip_handler.update_state(
-                    self.config.start_time, self.config.end_time
-                )
-
     def extract_lineage_v2(
         self,
         connection: redshift_connector.Connection,
         database: str,
-        lineage_extractor: RedshiftSqlLineageV2,
+        lineage_extractor: RedshiftSqlLineage,
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.include_share_lineage:
             outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
@@ -1069,40 +1016,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         return True
 
-    def generate_lineage(
-        self, database: str, lineage_extractor: RedshiftLineageExtractor
-    ) -> Iterable[MetadataWorkUnit]:
-        logger.info(f"Generate lineage for {database}")
-        for schema in deduplicate_list(
-            itertools.chain(self.db_tables[database], self.db_views[database])
-        ):
-            if (
-                database not in self.db_schemas
-                or schema not in self.db_schemas[database]
-            ):
-                logger.warning(
-                    f"Either database {database} or {schema} exists in the lineage but was not discovered earlier. Something went wrong."
-                )
-                continue
-
-            table_or_view: Union[RedshiftTable, RedshiftView]
-            for table_or_view in (
-                []
-                + self.db_tables[database].get(schema, [])
-                + self.db_views[database].get(schema, [])
-            ):
-                datahub_dataset_name = f"{database}.{schema}.{table_or_view.name}"
-                dataset_urn = self.gen_dataset_urn(datahub_dataset_name)
-
-                lineage_info = lineage_extractor.get_lineage(
-                    table_or_view,
-                    dataset_urn,
-                    self.db_schemas[database][schema],
-                )
-                if lineage_info:
-                    # incremental lineage generation is taken care by auto_incremental_lineage
-                    yield from gen_lineage(dataset_urn, lineage_info)
-
     def add_config_to_report(self):
         self.report.stateful_lineage_ingestion_enabled = (
             self.config.enable_stateful_lineage_ingestion
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -441,13 +441,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             tables = self.fetch_tables_for_schema(
                 snowflake_schema, db_name, schema_name
             )
+        if self.config.include_views:
+            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+
+        if self.config.include_tables:
             db_tables[schema_name] = tables
             yield from self._process_tables(
                 tables, snowflake_schema, db_name, schema_name
             )
 
         if self.config.include_views:
-            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
             yield from self._process_views(
                 views, snowflake_schema, db_name, schema_name
             )
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -199,6 +199,7 @@ class SnowflakeV2Source(
             ),
             generate_usage_statistics=False,
             generate_operations=False,
+            generate_queries=self.config.include_queries,
             format_queries=self.config.format_sql_queries,
             is_temp_table=self._is_temp_table,
             is_allowed_table=self._is_allowed_table,
datahub/ingestion/source/sql/mssql/job_models.py

@@ -134,7 +134,9 @@ class StoredProcedure:
 
     @property
     def escape_full_name(self) -> str:
-        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
+        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]".replace(
+            "'", r"''"
+        )
 
     def to_base_procedure(self) -> BaseProcedure:
         return BaseProcedure(
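The escape_full_name change doubles any single quotes in the bracketed name, which matters once the value is embedded inside a T-SQL string literal. A minimal standalone sketch of the same idea; the ProcedureName class below is a simplified stand-in for illustration, not the StoredProcedure model from job_models.py:

from dataclasses import dataclass

@dataclass
class ProcedureName:
    # Simplified stand-in for the fields used by escape_full_name.
    db: str
    schema: str
    formatted_name: str

    @property
    def escape_full_name(self) -> str:
        # Double single quotes so the value is safe inside a T-SQL '...' literal.
        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]".replace("'", r"''")

proc = ProcedureName(db="sales", schema="dbo", formatted_name="load_o'brien_accounts")
print(proc.escape_full_name)
# [sales].[dbo].[load_o''brien_accounts]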
datahub/ingestion/source/sql/mssql/source.py

@@ -10,6 +10,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.exc import ProgrammingError, ResourceClosedError
+from sqlalchemy.sql import quoted_name
 
 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
@@ -130,10 +131,14 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         "match the entire table name in database.schema.table format. Defaults are to set in such a way "
         "to ignore the temporary staging tables created by known ETL tools.",
     )
+    quote_schemas: bool = Field(
+        default=False,
+        description="Represent a schema identifiers combined with quoting preferences. See [sqlalchemy quoted_name docs](https://docs.sqlalchemy.org/en/20/core/sqlelement.html#sqlalchemy.sql.expression.quoted_name).",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
-        if values["use_odbc"] and "driver" not in v:
+        if values["use_odbc"] and not values["sqlalchemy_uri"] and "driver" not in v:
             raise ValueError("uri_args must contain a 'driver' option")
         elif not values["use_odbc"] and v:
             raise ValueError("uri_args is not supported when ODBC is disabled")
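The new quote_schemas flag wraps each schema name in SQLAlchemy's quoted_name, a str subclass that carries a quoting preference alongside the value. A tiny sketch of the primitive itself (the schema name is made up):

from sqlalchemy.sql import quoted_name

# quoted_name behaves like a normal string but records that it should be quoted in emitted SQL.
schema = quoted_name("MySchema", True)
print(isinstance(schema, str), schema.quote)
# True True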
@@ -159,7 +164,15 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
             uri_opts=uri_opts,
         )
         if self.use_odbc:
-            uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}"
+            final_uri_args = self.uri_args.copy()
+            if final_uri_args and current_db:
+                final_uri_args.update({"database": current_db})
+
+            uri = (
+                f"{uri}?{urllib.parse.urlencode(final_uri_args)}"
+                if final_uri_args
+                else uri
+            )
         return uri
 
     @property
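For illustration, a small hedged sketch of what the new ODBC branch produces; the uri_args, database name, and base URI below are made-up values, and only urllib.parse.urlencode behaviour is relied on:

import urllib.parse

# Hypothetical inputs standing in for self.uri_args and current_db.
uri_args = {"driver": "ODBC Driver 18 for SQL Server", "TrustServerCertificate": "yes"}
current_db = "AdventureWorks"
uri = "mssql+pyodbc://user@myserver"

final_uri_args = uri_args.copy()
if final_uri_args and current_db:
    # The currently targeted database now rides along in the ODBC query string.
    final_uri_args.update({"database": current_db})

uri = f"{uri}?{urllib.parse.urlencode(final_uri_args)}" if final_uri_args else uri
print(uri)
# mssql+pyodbc://user@myserver?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes&database=AdventureWorks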
@@ -923,7 +936,11 @@ class SQLServerSource(SQLAlchemySource):
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
 
-        if self.config.database and self.config.database != "":
+        if (
+            self.config.database
+            and self.config.database != ""
+            or (self.config.sqlalchemy_uri and self.config.sqlalchemy_uri != "")
+        ):
             inspector = inspect(engine)
             yield inspector
         else:
@@ -1020,3 +1037,45 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else table_ref_str
         )
+
+    def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
+        for schema in super().get_allowed_schemas(inspector, db_name):
+            if self.config.quote_schemas:
+                yield quoted_name(schema, True)
+            else:
+                yield schema
+
+    def get_db_name(self, inspector: Inspector) -> str:
+        engine = inspector.engine
+
+        try:
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "database")
+                and engine.url.database
+            ):
+                return str(engine.url.database).strip('"')
+
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "query")
+                and "odbc_connect" in engine.url.query
+            ):
+                # According to the ODBC connection keywords: https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver17#supported-dsnconnection-string-keywords-and-connection-attributes
+                database = re.search(
+                    r"DATABASE=([^;]*);",
+                    urllib.parse.unquote_plus(str(engine.url.query["odbc_connect"])),
+                    flags=re.IGNORECASE,
+                )
+
+                if database and database.group(1):
+                    return database.group(1)
+
+            return ""
+
+        except Exception as e:
+            raise RuntimeError(
+                "Unable to get database name from Sqlalchemy inspector"
+            ) from e
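The new get_db_name falls back to parsing the DATABASE keyword out of an odbc_connect string when the URL itself carries no database. A self-contained approximation of that regex step, using a made-up connection string:

import re
import urllib.parse

# Hypothetical raw odbc_connect value as it would appear, URL-encoded, in the SQLAlchemy URL.
odbc_connect = urllib.parse.quote_plus(
    "DRIVER={ODBC Driver 18 for SQL Server};SERVER=myserver;DATABASE=DemoDB;UID=user;PWD=secret;"
)

match = re.search(
    r"DATABASE=([^;]*);",
    urllib.parse.unquote_plus(odbc_connect),
    flags=re.IGNORECASE,
)
print(match.group(1) if match else "")
# DemoDB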
datahub/ingestion/source/unity/config.py

@@ -35,6 +35,10 @@ from datahub.utilities.global_warning_util import add_global_warning
 
 logger = logging.getLogger(__name__)
 
+# Configuration default constants
+INCLUDE_TAGS_DEFAULT = True
+INCLUDE_HIVE_METASTORE_DEFAULT = True
+
 
 class LineageDataSource(ConfigEnum):
     AUTO = "AUTO"
@@ -137,10 +141,18 @@ class UnityCatalogSourceConfig(
     )
     warehouse_id: Optional[str] = pydantic.Field(
         default=None,
-        description="SQL Warehouse id, for running queries. If not set, will use the default warehouse.",
+        description=(
+            "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
+            "Required for the following features that need SQL access: "
+            "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
+            "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
+            "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
+            "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
+            "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
+        ),
     )
     include_hive_metastore: bool = pydantic.Field(
-        default=True,
+        default=INCLUDE_HIVE_METASTORE_DEFAULT,
         description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
     )
     workspace_name: Optional[str] = pydantic.Field(
@@ -236,8 +248,12 @@ class UnityCatalogSourceConfig(
     )
 
     include_tags: bool = pydantic.Field(
-        default=True,
-        description="Option to enable/disable column/table tag extraction.",
+        default=INCLUDE_TAGS_DEFAULT,
+        description=(
+            "Option to enable/disable column/table tag extraction. "
+            "Requires warehouse_id to be set since tag extraction needs to query system.information_schema.tags. "
+            "If warehouse_id is not provided, this will be automatically disabled to allow ingestion to continue."
+        ),
     )
 
     _rename_table_ownership = pydantic_renamed_field(
@@ -310,8 +326,62 @@ class UnityCatalogSourceConfig(
         description="Details about the delta lake, incase to emit siblings",
     )
 
+    include_ml_model_aliases: bool = pydantic.Field(
+        default=False,
+        description="Whether to include ML model aliases in the ingestion.",
+    )
+
+    ml_model_max_results: int = pydantic.Field(
+        default=1000,
+        ge=0,
+        description="Maximum number of ML models to ingest.",
+    )
+
+    _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
+    _forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)
+
     scheme: str = DATABRICKS
 
+    def __init__(self, **data):
+        # First, let the parent handle the root validators and field processing
+        super().__init__(**data)
+
+        # After model creation, check if we need to auto-disable features
+        # based on the final warehouse_id value (which may have been set by root validators)
+        include_tags_original = data.get("include_tags", INCLUDE_TAGS_DEFAULT)
+        include_hive_metastore_original = data.get(
+            "include_hive_metastore", INCLUDE_HIVE_METASTORE_DEFAULT
+        )
+
+        # Track what we're force-disabling
+        forced_disable_tag_extraction = False
+        forced_disable_hive_metastore_extraction = False
+
+        # Check if features should be auto-disabled based on final warehouse_id
+        if include_tags_original and not self.warehouse_id:
+            forced_disable_tag_extraction = True
+            self.include_tags = False  # Modify the model attribute directly
+            logger.warning(
+                "warehouse_id is not set but include_tags=True. "
+                "Automatically disabling tag extraction since it requires SQL queries. "
+                "Set warehouse_id to enable tag extraction."
+            )
+
+        if include_hive_metastore_original and not self.warehouse_id:
+            forced_disable_hive_metastore_extraction = True
+            self.include_hive_metastore = False  # Modify the model attribute directly
+            logger.warning(
+                "warehouse_id is not set but include_hive_metastore=True. "
+                "Automatically disabling hive metastore extraction since it requires SQL queries. "
+                "Set warehouse_id to enable hive metastore extraction."
+            )
+
+        # Set private attributes
+        self._forced_disable_tag_extraction = forced_disable_tag_extraction
+        self._forced_disable_hive_metastore_extraction = (
+            forced_disable_hive_metastore_extraction
+        )
+
     def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
         if database:
@@ -381,11 +451,6 @@ class UnityCatalogSourceConfig(
                 "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
             )
 
-        if values.get("include_hive_metastore") and not values.get("warehouse_id"):
-            raise ValueError(
-                "When `include_hive_metastore` is set, `warehouse_id` must be set."
-            )
-
         if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
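The UnityCatalogSourceConfig changes replace the old hard validation error with a post-init auto-disable of SQL-dependent features. A toy pydantic model sketching the same pattern; this is not the real config class, and only the two field names are borrowed for illustration:

import logging
from typing import Optional

import pydantic

logger = logging.getLogger(__name__)

INCLUDE_TAGS_DEFAULT = True

class ToyUnityConfig(pydantic.BaseModel):
    # Simplified stand-in for UnityCatalogSourceConfig, showing the auto-disable pattern.
    warehouse_id: Optional[str] = None
    include_tags: bool = INCLUDE_TAGS_DEFAULT

    _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)

    def __init__(self, **data):
        super().__init__(**data)
        # If tags were requested (explicitly or by default) but no SQL warehouse is
        # available, downgrade gracefully instead of failing validation.
        if data.get("include_tags", INCLUDE_TAGS_DEFAULT) and not self.warehouse_id:
            self.include_tags = False
            self._forced_disable_tag_extraction = True
            logger.warning("warehouse_id is not set; disabling tag extraction.")

print(ToyUnityConfig().include_tags)                         # False (auto-disabled)
print(ToyUnityConfig(warehouse_id="abc123").include_tags)    # True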