acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/METADATA +2563 -2561
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/RECORD +46 -44
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -836,9 +836,6 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
             "native_data_type": str(timestamp_type),
         }

-    # visit_timestamptz() is required when using pyiceberg >= 0.5.0, which is essentially a duplicate
-    # of visit_timestampz(). The function has been renamed from visit_timestampz().
-    # Once Datahub can upgrade its pyiceberg dependency to >=0.5.0, the visit_timestampz() function can be safely removed.
     def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
         # Avro supports 2 types of timestamp:
         # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
@@ -855,22 +852,6 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
             "native_data_type": str(timestamptz_type),
         }

-    def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
-        # Avro supports 2 types of timestamp:
-        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
-        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
-        # utcAdjustment: bool = True
-        return {
-            "type": "long",
-            "logicalType": "timestamp-micros",
-            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
-            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
-            # "logicalType": "timestamp-micros"
-            # if timestamp_type.adjust_to_utc
-            # else "local-timestamp-micros",
-            "native_data_type": str(timestamptz_type),
-        }
-
     def visit_string(self, string_type: StringType) -> Dict[str, Any]:
         return {
             "type": "string",
@@ -889,3 +870,42 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
             "type": "bytes",
             "native_data_type": str(binary_type),
         }
+
+    def visit_timestamp_ns(self, timestamp_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamp_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamp_ns_type),
+        }
+
+    def visit_timestamptz_ns(self, timestamptz_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps with timezone
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamptz_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamptz_ns_type),
+        }
+
+    def visit_unknown(self, unknown_type: Any) -> Dict[str, Any]:
+        # Handle unknown types
+        return {
+            "type": "string",
+            "native_data_type": str(unknown_type),
+        }
datahub/ingestion/source/kafka_connect/source_connectors.py
CHANGED

@@ -412,7 +412,10 @@ class MongoSourceConnector(BaseConnector):

         # Escape topic_prefix to handle cases where it contains dots
         # Some users configure topic.prefix like "my.mongodb" which breaks the regex
-
+
+        # \w is equivalent to [a-zA-Z0-9_]
+        # So [\w-]+ matches alphanumeric characters, underscores, and hyphens
+        topic_naming_pattern = rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)"

         if not self.connector_manifest.topic_names:
             return lineages
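As a quick illustration of the pattern added above, here is a minimal, self-contained sketch; the topic prefix and topic name are made up, only the regex construction comes from the diff:

```python
import re

# Hypothetical values: "my.mongodb" mirrors the dotted topic.prefix case
# called out in the comments above.
topic_prefix = "my.mongodb"
topic_naming_pattern = rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)"

match = re.match(topic_naming_pattern, "my.mongodb.sales-db.order_events")
if match:
    database, collection = match.groups()
    print(database, collection)  # -> sales-db order_events
```

Escaping the prefix keeps its dots literal, while the two `[\w-]+` groups capture the database and collection segments (letters, digits, underscores, and hyphens).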
datahub/ingestion/source/mode.py
CHANGED
@@ -214,6 +214,10 @@ class ModeConfig(
         description="Number of items per page for paginated API requests.",
     )

+    exclude_archived: bool = Field(
+        default=False, description="Exclude archived reports"
+    )
+
     @validator("connect_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)

@@ -1473,6 +1477,15 @@ class ModeSource(StatefulIngestionSourceBase):
                 logger.debug(
                     f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
                 )
+                if self.config.exclude_archived:
+                    logger.debug(
+                        f"Excluding archived reports since exclude_archived: {self.config.exclude_archived}"
+                    )
+                    reports_page = [
+                        report
+                        for report in reports_page
+                        if not report.get("archived", False)
+                    ]
                 yield reports_page
             except ModeRequestError as e:
                 if isinstance(e, HTTPError) and e.response.status_code == 404:
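A minimal sketch of the filtering semantics introduced by `exclude_archived`; the report payloads below are invented, real Mode API responses carry many more fields:

```python
# Made-up report pages for illustration only.
reports_page = [
    {"token": "abc123", "name": "Revenue", "archived": False},
    {"token": "def456", "name": "Old dashboard", "archived": True},
    {"token": "ghi789", "name": "No flag at all"},  # missing key counts as not archived
]

exclude_archived = True
if exclude_archived:
    # Same list comprehension as in the diff: keep only non-archived reports.
    reports_page = [r for r in reports_page if not r.get("archived", False)]

print([r["name"] for r in reports_page])  # -> ['Revenue', 'No flag at all']
```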
datahub/ingestion/source/powerbi/m_query/parser.py
CHANGED

@@ -1,13 +1,13 @@
 import functools
 import importlib.resources as pkg_resource
 import logging
-import os
 from typing import Dict, List, Optional

 import lark
 from lark import Lark, Tree

 import datahub.ingestion.source.powerbi.m_query.data_classes
+from datahub.configuration.env_vars import get_powerbi_m_query_parse_timeout
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     PowerBiDashboardSourceConfig,

@@ -25,7 +25,7 @@ from datahub.utilities.threading_timeout import TimeoutException, threading_time

 logger = logging.getLogger(__name__)

-_M_QUERY_PARSE_TIMEOUT =
+_M_QUERY_PARSE_TIMEOUT = get_powerbi_m_query_parse_timeout()


 @functools.lru_cache(maxsize=1)
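The new `datahub.configuration.env_vars` module itself is not shown in this diff, so the following is only a hypothetical sketch of what a centralized getter such as `get_powerbi_m_query_parse_timeout` could look like; the environment variable name and default value are assumptions, not taken from the package:

```python
import os


def get_powerbi_m_query_parse_timeout() -> int:
    # Assumed env var name and default, for illustration only; the real
    # implementation lives in datahub/configuration/env_vars.py.
    return int(os.environ.get("DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT", "60"))


print(get_powerbi_m_query_parse_timeout())
```

The visible change is the pattern: module-level constants that previously read `os.environ` inline now call a shared getter, which is why the `import os` lines disappear from these files.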
datahub/ingestion/source/snowflake/snowflake_schema.py
CHANGED

@@ -1,10 +1,10 @@
 import logging
-import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple

+from datahub.configuration.env_vars import get_snowflake_schema_parallelism
 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain

@@ -22,7 +22,7 @@ from datahub.utilities.serialized_lru_cache import serialized_lru_cache

 logger: logging.Logger = logging.getLogger(__name__)

-SCHEMA_PARALLELISM =
+SCHEMA_PARALLELISM = get_snowflake_schema_parallelism()


 @dataclass
datahub/ingestion/source/sql/mssql/source.py
CHANGED

@@ -149,18 +149,24 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         uri_opts: Optional[Dict[str, Any]] = None,
         current_db: Optional[str] = None,
     ) -> str:
+        current_db = current_db or self.database
+
         if self.use_odbc:
             # Ensure that the import is available.
             import pyodbc  # noqa: F401

             self.scheme = "mssql+pyodbc"

+            # ODBC requires a database name, otherwise it will interpret host_port
+            # as a pre-defined ODBC connection name.
+            current_db = current_db or "master"
+
         uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
             self.scheme,  # type: ignore
             self.username,
             self.password.get_secret_value() if self.password else None,
             self.host_port,  # type: ignore
-            current_db
+            current_db,
             uri_opts=uri_opts,
         )
         if self.use_odbc:
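A small sketch of the database-name fallback added above, written as a standalone function rather than the real `SQLServerConfig` method; the parameter names are stand-ins:

```python
from typing import Optional


def resolve_current_db(
    current_db: Optional[str], configured_database: Optional[str], use_odbc: bool
) -> Optional[str]:
    # Prefer an explicit database, then the configured one.
    current_db = current_db or configured_database
    if use_odbc:
        # ODBC needs an explicit database; without one, host_port would be
        # interpreted as a pre-defined ODBC connection name.
        current_db = current_db or "master"
    return current_db


print(resolve_current_db(None, None, use_odbc=True))      # -> master
print(resolve_current_db(None, "sales", use_odbc=False))  # -> sales
```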
datahub/ingestion/source/sql/teradata.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+import re
 import time
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -67,6 +68,12 @@ from datahub.utilities.stats_collections import TopKDict

 logger: logging.Logger = logging.getLogger(__name__)

+# Precompiled regex pattern for case-insensitive "(not casespecific)" removal
+NOT_CASESPECIFIC_PATTERN = re.compile(r"\(not casespecific\)", re.IGNORECASE)
+
+# Teradata uses a two-tier database.table naming approach without default database prefixing
+DEFAULT_NO_DATABASE_TERADATA = None
+
 # Common excluded databases used in multiple places
 EXCLUDED_DATABASES = [
     "All",
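For illustration, the precompiled pattern in action on a made-up Teradata-flavored DDL fragment:

```python
import re

NOT_CASESPECIFIC_PATTERN = re.compile(r"\(not casespecific\)", re.IGNORECASE)

# Invented query text; only the "(NOT CASESPECIFIC)" attribute matters here.
query_text = (
    "CREATE TABLE db1.users (name VARCHAR(100) CHARACTER SET LATIN (NOT CASESPECIFIC))"
)
cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", query_text)
print(cleaned_query)
# -> CREATE TABLE db1.users (name VARCHAR(100) CHARACTER SET LATIN )
```

Compiling the pattern once at module scope avoids recompiling it for every audit-log entry the source processes.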
@@ -453,6 +460,13 @@ class TeradataReport(SQLSourceReport, BaseTimeWindowReport):
     # Global metadata extraction timing (single query for all databases)
     metadata_extraction_total_sec: float = 0.0

+    # Lineage extraction query time range (actively used)
+    lineage_start_time: Optional[datetime] = None
+    lineage_end_time: Optional[datetime] = None
+
+    # Audit query processing statistics
+    num_audit_query_entries_processed: int = 0
+

 class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
     scheme: str = Field(default="teradatasql", description="database scheme")
@@ -726,7 +740,8 @@ ORDER by DataBaseName, TableName;
             env=self.config.env,
             schema_resolver=self.schema_resolver,
             graph=self.ctx.graph,
-            generate_lineage=self.
+            generate_lineage=self.config.include_view_lineage
+            or self.config.include_table_lineage,
             generate_queries=self.config.include_queries,
             generate_usage_statistics=self.config.include_usage_statistics,
             generate_query_usage_statistics=self.config.include_usage_statistics,
@@ -1327,6 +1342,9 @@ ORDER by DataBaseName, TableName;
         current_query_metadata = None

         for entry in entries:
+            # Count each audit query entry processed
+            self.report.num_audit_query_entries_processed += 1
+
             query_id = getattr(entry, "query_id", None)
             query_text = str(getattr(entry, "query_text", ""))
@@ -1367,15 +1385,18 @@ ORDER by DataBaseName, TableName;
         default_database = getattr(metadata_entry, "default_database", None)

         # Apply Teradata-specific query transformations
-        cleaned_query =
+        cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", full_query_text)

+        # For Teradata's two-tier architecture (database.table), we should not set default_db
+        # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+        # The SQL parser will treat database.table references correctly without default_db
         return ObservedQuery(
             query=cleaned_query,
             session_id=session_id,
             timestamp=timestamp,
             user=CorpUserUrn(user) if user else None,
-            default_db=
-            default_schema=default_database,
+            default_db=DEFAULT_NO_DATABASE_TERADATA,  # Teradata uses two-tier database.table naming without default database prefixing
+            default_schema=default_database,
         )

     def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
@@ -1393,15 +1414,18 @@ ORDER by DataBaseName, TableName;
         default_database = getattr(entry, "default_database", None)

         # Apply Teradata-specific query transformations
-        cleaned_query =
+        cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", query_text)

+        # For Teradata's two-tier architecture (database.table), we should not set default_db
+        # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+        # However, we should set default_schema for unqualified table references
         return ObservedQuery(
             query=cleaned_query,
             session_id=session_id,
             timestamp=timestamp,
             user=CorpUserUrn(user) if user else None,
-            default_db=
-            default_schema=default_database,  #
+            default_db=DEFAULT_NO_DATABASE_TERADATA,  # Teradata uses two-tier database.table naming without default database prefixing
+            default_schema=default_database,  # Set default_schema for unqualified table references
         )

     def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
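A toy illustration (not DataHub's SQL parser) of why leaving `default_db` unset matters for Teradata's two-tier `database.table` names, per the comments above:

```python
from typing import Optional


def qualify(table_ref: str, default_db: Optional[str], default_schema: Optional[str]) -> str:
    # Simplified name-resolution logic for illustration only.
    parts = table_ref.split(".")
    if len(parts) == 1 and default_schema:
        parts = [default_schema] + parts
    if default_db:
        parts = [default_db] + parts
    return ".".join(parts)


# With a default_db set, a two-tier reference picks up a bogus extra level:
print(qualify("sales.orders", default_db="dbc", default_schema=None))    # -> dbc.sales.orders
# Leaving default_db unset keeps the two-tier name intact, while default_schema
# still qualifies bare table names:
print(qualify("sales.orders", default_db=None, default_schema="sales"))  # -> sales.orders
print(qualify("orders", default_db=None, default_schema="sales"))        # -> sales.orders
```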
@@ -1421,7 +1445,7 @@ ORDER by DataBaseName, TableName;

         for query_index, query in enumerate(queries, 1):
             logger.info(
-                f"Executing lineage query {query_index}/{len(queries)} with {cursor_type} cursor..."
+                f"Executing lineage query {query_index}/{len(queries)} for time range {self.config.start_time} to {self.config.end_time} with {cursor_type} cursor..."
             )

             # Use helper method to try server-side cursor with fallback
@@ -1589,79 +1613,70 @@ ORDER by DataBaseName, TableName;
         else:
             return connection.execute(text(query))

+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Override to prevent parent class from generating aggregator work units during schema extraction.
+
+        We handle aggregator generation manually after populating it with audit log data.
+        """
+        # Do nothing - we'll call the parent implementation manually after populating the aggregator
+        return iter([])
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         logger.info("Starting Teradata metadata extraction")

-        #
-        # Sql parser operates on lowercase urns so we need to lowercase the urns
+        # Step 1: Schema extraction first (parent class will skip aggregator generation due to our override)
         with self.report.new_stage("Schema metadata extraction"):
             yield from super().get_workunits_internal()
             logger.info("Completed schema metadata extraction")

-
-
-
-
-
-
-
-        """Override base class to skip aggregator gen_metadata() call.
-
-        Teradata handles aggregator processing after adding audit log queries,
-        so we skip the base class call to prevent duplicate processing.
-        """
-        # Return empty iterator - Teradata will handle aggregator processing
-        # after adding audit log queries in _get_audit_log_mcps_with_aggregator()
-        return iter([])
+        # Step 2: Lineage extraction after schema extraction
+        # This allows lineage processing to have access to all discovered schema information
+        with self.report.new_stage("Audit log extraction and lineage processing"):
+            self._populate_aggregator_from_audit_logs()
+            # Call parent implementation directly to generate aggregator work units
+            yield from super()._generate_aggregator_workunits()
+            logger.info("Completed lineage processing")

-    def
+    def _populate_aggregator_from_audit_logs(self) -> None:
         """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
-
-
-
-
-        if self.config.include_table_lineage or self.config.include_usage_statistics:
-            # Step 1: Stream query entries from database with memory-efficient processing
-            with self.report.new_stage("Fetching lineage entries from Audit Logs"):
-                queries_processed = 0
-                entries_processed = False
-
-                # Use streaming query reconstruction for memory efficiency
-                for observed_query in self._reconstruct_queries_streaming(
-                    self._fetch_lineage_entries_chunked()
-                ):
-                    entries_processed = True
-                    self.aggregator.add(observed_query)
+        with self.report.new_stage("Lineage extraction from Teradata audit logs"):
+            # Record the lineage query time range in the report
+            self.report.lineage_start_time = self.config.start_time
+            self.report.lineage_end_time = self.config.end_time

-
-
-
-                        f"Processed {queries_processed} queries to aggregator"
-                    )
+            logger.info(
+                f"Starting lineage extraction from Teradata audit logs (time range: {self.config.start_time} to {self.config.end_time})"
+            )

-
-
-
+            if (
+                self.config.include_table_lineage
+                or self.config.include_usage_statistics
+            ):
+                # Step 1: Stream query entries from database with memory-efficient processing
+                with self.report.new_stage("Fetching lineage entries from Audit Logs"):
+                    queries_processed = 0
+
+                    # Use streaming query reconstruction for memory efficiency
+                    for observed_query in self._reconstruct_queries_streaming(
+                        self._fetch_lineage_entries_chunked()
+                    ):
+                        self.aggregator.add(observed_query)
+
+                        queries_processed += 1
+                        if queries_processed % 10000 == 0:
+                            logger.info(
+                                f"Processed {queries_processed} queries to aggregator"
+                            )

-
-
-
+                    if queries_processed == 0:
+                        logger.info("No lineage entries found")
+                        return

-            # Step 2: Generate work units from aggregator
-            with self.report.new_stage("SqlParsingAggregator metadata generation"):
-                logger.info("Generating metadata work units from SqlParsingAggregator")
-                work_unit_count = 0
-                for mcp in self.aggregator.gen_metadata():
-                    work_unit_count += 1
-                    if work_unit_count % 10000 == 0:
                     logger.info(
-                        f"
+                        f"Completed adding {queries_processed} queries to SqlParsingAggregator"
                     )
-                yield mcp.as_workunit()

-            logger.info(
-                f"Completed SqlParsingAggregator processing: {work_unit_count} work units generated"
-            )
+            logger.info("Completed lineage extraction from Teradata audit logs")

     def close(self) -> None:
         """Clean up resources when source is closed."""
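The restructuring above follows a template-method style: the subclass no-ops the aggregator hook while the base class runs schema extraction, then invokes the parent implementation explicitly once the aggregator has been populated from audit logs. A stripped-down sketch with stand-in classes, not the real DataHub source classes:

```python
from typing import Iterable, List


class BaseSource:
    def __init__(self) -> None:
        self.aggregator: List[str] = []

    def _generate_aggregator_workunits(self) -> Iterable[str]:
        yield from self.aggregator

    def get_workunits_internal(self) -> Iterable[str]:
        yield "schema-metadata"
        # The base class would normally emit aggregator output here.
        yield from self._generate_aggregator_workunits()


class TeradataLikeSource(BaseSource):
    def _generate_aggregator_workunits(self) -> Iterable[str]:
        # No-op during schema extraction; emitted manually later.
        return iter([])

    def get_workunits_internal(self) -> Iterable[str]:
        yield from super().get_workunits_internal()       # schema only
        self.aggregator.append("lineage-from-audit-log")   # populate afterwards
        yield from super()._generate_aggregator_workunits()


print(list(TeradataLikeSource().get_workunits_internal()))
# -> ['schema-metadata', 'lineage-from-audit-log']
```

Because the hook is dispatched dynamically, the base class's call sees the empty override, while the explicit `super()._generate_aggregator_workunits()` call emits the populated aggregator exactly once.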
datahub/ingestion/source/unity/config.py
CHANGED

@@ -49,6 +49,12 @@ class LineageDataSource(ConfigEnum):
     API = "API"


+class UsageDataSource(ConfigEnum):
+    AUTO = "AUTO"
+    SYSTEM_TABLES = "SYSTEM_TABLES"
+    API = "API"
+
+
 class UnityCatalogProfilerConfig(ConfigModel):
     method: str = Field(
         description=(

@@ -285,6 +291,17 @@ class UnityCatalogSourceConfig(
         description="Generate usage statistics.",
     )

+    usage_data_source: UsageDataSource = pydantic.Field(
+        default=UsageDataSource.AUTO,
+        description=(
+            "Source for usage/query history data extraction. Options: "
+            f"'{UsageDataSource.AUTO.value}' (default) - Automatically use system.query.history table when SQL warehouse is configured, otherwise fall back to REST API. "
+            "This provides better performance for multi-workspace setups and large query volumes when warehouse_id is set. "
+            f"'{UsageDataSource.SYSTEM_TABLES.value}' - Force use of system.query.history table (requires SQL warehouse and SELECT permission on system.query.history). "
+            f"'{UsageDataSource.API.value}' - Force use of REST API endpoints for query history (legacy method, may have limitations with multiple workspaces)."
+        ),
+    )
+
     # TODO: Remove `type:ignore` by refactoring config
     profiling: Union[
         UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig

@@ -446,6 +463,20 @@ class UnityCatalogSourceConfig(

         return values

+    @pydantic.root_validator(skip_on_failure=True)
+    def validate_usage_data_source_with_warehouse(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        usage_data_source = values.get("usage_data_source", UsageDataSource.AUTO)
+        warehouse_id = values.get("warehouse_id")
+
+        if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
+            raise ValueError(
+                f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
+            )
+
+        return values
+
     @pydantic.validator("schema_pattern", always=True)
     def schema_pattern_should__always_deny_information_schema(
         cls, v: AllowDenyPattern
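A sketch of the selection behavior described in the `usage_data_source` docstring and validator above; the helper below is illustrative only, not the code path used by the Unity Catalog source:

```python
from enum import Enum
from typing import Optional


class UsageDataSource(Enum):
    AUTO = "AUTO"
    SYSTEM_TABLES = "SYSTEM_TABLES"
    API = "API"


def resolve_usage_source(
    usage_data_source: UsageDataSource, warehouse_id: Optional[str]
) -> UsageDataSource:
    # Mirrors the root validator: SYSTEM_TABLES needs a SQL warehouse.
    if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
        raise ValueError("usage_data_source='SYSTEM_TABLES' requires warehouse_id to be set")
    # Mirrors the documented AUTO behavior: system tables when a warehouse is
    # configured, REST API otherwise.
    if usage_data_source == UsageDataSource.AUTO:
        return UsageDataSource.SYSTEM_TABLES if warehouse_id else UsageDataSource.API
    return usage_data_source


print(resolve_usage_source(UsageDataSource.AUTO, "abc123").value)  # -> SYSTEM_TABLES
print(resolve_usage_source(UsageDataSource.AUTO, None).value)      # -> API
```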
datahub/ingestion/source/unity/proxy.py
CHANGED

@@ -40,6 +40,7 @@ from datahub.api.entities.external.unity_catalog_external_entites import UnityCa
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.config import (
     LineageDataSource,
+    UsageDataSource,
 )
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (

@@ -163,6 +164,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
         lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
+        usage_data_source: UsageDataSource = UsageDataSource.AUTO,
         databricks_api_page_size: int = 0,
     ):
         self._workspace_client = WorkspaceClient(

@@ -175,6 +177,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
         self.lineage_data_source = lineage_data_source
+        self.usage_data_source = usage_data_source
         self.databricks_api_page_size = databricks_api_page_size
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(

@@ -182,6 +185,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             ),
             "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
             "access_token": self._workspace_client.config.token,
+            "user_agent_entry": "datahub",
         }

     def check_basic_connectivity(self) -> bool:

@@ -401,6 +405,75 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 method, path, body={**body, "page_token": response["next_page_token"]}
             )

+    def get_query_history_via_system_tables(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+    ) -> Iterable[Query]:
+        """Get query history using system.query.history table.
+
+        This method provides an alternative to the REST API for fetching query history,
+        offering better performance and richer data for large query volumes.
+        """
+        logger.info(
+            f"Fetching query history from system.query.history for period: {start_time} to {end_time}"
+        )
+
+        allowed_types = [typ.value for typ in ALLOWED_STATEMENT_TYPES]
+        statement_type_filter = ", ".join(f"'{typ}'" for typ in allowed_types)
+
+        query = f"""
+            SELECT
+                statement_id,
+                statement_text,
+                statement_type,
+                start_time,
+                end_time,
+                executed_by,
+                executed_as,
+                executed_by_user_id,
+                executed_as_user_id
+            FROM system.query.history
+            WHERE
+                start_time >= %s
+                AND end_time <= %s
+                AND execution_status = 'FINISHED'
+                AND statement_type IN ({statement_type_filter})
+            ORDER BY start_time
+        """
+
+        try:
+            rows = self._execute_sql_query(query, (start_time, end_time))
+            for row in rows:
+                try:
+                    yield Query(
+                        query_id=row.statement_id,
+                        query_text=row.statement_text,
+                        statement_type=(
+                            QueryStatementType(row.statement_type)
+                            if row.statement_type
+                            else None
+                        ),
+                        start_time=row.start_time,
+                        end_time=row.end_time,
+                        user_id=row.executed_by_user_id,
+                        user_name=row.executed_by,
+                        executed_as_user_id=row.executed_as_user_id,
+                        executed_as_user_name=row.executed_as,
+                    )
+                except Exception as e:
+                    logger.warning(f"Error parsing query from system table: {e}")
+                    self.report.report_warning("query-parse-system-table", str(e))
+        except Exception as e:
+            logger.error(
+                f"Error fetching query history from system tables: {e}", exc_info=True
+            )
+            self.report.report_failure(
+                title="Failed to fetch query history from system tables",
+                message="Error querying system.query.history table",
+                context=f"Query period: {start_time} to {end_time}",
+            )
+
     def _build_datetime_where_conditions(
         self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
     ) -> str: