acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (38)
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/METADATA +2547 -2547
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/RECORD +38 -35
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/api/report.py +183 -35
  5. datahub/ingestion/autogenerated/capability_summary.json +3366 -0
  6. datahub/ingestion/autogenerated/lineage.json +401 -0
  7. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  8. datahub/ingestion/run/pipeline.py +4 -1
  9. datahub/ingestion/source/bigquery_v2/bigquery.py +23 -22
  10. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  11. datahub/ingestion/source/common/subtypes.py +1 -1
  12. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  13. datahub/ingestion/source/dremio/dremio_source.py +6 -3
  14. datahub/ingestion/source/gcs/gcs_source.py +4 -1
  15. datahub/ingestion/source/ge_data_profiler.py +28 -20
  16. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  17. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  18. datahub/ingestion/source/redshift/usage.py +4 -3
  19. datahub/ingestion/source/s3/source.py +19 -3
  20. datahub/ingestion/source/snowflake/snowflake_queries.py +47 -3
  21. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  22. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  23. datahub/ingestion/source/unity/proxy.py +4 -3
  24. datahub/ingestion/source/unity/source.py +10 -8
  25. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  26. datahub/metadata/_internal_schema_classes.py +85 -4
  27. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  28. datahub/metadata/schema.avsc +54 -1
  29. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  30. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  31. datahub/sdk/lineage_client.py +2 -0
  32. datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
  33. datahub/sql_parsing/sqlglot_lineage.py +2 -0
  34. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  35. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/WHEEL +0 -0
  36. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/entry_points.txt +0 -0
  37. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/licenses/LICENSE +0 -0
  38. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/data_lake_common/object_store.py

@@ -519,6 +519,13 @@ class ObjectStoreSourceAdapter:
  "get_external_url",
  lambda table_data: self.get_gcs_external_url(table_data),
  )
+ # Fix URI mismatch issue in pattern matching
+ self.register_customization(
+ "_normalize_uri_for_pattern_matching",
+ self._normalize_gcs_uri_for_pattern_matching,
+ )
+ # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+ self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
  elif platform == "s3":
  self.register_customization("is_s3_platform", lambda: True)
  self.register_customization("create_s3_path", self.create_s3_path)

@@ -612,6 +619,39 @@ class ObjectStoreSourceAdapter:
  return self.get_abs_external_url(table_data)
  return None

+ def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+ """
+ Normalize GCS URI for pattern matching.
+
+ This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+ fixing the URI mismatch issue in GCS ingestion.
+
+ Args:
+ uri: The URI to normalize
+
+ Returns:
+ The normalized URI for pattern matching
+ """
+ if uri.startswith("gs://"):
+ return uri.replace("gs://", "s3://", 1)
+ return uri
+
+ def _strip_gcs_prefix(self, uri: str) -> str:
+ """
+ Strip GCS prefix from URI.
+
+ This method removes the gs:// prefix from GCS URIs for path processing.
+
+ Args:
+ uri: The URI to strip the prefix from
+
+ Returns:
+ The URI without the gs:// prefix
+ """
+ if uri.startswith("gs://"):
+ return uri[5:]  # Remove "gs://" prefix
+ return uri
+

  # Factory function to create an adapter for a specific platform
  def create_object_store_adapter(
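
The two customizations registered above change how GCS paths flow through the shared S3 code path. An illustrative, package-independent sketch of the behavior (the function names below are local stand-ins, not the package API):

def normalize_gcs_uri_for_pattern_matching(uri: str) -> str:
    # gs:// URIs are rewritten to s3:// only for path_spec pattern matching,
    # so patterns written against the shared S3 code path still match GCS files.
    return uri.replace("gs://", "s3://", 1) if uri.startswith("gs://") else uri

def strip_gcs_prefix(uri: str) -> str:
    # The scheme is removed entirely when building browse paths.
    return uri[len("gs://"):] if uri.startswith("gs://") else uri

print(normalize_gcs_uri_for_pattern_matching("gs://my-bucket/events/2024/part-0.parquet"))
# -> s3://my-bucket/events/2024/part-0.parquet
print(strip_gcs_prefix("gs://my-bucket/events/2024/part-0.parquet"))
# -> my-bucket/events/2024/part-0.parquet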

datahub/ingestion/source/dremio/dremio_source.py

@@ -261,9 +261,12 @@ class DremioSource(StatefulIngestionSourceBase):

  # Profiling
  if self.config.is_profiling_enabled():
- with self.report.new_stage(PROFILING), ThreadPoolExecutor(
- max_workers=self.config.profiling.max_workers
- ) as executor:
+ with (
+ self.report.new_stage(PROFILING),
+ ThreadPoolExecutor(
+ max_workers=self.config.profiling.max_workers
+ ) as executor,
+ ):
  future_to_dataset = {
  executor.submit(self.generate_profiles, dataset): dataset
  for dataset in datasets
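
This hunk, like several others in this release (Redshift usage, Snowflake usage, Unity proxy, GE profiler), only regroups chained context managers into the parenthesized with-statement form, which needs Python 3.10+. A minimal runnable sketch of the pattern with stand-in context managers:

from concurrent.futures import ThreadPoolExecutor
from contextlib import nullcontext

with (
    nullcontext(),  # stand-in for report.new_stage(PROFILING)
    ThreadPoolExecutor(max_workers=4) as executor,
):
    print(executor.submit(sum, [1, 2, 3]).result())  # prints 6

The parenthesized form avoids backslash continuations and nested with blocks while keeping the same runtime behavior.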

datahub/ingestion/source/gcs/gcs_source.py

@@ -112,6 +112,7 @@ class GCSSource(StatefulIngestionSourceBase):
  env=self.config.env,
  max_rows=self.config.max_rows,
  number_of_files_to_sample=self.config.number_of_files_to_sample,
+ platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
  )
  return s3_config

@@ -138,7 +139,9 @@ class GCSSource(StatefulIngestionSourceBase):

  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
  config = self.create_equivalent_s3_config()
- s3_source = S3Source(config, PipelineContext(ctx.run_id))
+ # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+ s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+ s3_source = S3Source(config, s3_ctx)
  return self.s3_source_overrides(s3_source)

  def s3_source_overrides(self, source: S3Source) -> S3Source:
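
A hedged sketch of the context isolation applied above: the embedded S3Source receives a fresh PipelineContext that carries only the run and pipeline identity (no graph client), so it does not register a second stateful-ingestion checkpointer next to the wrapping GCS source. The helper name below is hypothetical.

from datahub.ingestion.api.common import PipelineContext

def make_embedded_ctx(outer: PipelineContext) -> PipelineContext:
    # Copy identity only; deliberately leave out the graph connection.
    return PipelineContext(run_id=outer.run_id, pipeline_name=outer.pipeline_name)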

datahub/ingestion/source/ge_data_profiler.py

@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
  f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
  )

- with PerfTimer() as timer, unittest.mock.patch(
- "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
- get_column_unique_count_dh_patch,
- ), unittest.mock.patch(
- "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
- _get_column_quantiles_bigquery_patch,
- ), unittest.mock.patch(
- "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
- _get_column_quantiles_awsathena_patch,
- ), unittest.mock.patch(
- "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
- _get_column_median_patch,
- ), concurrent.futures.ThreadPoolExecutor(
- max_workers=max_workers
- ) as async_executor, SQLAlchemyQueryCombiner(
- enabled=self.config.query_combiner_enabled,
- catch_exceptions=self.config.catch_exceptions,
- is_single_row_query_method=_is_single_row_query_method,
- serial_execution_fallback_enabled=True,
- ).activate() as query_combiner:
+ with (
+ PerfTimer() as timer,
+ unittest.mock.patch(
+ "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+ get_column_unique_count_dh_patch,
+ ),
+ unittest.mock.patch(
+ "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+ _get_column_quantiles_bigquery_patch,
+ ),
+ unittest.mock.patch(
+ "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+ _get_column_quantiles_awsathena_patch,
+ ),
+ unittest.mock.patch(
+ "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+ _get_column_median_patch,
+ ),
+ concurrent.futures.ThreadPoolExecutor(
+ max_workers=max_workers
+ ) as async_executor,
+ SQLAlchemyQueryCombiner(
+ enabled=self.config.query_combiner_enabled,
+ catch_exceptions=self.config.catch_exceptions,
+ is_single_row_query_method=_is_single_row_query_method,
+ serial_execution_fallback_enabled=True,
+ ).activate() as query_combiner,
+ ):
  # Submit the profiling requests to the thread pool executor.
  async_profiles = collections.deque(
  async_executor.submit(

datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
  get_platform_from_sqlalchemy_uri,
  )

+ logger = logging.getLogger(__name__)
+

  @dataclass
  class ConfluentJDBCSourceConnector(BaseConnector):

@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
  db_connection_url=connector_manifest.config.get("connection.uri"),
  source_platform="mongodb",
  database_name=connector_manifest.config.get("database"),
- topic_prefix=connector_manifest.config.get("topic_prefix"),
+ topic_prefix=connector_manifest.config.get("topic.prefix"),
  transforms=(
  connector_manifest.config["transforms"].split(",")
  if "transforms" in connector_manifest.config

@@ -406,7 +408,11 @@ class MongoSourceConnector(BaseConnector):
  lineages: List[KafkaConnectLineage] = list()
  parser = self.get_parser(self.connector_manifest)
  source_platform = parser.source_platform
- topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+ topic_prefix = parser.topic_prefix or ""
+
+ # Escape topic_prefix to handle cases where it contains dots
+ # Some users configure topic.prefix like "my.mongodb" which breaks the regex
+ topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

  if not self.connector_manifest.topic_names:
  return lineages
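
An illustrative check of the escaped prefix pattern (not part of the package; the topic names are made up):

import re

topic_prefix = "my.mongodb"  # a prefix that contains dots
pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

m = re.search(pattern, "my.mongodb.sales.customers")
print(m.group(1), m.group(2))  # sales customers

# Without re.escape, each dot in the prefix matches any character,
# so an unrelated topic would also match:
print(bool(re.search(r"my.mongodb\.(\w+)\.(\w+)", "myXmongodb.sales.customers")))  # True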

datahub/ingestion/source/kafka_connect/source_connectors.py (continued)

@@ -429,6 +435,26 @@ class MongoSourceConnector(BaseConnector):

  @dataclass
  class DebeziumSourceConnector(BaseConnector):
+ # Debezium topic naming patterns by connector type
+ # - MySQL: {topic.prefix}.{database}.{table}
+ # - PostgreSQL: {topic.prefix}.{schema}.{table}
+ # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
+ # - Oracle: {topic.prefix}.{schema}.{table}
+ # - DB2: {topic.prefix}.{schema}.{table}
+ # - MongoDB: {topic.prefix}.{database}.{collection}
+ # - Vitess: {topic.prefix}.{keyspace}.{table}
+
+ # Note SQL Server allows for "database.names" (multiple databases) config,
+ # and so database is in the topic naming pattern.
+ # However, others have "database.dbname" which is a single database name. For these connectors,
+ # additional databases would require a different connector instance
+
+ # Connectors with 2-level container in pattern (database + schema)
+ # Others have either database XOR schema, but not both
+ DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
+ "io.debezium.connector.sqlserver.SqlServerConnector",
+ }
+
  @dataclass
  class DebeziumParser:
  source_platform: str

@@ -514,16 +540,45 @@ class DebeziumSourceConnector(BaseConnector):
  source_platform = parser.source_platform
  server_name = parser.server_name
  database_name = parser.database_name
- topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+ # Escape server_name to handle cases where topic.prefix contains dots
+ # Some users configure topic.prefix like "my.server" which breaks the regex
+ server_name = server_name or ""
+ # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
+ topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

  if not self.connector_manifest.topic_names:
  return lineages

+ # Handle connectors with 2-level container (database + schema) in topic pattern
+ connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
+ maybe_duplicated_database_name = (
+ connector_class
+ in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
+ )
+
  for topic in self.connector_manifest.topic_names:
  found = re.search(re.compile(topic_naming_pattern), topic)
+ logger.debug(
+ f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
+ )

  if found:
- table_name = get_dataset_name(database_name, found.group(2))
+ # Extract the table part after server_name
+ table_part = found.group(2)
+
+ if (
+ maybe_duplicated_database_name
+ and database_name
+ and table_part.startswith(f"{database_name}.")
+ ):
+ table_part = table_part[len(database_name) + 1 :]
+
+ logger.debug(
+ f"Extracted table part: '{table_part}' from topic '{topic}'"
+ )
+ # Apply database name to create final dataset name
+ table_name = get_dataset_name(database_name, table_part)
+ logger.debug(f"Final table name: '{table_name}'")

  lineage = KafkaConnectLineage(
  source_dataset=table_name,
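
An illustrative check of the widened Debezium pattern and the SQL Server database stripping (topic names and identifiers are made up):

import re

server_name = "my.server"
pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

# group(2) may be schema.table (2 parts) or database.schema.table (3 parts).
print(re.search(pattern, "my.server.public.orders").group(2))       # public.orders
print(re.search(pattern, "my.server.salesdb.dbo.orders").group(2))  # salesdb.dbo.orders

# For SQL Server, the leading database name is stripped before building the
# dataset name, mirroring the table_part handling above.
table_part, database_name = "salesdb.dbo.orders", "salesdb"
if table_part.startswith(f"{database_name}."):
    table_part = table_part[len(database_name) + 1:]
print(table_part)  # dbo.orders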

datahub/ingestion/source/mock_data/datahub_mock_data.py

@@ -21,9 +21,13 @@ from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
  )
  from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
  from datahub.metadata.schema_classes import (
+ CalendarIntervalClass,
  DatasetLineageTypeClass,
+ DatasetProfileClass,
+ DatasetUsageStatisticsClass,
  StatusClass,
  SubTypesClass,
+ TimeWindowSizeClass,
  UpstreamClass,
  UpstreamLineageClass,
  )

@@ -278,6 +282,10 @@ class DataHubMockDataSource(Source):

  yield self._get_subtypes_aspect(table_name, i, j)

+ yield self._get_profile_aspect(table_name)
+
+ yield self._get_usage_aspect(table_name)
+
  yield from self._generate_lineage_for_table(
  table_name=table_name,
  table_level=i,

@@ -381,5 +389,42 @@ class DataHubMockDataSource(Source):
  )
  return mcp.as_workunit()

+ def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
+ urn = make_dataset_urn(
+ platform="fake",
+ name=table,
+ )
+ mcp = MetadataChangeProposalWrapper(
+ entityUrn=urn,
+ entityType="dataset",
+ aspect=DatasetProfileClass(
+ timestampMillis=0,
+ rowCount=100,
+ columnCount=10,
+ sizeInBytes=1000,
+ ),
+ )
+ return mcp.as_workunit()
+
+ def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
+ urn = make_dataset_urn(
+ platform="fake",
+ name=table,
+ )
+ mcp = MetadataChangeProposalWrapper(
+ entityUrn=urn,
+ entityType="dataset",
+ aspect=DatasetUsageStatisticsClass(
+ timestampMillis=0,
+ eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
+ uniqueUserCount=0,
+ totalSqlQueries=0,
+ topSqlQueries=[],
+ userCounts=[],
+ fieldCounts=[],
+ ),
+ )
+ return mcp.as_workunit()
+
  def get_report(self) -> SourceReport:
  return self.report

datahub/ingestion/source/redshift/usage.py

@@ -182,9 +182,10 @@ class RedshiftUsageExtractor:
  self.report.num_operational_stats_filtered = 0

  if self.config.include_operational_stats:
- with self.report.new_stage(
- USAGE_EXTRACTION_OPERATIONAL_STATS
- ), PerfTimer() as timer:
+ with (
+ self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
+ PerfTimer() as timer,
+ ):
  # Generate operation aspect workunits
  yield from self._gen_operation_aspect_workunits(
  self.connection, all_tables

datahub/ingestion/source/s3/source.py

@@ -682,7 +682,7 @@ class S3Source(StatefulIngestionSourceBase):

  logger.info(f"Extracting table schema from file: {table_data.full_path}")
  browse_path: str = (
- strip_s3_prefix(table_data.table_path)
+ self.strip_s3_prefix(table_data.table_path)
  if self.is_s3_platform()
  else table_data.table_path.strip("/")
  )

@@ -949,7 +949,10 @@ class S3Source(StatefulIngestionSourceBase):
  """

  def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
- allowed = path_spec_.allowed(s3_uri)
+ # Normalize URI for pattern matching
+ normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+ allowed = path_spec_.allowed(normalized_uri)
  if not allowed:
  logger.debug(f"File {s3_uri} not allowed and skipping")
  self.report.report_file_dropped(s3_uri)

@@ -1394,8 +1397,13 @@ class S3Source(StatefulIngestionSourceBase):
  )
  table_dict: Dict[str, TableData] = {}
  for browse_path in file_browser:
+ # Normalize URI for pattern matching
+ normalized_file_path = self._normalize_uri_for_pattern_matching(
+ browse_path.file
+ )
+
  if not path_spec.allowed(
- browse_path.file,
+ normalized_file_path,
  ignore_ext=self.is_s3_platform()
  and self.source_config.use_s3_content_type,
  ):

@@ -1471,5 +1479,13 @@ class S3Source(StatefulIngestionSourceBase):
  def is_s3_platform(self):
  return self.source_config.platform == "s3"

+ def strip_s3_prefix(self, s3_uri: str) -> str:
+ """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+ return strip_s3_prefix(s3_uri)
+
+ def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+ """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+ return uri
+
  def get_report(self):
  return self.report

datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -44,6 +44,11 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
  SnowflakeIdentifierBuilder,
  SnowflakeStructuredReportMixin,
  )
+ from datahub.ingestion.source.snowflake.stored_proc_lineage import (
+ StoredProcCall,
+ StoredProcLineageReport,
+ StoredProcLineageTracker,
+ )
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
  from datahub.metadata.urns import CorpUserUrn
  from datahub.sql_parsing.schema_resolver import SchemaResolver

@@ -130,6 +135,7 @@ class SnowflakeQueriesExtractorReport(Report):
  aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

  sql_aggregator: Optional[SqlAggregatorReport] = None
+ stored_proc_lineage: Optional[StoredProcLineageReport] = None

  num_ddl_queries_dropped: int = 0
  num_stream_queries_observed: int = 0

@@ -261,6 +267,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
  TableRename,
  TableSwap,
  ObservedQuery,
+ StoredProcCall,
  ]
  ] = self._exit_stack.enter_context(FileBackedList(shared_connection))

@@ -277,12 +284,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
  for entry in self.fetch_query_log(users):
  queries.append(entry)

+ stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
+ StoredProcLineageTracker(
+ platform=self.identifiers.platform,
+ shared_connection=shared_connection,
+ )
+ )
+ self.report.stored_proc_lineage = stored_proc_tracker.report
+
  with self.report.audit_log_load_timer:
  for i, query in enumerate(queries):
  if i % 1000 == 0:
  logger.info(f"Added {i} query log entries to SQL aggregator")

- self.aggregator.add(query)
+ if isinstance(query, StoredProcCall):
+ stored_proc_tracker.add_stored_proc_call(query)
+ continue
+
+ if not (
+ isinstance(query, PreparsedQuery)
+ and stored_proc_tracker.add_related_query(query)
+ ):
+ # Only add to aggregator if it's not part of a stored procedure.
+ self.aggregator.add(query)
+
+ # Generate and add stored procedure lineage entries.
+ for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
+ # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
+ self.aggregator.add(lineage_entry)

  with self.report.aggregator_generate_timer:
  yield from auto_workunit(self.aggregator.gen_metadata())

@@ -342,7 +371,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

  def fetch_query_log(
  self, users: UsersMapping
- ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
+ ) -> Iterable[
+ Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
+ ]:
  query_log_query = _build_enriched_query_log_query(
  start_time=self.config.window.start_time,
  end_time=self.config.window.end_time,

@@ -382,7 +413,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

  def _parse_audit_log_row(
  self, row: Dict[str, Any], users: UsersMapping
- ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
+ ) -> Optional[
+ Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
+ ]:
  json_fields = {
  "DIRECT_OBJECTS_ACCESSED",
  "OBJECTS_MODIFIED",

@@ -482,6 +515,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
  extra_info=extra_info,
  )

+ if snowflake_query_type == "CALL" and res["root_query_id"] is None:
+ return StoredProcCall(
+ # This is the top-level query ID that other entries will reference.
+ snowflake_root_query_id=res["query_id"],
+ query_text=query_text,
+ timestamp=timestamp,
+ user=user,
+ default_db=res["default_db"],
+ default_schema=res["default_schema"],
+ )
+
  upstreams = []
  column_usage = {}

datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):

  with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
  for row in results:
- with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
+ with (
+ fetch_timer.pause(),
+ self.report.usage_aggregation.result_skip_timer as skip_timer,
+ ):
  if results.rownumber is not None and results.rownumber % 1000 == 0:
  logger.debug(f"Processing usage row number {results.rownumber}")
  logger.debug(self.report.usage_aggregation.as_string())

@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
  f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
  )
  continue
- with skip_timer.pause(), self.report.usage_aggregation.result_map_timer as map_timer:
+ with (
+ skip_timer.pause(),
+ self.report.usage_aggregation.result_map_timer as map_timer,
+ ):
  wu = self.build_usage_statistics_for_dataset(
  dataset_identifier, row
  )

datahub/ingestion/source/snowflake/stored_proc_lineage.py (new file)

@@ -0,0 +1,143 @@
+ import dataclasses
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Any, Iterable, List, Optional
+
+ from datahub.ingestion.api.closeable import Closeable
+ from datahub.metadata.urns import CorpUserUrn
+ from datahub.sql_parsing.sql_parsing_aggregator import (
+ PreparsedQuery,
+ UrnStr,
+ )
+ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+ from datahub.utilities.file_backed_collections import FileBackedDict
+
+
+ @dataclasses.dataclass
+ class StoredProcCall:
+ snowflake_root_query_id: str
+
+ # Query text will typically be something like:
+ # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
+ query_text: str
+
+ timestamp: datetime
+ user: CorpUserUrn
+ default_db: str
+ default_schema: str
+
+
+ @dataclass
+ class StoredProcExecutionLineage:
+ call: StoredProcCall
+
+ inputs: List[UrnStr]
+ outputs: List[UrnStr]
+
+
+ @dataclass
+ class StoredProcLineageReport:
+ num_stored_proc_calls: int = 0
+ num_related_queries: int = 0
+ num_related_queries_without_proc_call: int = 0
+
+ # Incremented at generation/build time.
+ num_stored_proc_lineage_entries: int = 0
+ num_stored_proc_calls_with_no_inputs: int = 0
+ num_stored_proc_calls_with_no_outputs: int = 0
+
+
+ class StoredProcLineageTracker(Closeable):
+ """
+ Tracks table-level lineage for Snowflake stored procedures.
+
+ Stored procedures in Snowflake trigger multiple SQL queries during execution.
+ Snowflake assigns each stored procedure call a unique query_id and uses this as the
+ root_query_id for all subsequent queries executed within that procedure. This allows
+ us to trace which queries belong to a specific stored procedure execution and build
+ table-level lineage by aggregating inputs/outputs from all related queries.
+ """
+
+ def __init__(self, platform: str, shared_connection: Optional[Any] = None):
+ self.platform = platform
+ self.report = StoredProcLineageReport()
+
+ # { root_query_id -> StoredProcExecutionLineage }
+ self._stored_proc_execution_lineage: FileBackedDict[
+ StoredProcExecutionLineage
+ ] = FileBackedDict(shared_connection)
+
+ def add_stored_proc_call(self, call: StoredProcCall) -> None:
+ """Add a stored procedure call to track."""
+ self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
+ StoredProcExecutionLineage(
+ call=call,
+ # Will be populated by subsequent queries.
+ inputs=[],
+ outputs=[],
+ )
+ )
+ self.report.num_stored_proc_calls += 1
+
+ def add_related_query(self, query: PreparsedQuery) -> bool:
+ """Add a query that might be related to a stored procedure execution.
+
+ Returns True if the query was added to a stored procedure execution, False otherwise.
+ """
+ snowflake_root_query_id = (query.extra_info or {}).get(
+ "snowflake_root_query_id"
+ )
+
+ if snowflake_root_query_id:
+ if snowflake_root_query_id not in self._stored_proc_execution_lineage:
+ self.report.num_related_queries_without_proc_call += 1
+ return False
+
+ stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
+ snowflake_root_query_id
+ )
+ stored_proc_execution.inputs.extend(query.upstreams)
+ if query.downstream is not None:
+ stored_proc_execution.outputs.append(query.downstream)
+ self.report.num_related_queries += 1
+ return True
+
+ return False
+
+ def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
+ # For stored procedures, we can only get table-level lineage from the audit log.
+ # We represent these as PreparsedQuery objects for now. Eventually we'll want to
+ # create dataJobInputOutput lineage instead.
+
+ for stored_proc_execution in self._stored_proc_execution_lineage.values():
+ if not stored_proc_execution.inputs:
+ self.report.num_stored_proc_calls_with_no_inputs += 1
+ continue
+
+ if not stored_proc_execution.outputs:
+ self.report.num_stored_proc_calls_with_no_outputs += 1
+ # Still continue to generate lineage for cases where we have inputs but no outputs
+
+ for downstream in stored_proc_execution.outputs:
+ stored_proc_query_id = get_query_fingerprint(
+ stored_proc_execution.call.query_text,
+ self.platform,
+ fast=True,
+ secondary_id=downstream,
+ )
+
+ lineage_entry = PreparsedQuery(
+ query_id=stored_proc_query_id,
+ query_text=stored_proc_execution.call.query_text,
+ upstreams=stored_proc_execution.inputs,
+ downstream=downstream,
+ query_count=0,
+ user=stored_proc_execution.call.user,
+ timestamp=stored_proc_execution.call.timestamp,
+ )
+
+ self.report.num_stored_proc_lineage_entries += 1
+ yield lineage_entry
+
+ def close(self) -> None:
+ self._stored_proc_execution_lineage.close()
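
A hedged usage sketch of the new tracker, tying the pieces above together. The identifiers, URNs, and field values are invented for illustration, and PreparsedQuery is assumed to accept the fields used in this diff (upstreams, downstream, extra_info):

from datetime import datetime, timezone

from datahub.ingestion.source.snowflake.stored_proc_lineage import (
    StoredProcCall,
    StoredProcLineageTracker,
)
from datahub.metadata.urns import CorpUserUrn
from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery

tracker = StoredProcLineageTracker(platform="snowflake")
tracker.add_stored_proc_call(
    StoredProcCall(
        snowflake_root_query_id="root-123",  # made-up query id
        query_text="CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();",
        timestamp=datetime.now(timezone.utc),
        user=CorpUserUrn("etl_user"),
        default_db="SALES",
        default_schema="PUBLIC",
    )
)

# A query executed inside the procedure carries the root query id in extra_info,
# so it is routed to the tracker instead of the SQL aggregator.
child = PreparsedQuery(
    query_id="child-456",
    query_text="INSERT INTO sales.forecast SELECT * FROM sales.raw_orders",
    upstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.raw_orders,PROD)"],
    downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.forecast,PROD)",
    extra_info={"snowflake_root_query_id": "root-123"},
)
assert tracker.add_related_query(child)

# The CALL and its child queries collapse into one table-level lineage entry.
merged = list(tracker.build_merged_lineage_entries())
assert len(merged) == 1 and merged[0].downstream == child.downstream
tracker.close()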

datahub/ingestion/source/unity/proxy.py

@@ -507,9 +507,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
  def _execute_sql_query(self, query: str) -> List[List[str]]:
  """Execute SQL query using databricks-sql connector for better performance"""
  try:
- with connect(
- **self._sql_connection_params
- ) as connection, connection.cursor() as cursor:
+ with (
+ connect(**self._sql_connection_params) as connection,
+ connection.cursor() as cursor,
+ ):
  cursor.execute(query)
  return cursor.fetchall()