acryl-datahub 0.15.0.5rc9__py3-none-any.whl → 0.15.0.6rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (46)
  1. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2431 -2431
  2. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +46 -45
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/graph/client.py +2 -1
  5. datahub/ingestion/graph/entity_versioning.py +201 -0
  6. datahub/ingestion/source/abs/report.py +2 -2
  7. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  8. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/delta_lake/report.py +2 -2
  11. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  12. datahub/ingestion/source/elastic_search.py +2 -1
  13. datahub/ingestion/source/ge_profiling_config.py +11 -7
  14. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  15. datahub/ingestion/source/identity/okta.py +2 -1
  16. datahub/ingestion/source/kafka/kafka.py +2 -1
  17. datahub/ingestion/source/kafka_connect/common.py +2 -1
  18. datahub/ingestion/source/ldap.py +2 -1
  19. datahub/ingestion/source/looker/lookml_config.py +9 -5
  20. datahub/ingestion/source/mongodb.py +2 -1
  21. datahub/ingestion/source/nifi.py +2 -1
  22. datahub/ingestion/source/powerbi/config.py +3 -2
  23. datahub/ingestion/source/powerbi/powerbi.py +28 -3
  24. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
  25. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
  26. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
  27. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  28. datahub/ingestion/source/redash.py +5 -5
  29. datahub/ingestion/source/salesforce.py +4 -1
  30. datahub/ingestion/source/snowflake/constants.py +1 -0
  31. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
  33. datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
  34. datahub/ingestion/source/snowflake/snowflake_report.py +8 -1
  35. datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
  36. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
  37. datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
  38. datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
  39. datahub/ingestion/source/tableau/tableau.py +2 -1
  40. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  41. datahub/ingestion/source/unity/report.py +1 -0
  42. datahub/ingestion/source_report/pulsar.py +5 -4
  43. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
  44. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/ge_profiling_config.py
@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.",
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )
 
     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days. If set to `null`, no constraint of last modified time for tables to profile. Supported only in `snowflake` and `BigQuery`.",
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )
 
     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on calculated size from gathered stats.",
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )
 
     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count. If set to `null`, "
-        "no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on gathered stats.",
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )
 
     profile_table_row_count_estimate_only: bool = Field(
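For reference, the four limits above act as independent gates on whether a table gets profiled. A minimal sketch of combining them, assuming the remaining GEProfilingConfig fields keep their defaults (the values below are illustrative, not taken from this release):

    from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

    # Illustrative values only; each limit is checked independently.
    profiling = GEProfilingConfig(
        max_number_of_fields_to_profile=50,   # at most 50 columns per table
        profile_if_updated_since_days=7,      # skip tables untouched for a week (Snowflake/BigQuery)
        profile_table_size_limit=5,           # GB (Snowflake/BigQuery/Databricks; Oracle via stats)
        profile_table_row_limit=5_000_000,    # rows (Snowflake/BigQuery; Oracle via stats)
    )
    print(profiling.max_number_of_fields_to_profile)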
datahub/ingestion/source/iceberg/iceberg_common.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
 
 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
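Most of the report changes in this release follow a single pattern: plain List[str]/Set[str] report fields are replaced with LossyList/LossySet from datahub.utilities.lossy_collections, which retain only a bounded sample of entries so that ingestion reports for very large sources stay small. A minimal sketch of the pattern as it is used in these reports (the exact sampling/truncation behavior is an assumption about the utility):

    from dataclasses import dataclass, field

    from datahub.utilities.lossy_collections import LossyList


    @dataclass
    class ExampleSourceReport:
        # Callers use LossyList[str] like a regular list, but only a sample of
        # entries is retained once it grows large (assumed utility behavior).
        filtered: LossyList[str] = field(default_factory=LossyList)

        def report_dropped(self, name: str) -> None:
            self.filtered.append(name)


    report = ExampleSourceReport()
    for i in range(100_000):
        report.report_dropped(f"db.schema.table_{i}")
    # Serializing the report now emits a truncated sample instead of 100k names.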
datahub/ingestion/source/identity/okta.py
@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
 
 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/kafka/kafka.py
@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1
datahub/ingestion/source/kafka_connect/common.py
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1
datahub/ingestion/source/ldap.py
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns: List[str] = dataclasses.field(default_factory=list)
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)
 
     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)
datahub/ingestion/source/looker/lookml_config.py
@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, Literal, Optional, Union
 
 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped: List[str] = dataclass_field(default_factory=LossyList)
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped: List[str] = dataclass_field(default_factory=LossyList)
-    views_dropped_unreachable: List[str] = dataclass_field(default_factory=LossyList)
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views: List[str] = dataclass_field(default_factory=LossyList)
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None
 
     def report_models_scanned(self) -> None:
datahub/ingestion/source/mongodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -143,7 +144,7 @@ class MongoDBConfig(
 
 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/nifi.py
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/powerbi/config.py
@@ -132,6 +132,7 @@ class Constant:
     ACTIVE = "Active"
     SQL_PARSING_FAILURE = "SQL Parsing Failure"
     M_QUERY_NULL = '"null"'
+    REPORT_WEB_URL = "reportWebUrl"
 
 
 @dataclass
@@ -195,8 +196,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards: List[str] = dataclass_field(default_factory=list)
-    filtered_charts: List[str] = dataclass_field(default_factory=list)
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0
datahub/ingestion/source/powerbi/powerbi.py
@@ -582,8 +582,11 @@ class Mapper:
         if tile.dataset is not None and tile.dataset.webUrl is not None:
             custom_properties[Constant.DATASET_WEB_URL] = tile.dataset.webUrl
 
-        if tile.report is not None and tile.report.id is not None:
-            custom_properties[Constant.REPORT_ID] = tile.report.id
+        if tile.report_id is not None:
+            custom_properties[Constant.REPORT_ID] = tile.report_id
+
+        if tile.report is not None and tile.report.webUrl is not None:
+            custom_properties[Constant.REPORT_WEB_URL] = tile.report.webUrl
 
         return custom_properties
 
@@ -1053,6 +1056,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1074,6 +1078,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            dashboards=dashboard_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1167,8 +1172,28 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+        # find all dashboards with a Tile referencing this report
+        downstream_dashboards_edges = []
+        for d in workspace.dashboards.values():
+            if any(t.report_id == report.id for t in d.tiles):
+                dashboard_urn = builder.make_dashboard_urn(
+                    platform=self.__config.platform_name,
+                    platform_instance=self.__config.platform_instance,
+                    name=d.get_urn_part(),
+                )
+                edge = EdgeClass(
+                    destinationUrn=dashboard_urn,
+                    sourceUrn=None,
+                    created=None,
+                    lastModified=None,
+                    properties=None,
+                )
+                downstream_dashboards_edges.append(edge)
+
         # Let's convert report to datahub dashboard
-        report_mcps = self.report_to_dashboard(workspace, report, chart_mcps, user_mcps)
+        report_mcps = self.report_to_dashboard(
+            workspace, report, chart_mcps, user_mcps, downstream_dashboards_edges
+        )
 
         # Now add MCPs in sequence
         mcps.extend(ds_mcps)
datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py
@@ -286,11 +286,15 @@ class Tile:
     id: str
     title: str
     embedUrl: str
-    dataset: Optional["PowerBIDataset"]
     dataset_id: Optional[str]
-    report: Optional[Report]
+    report_id: Optional[str]
     createdFrom: CreatedFrom
 
+    # In a first pass, `dataset_id` and/or `report_id` are filled in.
+    # In a subsequent pass, the objects are populated.
+    dataset: Optional["PowerBIDataset"]
+    report: Optional[Report]
+
     def get_urn_part(self):
         return f"charts.{self.id}"
 
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
@@ -337,41 +337,6 @@ class DataResolverBase(ABC):
         -tiles), there is no information available on pagination
 
         """
-
-        def new_dataset_or_report(tile_instance: Any) -> dict:
-            """
-            Find out which is the data source for tile. It is either REPORT or DATASET
-            """
-            report_fields = {
-                Constant.REPORT: (
-                    self.get_report(
-                        workspace=workspace,
-                        report_id=tile_instance.get(Constant.REPORT_ID),
-                    )
-                    if tile_instance.get(Constant.REPORT_ID) is not None
-                    else None
-                ),
-                Constant.CREATED_FROM: Tile.CreatedFrom.UNKNOWN,
-            }
-
-            # reportId and datasetId are exclusive in tile_instance
-            # if datasetId is present that means tile is created from dataset
-            # if reportId is present that means tile is created from report
-            # if both i.e. reportId and datasetId are not present then tile is created from some visualization
-            if tile_instance.get(Constant.REPORT_ID) is not None:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.REPORT
-            elif tile_instance.get(Constant.DATASET_ID) is not None:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.DATASET
-            else:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.VISUALIZATION
-
-            title: Optional[str] = tile_instance.get(Constant.TITLE)
-            _id: Optional[str] = tile_instance.get(Constant.ID)
-            created_from: Any = report_fields[Constant.CREATED_FROM]
-            logger.info(f"Tile {title}({_id}) is created from {created_from}")
-
-            return report_fields
-
         tile_list_endpoint: str = self.get_tiles_endpoint(
             workspace, dashboard_id=dashboard.id
         )
@@ -393,8 +358,18 @@ class DataResolverBase(ABC):
                 title=instance.get(Constant.TITLE),
                 embedUrl=instance.get(Constant.EMBED_URL),
                 dataset_id=instance.get(Constant.DATASET_ID),
+                report_id=instance.get(Constant.REPORT_ID),
                 dataset=None,
-                **new_dataset_or_report(instance),
+                report=None,
+                createdFrom=(
+                    # In the past we considered that only one of the two report_id or dataset_id would be present
+                    # but we have seen cases where both are present. If both are present, we prioritize the report.
+                    Tile.CreatedFrom.REPORT
+                    if instance.get(Constant.REPORT_ID)
+                    else Tile.CreatedFrom.DATASET
+                    if instance.get(Constant.DATASET_ID)
+                    else Tile.CreatedFrom.VISUALIZATION
+                ),
             )
             for instance in tile_dict
             if instance is not None
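The removed new_dataset_or_report helper is collapsed into the inline conditional above, with an explicit priority when both ids are present. A standalone sketch of that precedence (the local CreatedFrom enum below is defined purely for illustration; it is not the real Tile.CreatedFrom):

    from enum import Enum
    from typing import Optional


    class CreatedFrom(Enum):  # illustration only
        REPORT = "Report"
        DATASET = "Dataset"
        VISUALIZATION = "Visualization"


    def created_from(report_id: Optional[str], dataset_id: Optional[str]) -> CreatedFrom:
        # Same precedence as the new resolver code: a report id wins when both
        # ids are present; neither id means the tile is a bare visualization.
        if report_id:
            return CreatedFrom.REPORT
        if dataset_id:
            return CreatedFrom.DATASET
        return CreatedFrom.VISUALIZATION


    assert created_from("report-1", "dataset-1") is CreatedFrom.REPORT
    assert created_from(None, "dataset-1") is CreatedFrom.DATASET
    assert created_from(None, None) is CreatedFrom.VISUALIZATION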
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py
@@ -625,13 +625,26 @@ class PowerBiAPI:
             dashboard.tiles = self._get_resolver().get_tiles(
                 workspace, dashboard=dashboard
             )
-            # set the dataset for tiles
+            # set the dataset and the report for tiles
             for tile in dashboard.tiles:
+                # In Power BI, dashboards, reports, and datasets are tightly scoped to the workspace they belong to.
+                # https://learn.microsoft.com/en-us/power-bi/collaborate-share/service-new-workspaces
+                if tile.report_id:
+                    tile.report = workspace.reports.get(tile.report_id)
+                    if tile.report is None:
+                        self.reporter.info(
+                            title="Missing Report Lineage For Tile",
+                            message="A Report reference that failed to be resolved. Please ensure that 'extract_reports' is set to True in the configuration.",
+                            context=f"workspace-name: {workspace.name}, tile-name: {tile.title}, report-id: {tile.report_id}",
+                        )
+                # However, semantic models (aka datasets) can be shared accross workspaces
+                # https://learn.microsoft.com/en-us/fabric/admin/portal-workspace#use-semantic-models-across-workspaces
+                # That's why the global 'dataset_registry' is required
                 if tile.dataset_id:
                     tile.dataset = self.dataset_registry.get(tile.dataset_id)
                     if tile.dataset is None:
                         self.reporter.info(
-                            title="Missing Lineage For Tile",
+                            title="Missing Dataset Lineage For Tile",
                             message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
                             context=f"workspace-name: {workspace.name}, tile-name: {tile.title}, dataset-id: {tile.dataset_id}",
                         )
@@ -653,10 +666,10 @@ class PowerBiAPI:
         for dashboard in workspace.dashboards.values():
             dashboard.tags = workspace.dashboard_endorsements.get(dashboard.id, [])
 
+        # fill reports first since some dashboard may reference a report
+        fill_reports()
         if self.__config.extract_dashboards:
             fill_dashboards()
-
-        fill_reports()
         fill_dashboard_tags()
         self._fill_independent_datasets(workspace=workspace)
 
datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList
 
 LOGGER = logging.getLogger(__name__)
 
@@ -476,7 +477,7 @@ class Mapper:
 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports: List[str] = dataclass_field(default_factory=list)
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count
datahub/ingestion/source/redash.py
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing: Set[str] = field(default_factory=set)
-    queries_no_dataset: Set[str] = field(default_factory=set)
-    charts_no_input: Set[str] = field(default_factory=set)
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )
datahub/ingestion/source/salesforce.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)
 
 
+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered: List[str] = []
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/snowflake/constants.py
@@ -53,6 +53,7 @@ class SnowflakeObjectDomain(StrEnum):
     SCHEMA = "schema"
     COLUMN = "column"
     ICEBERG_TABLE = "iceberg table"
+    STREAM = "stream"
 
 
 GENERIC_PERMISSION_ERROR_KEY = "permission-error"
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -98,6 +98,11 @@ class SnowflakeFilterConfig(SQLFilterConfig):
     )
     # table_pattern and view_pattern are inherited from SQLFilterConfig
 
+    stream_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for streams to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
     match_fully_qualified_names: bool = Field(
         default=False,
         description="Whether `schema_pattern` is matched against fully qualified schema name `<catalog>.<schema>`.",
@@ -274,6 +279,11 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )
 
+    include_streams: bool = Field(
+        default=True,
+        description="If enabled, streams will be ingested as separate entities from tables/views.",
+    )
+
     structured_property_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description=(
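The new stream_pattern follows the same AllowDenyPattern semantics as the existing table/view patterns, matched against the fully qualified database.schema.stream name. A small illustration (the datahub.configuration.common import path and the allowed() helper are the usual DataHub API and are assumed here, since they are not shown in this diff):

    from datahub.configuration.common import AllowDenyPattern

    # Mirrors the example regex given in the field description above.
    stream_pattern = AllowDenyPattern(allow=[r"Customer\.public\.customer.*"])

    print(stream_pattern.allowed("Customer.public.customer_changes"))  # True
    print(stream_pattern.allowed("Sales.public.orders_stream"))        # False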
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -49,6 +49,7 @@ from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownLineageMapping,
+    ObservedQuery,
     PreparsedQuery,
     SqlAggregatorReport,
     SqlParsingAggregator,
@@ -241,7 +242,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         use_cached_audit_log = audit_log_file.exists()
 
         queries: FileBackedList[
-            Union[KnownLineageMapping, PreparsedQuery, TableRename, TableSwap]
+            Union[
+                KnownLineageMapping,
+                PreparsedQuery,
+                TableRename,
+                TableSwap,
+                ObservedQuery,
+            ]
         ]
         if use_cached_audit_log:
             logger.info("Using cached audit log")
@@ -252,7 +259,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
             shared_connection = ConnectionWrapper(audit_log_file)
             queries = FileBackedList(shared_connection)
-            entry: Union[KnownLineageMapping, PreparsedQuery, TableRename, TableSwap]
+            entry: Union[
+                KnownLineageMapping,
+                PreparsedQuery,
+                TableRename,
+                TableSwap,
+                ObservedQuery,
+            ]
 
             with self.report.copy_history_fetch_timer:
                 for entry in self.fetch_copy_history():
@@ -329,7 +342,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
+    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
@@ -362,7 +375,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
+    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -398,6 +411,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 pass
             else:
                 return None
+
+        user = CorpUserUrn(
+            self.identifiers.get_user_identifier(
+                res["user_name"], users.get(res["user_name"])
+            )
+        )
+
+        # Use direct_objects_accessed instead objects_modified
+        # objects_modified returns $SYS_VIEW_X with no mapping
+        has_stream_objects = any(
+            obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
+        )
+
+        # If a stream is used, default to query parsing.
+        if has_stream_objects:
+            logger.debug("Found matching stream object")
+            return ObservedQuery(
+                query=res["query_text"],
+                session_id=res["session_id"],
+                timestamp=res["query_start_time"].astimezone(timezone.utc),
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
+                query_hash=get_query_fingerprint(
+                    res["query_text"], self.identifiers.platform, fast=True
+                ),
+            )
+
         upstreams = []
         column_usage = {}
 
@@ -460,12 +501,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )
 
-        user = CorpUserUrn(
-            self.identifiers.get_user_identifier(
-                res["user_name"], users.get(res["user_name"])
-            )
-        )
-
         timestamp: datetime = res["query_start_time"]
         timestamp = timestamp.astimezone(timezone.utc)
 
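In short, audit-log rows whose DIRECT_OBJECTS_ACCESSED mention a Stream object are no longer turned into a PreparsedQuery from the access-history columns; they are returned as an ObservedQuery so the SQL parser recovers lineage from the query text, since OBJECTS_MODIFIED only reports an internal $SYS_VIEW_X name for streams. A minimal illustration of the routing predicate (the sample row below is fabricated for illustration):

    # direct_objects_accessed stands in for the parsed DIRECT_OBJECTS_ACCESSED
    # JSON column; this sample row is fabricated for illustration only.
    direct_objects_accessed = [
        {"objectName": "MY_DB.PUBLIC.ORDERS_STREAM", "objectDomain": "Stream"},
        {"objectName": "MY_DB.PUBLIC.ORDERS", "objectDomain": "Table"},
    ]

    has_stream_objects = any(
        obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
    )

    # True -> the extractor returns an ObservedQuery and defers to query parsing.
    print(has_stream_objects)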
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -9,6 +9,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
 from datahub.utilities.prefix_batch_builder import PrefixGroup
 
 SHOW_VIEWS_MAX_PAGE_SIZE = 10000
+SHOW_STREAM_MAX_PAGE_SIZE = 10000
 
 
 def create_deny_regex_sql_filter(
@@ -36,6 +37,7 @@ class SnowflakeQuery:
         SnowflakeObjectDomain.VIEW.capitalize(),
         SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(),
         SnowflakeObjectDomain.ICEBERG_TABLE.capitalize(),
+        SnowflakeObjectDomain.STREAM.capitalize(),
     }
 
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
@@ -44,7 +46,8 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
         "("
         f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}'"
+        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
+        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
         ")"
     )
 
@@ -963,3 +966,19 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     @staticmethod
     def get_all_users() -> str:
         return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
+
+    @staticmethod
+    def streams_for_database(
+        db_name: str,
+        limit: int = SHOW_STREAM_MAX_PAGE_SIZE,
+        stream_pagination_marker: Optional[str] = None,
+    ) -> str:
+        # SHOW STREAMS can return a maximum of 10000 rows.
+        # https://docs.snowflake.com/en/sql-reference/sql/show-streams#usage-notes
+        assert limit <= SHOW_STREAM_MAX_PAGE_SIZE
+
+        # To work around this, we paginate through the results using the FROM clause.
+        from_clause = (
+            f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
+        )
+        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
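streams_for_database only builds the SQL, so the pagination loop lives in its caller. A hedged sketch of how a caller might page through SHOW STREAMS using the FROM marker (the connection/cursor API and the "name" column access are assumptions; the actual consumer of this query is not part of this excerpt):

    from typing import Any, Dict, List, Optional

    from datahub.ingestion.source.snowflake.snowflake_query import (
        SHOW_STREAM_MAX_PAGE_SIZE,
        SnowflakeQuery,
    )


    def fetch_all_streams(connection: Any, db_name: str) -> List[Dict[str, Any]]:
        # Assumes `connection.query()` returns an iterable of dict rows that
        # include a "name" key; that helper is not part of this diff.
        streams: List[Dict[str, Any]] = []
        marker: Optional[str] = None
        while True:
            page = list(
                connection.query(
                    SnowflakeQuery.streams_for_database(
                        db_name, stream_pagination_marker=marker
                    )
                )
            )
            streams.extend(page)
            if len(page) < SHOW_STREAM_MAX_PAGE_SIZE:
                return streams  # last (possibly partial) page
            # SHOW STREAMS ... FROM '<name>' resumes after this stream name.
            marker = page[-1]["name"]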