acryl-datahub 1.0.0.2rc5__py3-none-any.whl → 1.0.0.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub was flagged as possibly problematic.

acryl_datahub-1.0.0.2rc5.dist-info/RECORD → acryl_datahub-1.0.0.3rc1.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.2rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.3rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=TEFaI0SngUMeKPXQwQz9bnZDzmSywu7Y6e6m6k--k00,323
+ datahub/_version.py,sha256=R-5q2sde87sdyofKBpzMGjN_yrh8SbPAoOTVYlH3CuU,323
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -151,7 +151,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
  datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
  datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=5jrl7cEyonce-YdWe1Iw6y3Okw5smJosqwOm5e-nvqM,4363
+ datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=fMjPnyWEofIZV52E2AFYU3IgBJwyZvbygXxCJyEtcWI,4442
  datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
  datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -327,7 +327,7 @@ datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvk
  datahub/ingestion/source/grafana/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/grafana/grafana_source.py,sha256=3pU3xodPgS5lmnjuQ_u7F0XPzD_Y8MnPlMxRJ86qz4g,4960
  datahub/ingestion/source/hex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/hex/api.py,sha256=JfFPD8O4z16fwZE_BdX5aCQztEq-tbzxJJ7aofH4DE4,12274
+ datahub/ingestion/source/hex/api.py,sha256=OVQNI_11NJJcNCT6OzSDEtVjNcom0vmes_KkjgzWCcI,11806
  datahub/ingestion/source/hex/constants.py,sha256=8hUTMWyG5keTNfXoLu_Dh413Hw_mGGJX1atiiDZyKtg,271
  datahub/ingestion/source/hex/hex.py,sha256=PIRl8fPkKtlHV7cqR4H8RKVYdTLgEFXHFzc3QAqJLhE,12733
  datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H5IUbb-UxU,13344
@@ -439,10 +439,10 @@ datahub/ingestion/source/schema_inference/json.py,sha256=p5S-3idn65V2uad5T8txs1U
  datahub/ingestion/source/schema_inference/object.py,sha256=dhSOtxVJHbTDY0hWeHwdLYHnOsW07Omk7Y4DPeztie0,5847
  datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
  datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/sigma/config.py,sha256=yfdKQYvI5hKVl8gNAKIcJe-VW3klvdDqYbUP76gJQDI,3812
+ datahub/ingestion/source/sigma/config.py,sha256=xpZXt4f05-sroWFv9SbzVhU1-iBeVfU1ocJKb-fy3aM,6333
  datahub/ingestion/source/sigma/data_classes.py,sha256=YZkkzwftV34mq5c_4jlC2PCSiRKt4hvHjmqikLQhl1I,2012
- datahub/ingestion/source/sigma/sigma.py,sha256=ucODIa5KUGr3WSoo7VgCt8uFaKRbSDlwsdVMAcjPLpQ,24378
- datahub/ingestion/source/sigma/sigma_api.py,sha256=SVvbUs2vjueUdDa-3FzeMsaX5pNpApVI192P7EZzPcI,17870
+ datahub/ingestion/source/sigma/sigma.py,sha256=ZtPj8eu6hcJxyFcWizob4kRaxrpcqsWzh__lmuVZdt8,25212
+ datahub/ingestion/source/sigma/sigma_api.py,sha256=7PK5AQa838hYeaQ5L0dioi4n4bLrpN-r7COKTTNUYw8,19837
  datahub/ingestion/source/slack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/slack/slack.py,sha256=3N7Yp-u9DvBmo536Z6-pQTrJgSJ3i742GePSgjlBOUU,27616
  datahub/ingestion/source/snowflake/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -455,8 +455,8 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDi
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=gX9E1Z_CemAZsuTDmtvqrxY7vBL2da75j7X8Xwhaf8Y,28441
- datahub/ingestion/source/snowflake/snowflake_query.py,sha256=0AMPQ_L7sgQtBizBNEe69-BUM8_wk1m8ystWivwKEMI,40409
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=PY4Wy6i89nqRl92ARwXNqWwm-ifagkKbKKtxYWeswkk,29209
+ datahub/ingestion/source/snowflake/snowflake_query.py,sha256=JtTrfzGqM9mk2Fr-F1X0KXzc_8ot7rD3dD2vPEuzd0E,40411
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=O-465aBA8uaYZ6WepP7i6cgK6Q1jXJPjDA1j9C8klus,6762
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=1yGBbs2aWIdHnrwgeTR7J2lqxbbBsIt8ejCLumIpLEA,27274
  datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=_37-AQyI4uGt4fu-d3v2eAWzQ3uG835ZQxMjFwGYCng,57193
@@ -940,7 +940,7 @@ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=A3_0wSxBJSRowEaslptDpBoKO42
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=l0kT8MuRIg96X7BNJaboMznF54b-yvM2nMTLyF2d0Nw,47446
- datahub/sql_parsing/sqlglot_utils.py,sha256=HP6awSU4ijmwjmTvGA_d0X_RO9O3rbGdkbVAWEhAcck,14667
+ datahub/sql_parsing/sqlglot_utils.py,sha256=5cUiEWLWfVTI7uIxolAfOfNVo50qnklzhj86gxSFWqg,14943
  datahub/sql_parsing/tool_meta_extractor.py,sha256=EV_g7sOchTSUm2p6wluNJqND7-rDYokVTqqFCM7hQ6c,7599
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=TwaQisQlD2Bk0uw__pP6u3Ovz9r-Ip4pCwpnto4r5e0,959
@@ -1045,8 +1045,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.2rc5.dist-info/METADATA,sha256=urp7GO85YeQHY_-wuzs6YWZ6xzfGkunfiD-r-e7CvfY,176853
- acryl_datahub-1.0.0.2rc5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- acryl_datahub-1.0.0.2rc5.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.2rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.2rc5.dist-info/RECORD,,
+ acryl_datahub-1.0.0.3rc1.dist-info/METADATA,sha256=43mPIcmD4ByKfyR6rn8PPgaKNUBSmDmVJnGm1KhBZuo,176855
+ acryl_datahub-1.0.0.3rc1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ acryl_datahub-1.0.0.3rc1.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.3rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.3rc1.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.2rc5"
+ __version__ = "1.0.0.3rc1"


  def is_dev_mode() -> bool:
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py CHANGED
@@ -23,6 +23,7 @@ class EnsureAspectSizeProcessor:
      ):
          self.report = report
          self.payload_constraint = payload_constraint
+         self.schema_size_constraint = int(self.payload_constraint * 0.985)

      def ensure_dataset_profile_size(
          self, dataset_urn: str, profile: DatasetProfileClass
@@ -68,7 +69,7 @@ class EnsureAspectSizeProcessor:
          for field in schema.fields:
              field_size = len(json.dumps(pre_json_transform(field.to_obj())))
              logger.debug(f"Field {field.fieldPath} takes total {field_size}")
-             if total_fields_size + field_size < self.payload_constraint:
+             if total_fields_size + field_size < self.schema_size_constraint:
                  accepted_fields.append(field)
                  total_fields_size += field_size
              else:
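The new schema_size_constraint reserves roughly 1.5% of the payload limit as headroom, so the truncated field list plus the aspect's non-field envelope still fits under the server's payload cap. A minimal sketch of the truncation loop, with illustrative names and an assumed 15 MiB default for INGEST_MAX_PAYLOAD_BYTES:

    import json

    INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # assumed default cap
    payload_constraint = INGEST_MAX_PAYLOAD_BYTES
    # Keep ~1.5% headroom for the parts of the aspect other than the field list.
    schema_size_constraint = int(payload_constraint * 0.985)

    def truncate_fields(fields: list) -> list:
        accepted, total = [], 0
        for field_obj in fields:
            size = len(json.dumps(field_obj))
            if total + size < schema_size_constraint:
                accepted.append(field_obj)
                total += size
            # the real processor records a truncation warning on its report here
        return accepted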
datahub/ingestion/source/hex/api.py CHANGED
@@ -27,6 +27,7 @@ logger = logging.getLogger(__name__)

  # The following models were Claude-generated from Hex API OpenAPI definition https://static.hex.site/openapi.json
  # To be exclusively used internally for the deserialization of the API response
+ # Model is incomplete and fields may have not been mapped if not used in the ingestion


  class HexApiAppViewStats(BaseModel):
@@ -83,20 +84,10 @@ class HexApiUser(BaseModel):
      email: str


- class HexApiAccessType(StrEnum):
-     """Access type enum."""
-
-     NONE = "NONE"
-     VIEW = "VIEW"
-     EDIT = "EDIT"
-     FULL_ACCESS = "FULL_ACCESS"
-
-
  class HexApiUserAccess(BaseModel):
      """User access model."""

      user: HexApiUser
-     access: Optional[HexApiAccessType] = None


  class HexApiCollectionData(BaseModel):
@@ -109,13 +100,6 @@ class HexApiCollectionAccess(BaseModel):
      """Collection access model."""

      collection: HexApiCollectionData
-     access: Optional[HexApiAccessType] = None
-
-
- class HexApiAccessSettings(BaseModel):
-     """Access settings model."""
-
-     access: Optional[HexApiAccessType] = None


  class HexApiWeeklySchedule(BaseModel):
@@ -145,9 +129,6 @@ class HexApiSharing(BaseModel):
      users: Optional[List[HexApiUserAccess]] = []
      collections: Optional[List[HexApiCollectionAccess]] = []
      groups: Optional[List[Any]] = []
-     workspace: Optional[HexApiAccessSettings] = None
-     public_web: Optional[HexApiAccessSettings] = Field(default=None, alias="publicWeb")
-     support: Optional[HexApiAccessSettings] = None

      class Config:
          extra = "ignore"  # Allow extra fields in the JSON
datahub/ingestion/source/sigma/config.py CHANGED
@@ -1,8 +1,9 @@
  import logging
  from dataclasses import dataclass, field
- from typing import Dict, Optional
+ from typing import Dict, List, Optional

  import pydantic
+ from pydantic import BaseModel, Field

  from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.source_common import (
@@ -17,6 +18,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
  from datahub.ingestion.source.state.stateful_ingestion_base import (
      StatefulIngestionConfigBase,
  )
+ from datahub.utilities.lossy_collections import LossyDict

  logger = logging.getLogger(__name__)

@@ -53,15 +55,82 @@ class Constant:
      DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"


+ class WorkspaceCounts(BaseModel):
+     workbooks_count: int = 0
+     datasets_count: int = 0
+     elements_count: int = 0
+     pages_count: int = 0
+
+     def is_empty(self) -> bool:
+         return (
+             self.workbooks_count == 0
+             and self.datasets_count == 0
+             and self.elements_count == 0
+             and self.pages_count == 0
+         )
+
+     def as_obj(self) -> dict:
+         return {
+             "workbooks_count": self.workbooks_count,
+             "datasets_count": self.datasets_count,
+             "elements_count": self.elements_count,
+             "pages_count": self.pages_count,
+         }
+
+
+ class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
+     type: str = "workspace"
+
+     workspace_counts: LossyDict[str, WorkspaceCounts] = Field(
+         default_factory=LossyDict,
+         description="Counts of workbooks, datasets, elements and pages in each workspace.",
+     )
+
+     def increment_workbooks_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].workbooks_count += 1
+
+     def increment_datasets_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].datasets_count += 1
+
+     def increment_elements_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].elements_count += 1
+
+     def increment_pages_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].pages_count += 1
+
+     def as_obj(self) -> dict:
+         return {
+             "filtered": self.dropped_entities.as_obj(),
+             "processed": self.processed_entities.as_obj(),
+             "workspace_counts": {
+                 key: item.as_obj() for key, item in self.workspace_counts.items()
+             },
+         }
+
+
  @dataclass
  class SigmaSourceReport(StaleEntityRemovalSourceReport):
-     workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
-     number_of_workspaces: Optional[int] = None
+     workspaces: SigmaWorkspaceEntityFilterReport = field(
+         default_factory=SigmaWorkspaceEntityFilterReport
+     )
      non_accessible_workspaces_count: int = 0
-     shared_entities_count: int = 0
-     number_of_datasets: int = 0
-     number_of_workbooks: int = 0
+
+     datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
+     datasets_without_workspace: int = 0
+
+     workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
+     workbooks_without_workspace: int = 0
+
      number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
+     empty_workspaces: List[str] = field(default_factory=list)


  class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
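Rough usage of the new report class, assuming the classes from the hunk above (note it constructs with no arguments, which is what the default_factory relies on):

    from datahub.ingestion.source.sigma.config import (
        SigmaWorkspaceEntityFilterReport,
    )

    report = SigmaWorkspaceEntityFilterReport()
    report.increment_workbooks_count("ws-1")
    report.increment_pages_count("ws-1")
    report.increment_datasets_count("ws-2")

    # as_obj() now nests per-workspace counts alongside the usual
    # processed/filtered lists, roughly:
    # {"filtered": [...], "processed": [...],
    #  "workspace_counts": {
    #      "ws-1": {"workbooks_count": 1, "datasets_count": 0,
    #               "elements_count": 0, "pages_count": 1},
    #      "ws-2": {"workbooks_count": 0, "datasets_count": 1,
    #               "elements_count": 0, "pages_count": 0}}}
    print(report.as_obj())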
datahub/ingestion/source/sigma/sigma.py CHANGED
@@ -35,6 +35,7 @@ from datahub.ingestion.source.sigma.config import (
      PlatformDetail,
      SigmaSourceConfig,
      SigmaSourceReport,
+     WorkspaceCounts,
  )
  from datahub.ingestion.source.sigma.data_classes import (
      Element,
@@ -163,7 +164,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
      def _get_allowed_workspaces(self) -> List[Workspace]:
          all_workspaces = self.sigma_api.workspaces.values()
          logger.info(f"Number of workspaces = {len(all_workspaces)}")
-         self.reporter.number_of_workspaces = len(all_workspaces)

          allowed_workspaces = []
          for workspace in all_workspaces:
@@ -285,6 +285,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
              yield self._gen_dataset_properties(dataset_urn, dataset)

          if dataset.workspaceId:
+             self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
              yield from add_entity_to_container(
                  container_key=self._gen_workspace_key(dataset.workspaceId),
                  entity_type="dataset",
@@ -468,6 +469,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
              ).as_workunit()

          if workbook.workspaceId:
+             self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
+
              yield self._gen_entity_browsepath_aspect(
                  entity_urn=chart_urn,
                  parent_entity_urn=builder.make_container_urn(
@@ -525,6 +528,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
          all_input_fields: List[InputFieldClass] = []

          if workbook.workspaceId:
+             self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
              yield self._gen_entity_browsepath_aspect(
                  entity_urn=dashboard_urn,
                  parent_entity_urn=builder.make_container_urn(
@@ -614,6 +618,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

          paths = workbook.path.split("/")[1:]
          if workbook.workspaceId:
+             self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
+
              yield self._gen_entity_browsepath_aspect(
                  entity_urn=dashboard_urn,
                  parent_entity_urn=builder.make_container_urn(
@@ -667,6 +673,15 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                  f"{workspace.name} ({workspace.workspaceId})"
              )
              yield from self._gen_workspace_workunit(workspace)
+             if self.reporter.workspaces.workspace_counts.get(
+                 workspace.workspaceId, WorkspaceCounts()
+             ).is_empty():
+                 logger.warning(
+                     f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
+                 )
+                 self.reporter.empty_workspaces.append(
+                     f"{workspace.name} ({workspace.workspaceId})"
+                 )
          yield from self._gen_sigma_dataset_upstream_lineage_workunit()

      def get_report(self) -> SourceReport:
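The emptiness check above uses dict.get() with a fresh WorkspaceCounts() default, so a workspace that never received any increment is treated the same as one whose counts are all zero. The idiom in isolation:

    from datahub.ingestion.source.sigma.config import WorkspaceCounts

    def workspace_is_empty(workspace_counts: dict, workspace_id: str) -> bool:
        # Missing key -> default WorkspaceCounts() -> is_empty() returns True.
        return workspace_counts.get(workspace_id, WorkspaceCounts()).is_empty()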
datahub/ingestion/source/sigma/sigma_api.py CHANGED
@@ -95,22 +95,22 @@ class SigmaAPI:
          return get_response

      def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+         if workspace_id in self.workspaces:
+             return self.workspaces[workspace_id]
+
          logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
          try:
-             if workspace_id in self.workspaces:
-                 return self.workspaces[workspace_id]
-             else:
-                 response = self._get_api_call(
-                     f"{self.config.api_url}/workspaces/{workspace_id}"
-                 )
-                 if response.status_code == 403:
-                     logger.debug(f"Workspace {workspace_id} not accessible.")
-                     self.report.non_accessible_workspaces_count += 1
-                     return None
-                 response.raise_for_status()
-                 workspace = Workspace.parse_obj(response.json())
-                 self.workspaces[workspace.workspaceId] = workspace
-                 return workspace
+             response = self._get_api_call(
+                 f"{self.config.api_url}/workspaces/{workspace_id}"
+             )
+             if response.status_code == 403:
+                 logger.debug(f"Workspace {workspace_id} not accessible.")
+                 self.report.non_accessible_workspaces_count += 1
+                 return None
+             response.raise_for_status()
+             workspace = Workspace.parse_obj(response.json())
+             self.workspaces[workspace.workspaceId] = workspace
+             return workspace
          except Exception as e:
              self._log_http_error(
                  message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"
@@ -187,7 +187,9 @@ class SigmaAPI:
      @functools.lru_cache
      def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
          logger.debug(f"Fetching file metadata with type {file_type}.")
-         file_url = url = f"{self.config.api_url}/files?typeFilters={file_type}"
+         file_url = url = (
+             f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+         )
          try:
              files_metadata: Dict[str, File] = {}
              while True:
@@ -225,31 +227,50 @@ class SigmaAPI:
                  for dataset_dict in response_dict[Constant.ENTRIES]:
                      dataset = SigmaDataset.parse_obj(dataset_dict)

-                     if dataset.datasetId in dataset_files_metadata:
-                         dataset.path = dataset_files_metadata[dataset.datasetId].path
-                         dataset.badge = dataset_files_metadata[dataset.datasetId].badge
-
-                         workspace_id = dataset_files_metadata[
-                             dataset.datasetId
-                         ].workspaceId
-                         if workspace_id:
-                             dataset.workspaceId = workspace_id
-                             workspace = self.get_workspace(dataset.workspaceId)
-                             if workspace:
-                                 if self.config.workspace_pattern.allowed(
-                                     workspace.name
-                                 ):
-                                     datasets.append(dataset)
-                         elif self.config.ingest_shared_entities:
-                             # If no workspace for dataset we can consider it as shared entity
-                             self.report.shared_entities_count += 1
-                             datasets.append(dataset)
+                     if dataset.datasetId not in dataset_files_metadata:
+                         self.report.datasets.dropped(
+                             f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                         )
+                         continue
+
+                     dataset.workspaceId = dataset_files_metadata[
+                         dataset.datasetId
+                     ].workspaceId
+
+                     dataset.path = dataset_files_metadata[dataset.datasetId].path
+                     dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                     workspace = None
+                     if dataset.workspaceId:
+                         workspace = self.get_workspace(dataset.workspaceId)
+
+                     if workspace:
+                         if self.config.workspace_pattern.allowed(workspace.name):
+                             self.report.datasets.processed(
+                                 f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                             )
+                             datasets.append(dataset)
+                         else:
+                             self.report.datasets.dropped(
+                                 f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                             )
+                     elif self.config.ingest_shared_entities:
+                         # If no workspace for dataset we can consider it as shared entity
+                         self.report.datasets_without_workspace += 1
+                         self.report.datasets.processed(
+                             f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                         )
+                         datasets.append(dataset)
+                     else:
+                         self.report.datasets.dropped(
+                             f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                         )

                  if response_dict[Constant.NEXTPAGE]:
                      url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                  else:
                      break
-             self.report.number_of_datasets = len(datasets)
+
              return datasets
          except Exception as e:
              self._log_http_error(
@@ -381,34 +402,54 @@ class SigmaAPI:
                  for workbook_dict in response_dict[Constant.ENTRIES]:
                      workbook = Workbook.parse_obj(workbook_dict)

-                     if workbook.workbookId in workbook_files_metadata:
-                         workbook.badge = workbook_files_metadata[
-                             workbook.workbookId
-                         ].badge
-
-                         workspace_id = workbook_files_metadata[
-                             workbook.workbookId
-                         ].workspaceId
-                         if workspace_id:
-                             workbook.workspaceId = workspace_id
-                             workspace = self.get_workspace(workbook.workspaceId)
-                             if workspace:
-                                 if self.config.workspace_pattern.allowed(
-                                     workspace.name
-                                 ):
-                                     workbook.pages = self.get_workbook_pages(workbook)
-                                     workbooks.append(workbook)
-                         elif self.config.ingest_shared_entities:
-                             # If no workspace for workbook we can consider it as shared entity
-                             self.report.shared_entities_count += 1
-                             workbook.pages = self.get_workbook_pages(workbook)
-                             workbooks.append(workbook)
+                     if workbook.workbookId not in workbook_files_metadata:
+                         # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                         # return file metadata when the user has access via admin permissions. In
+                         # those cases, the user associated with the token needs to be manually added
+                         # to the workspace.
+                         self.report.workbooks.dropped(
+                             f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                         )
+                         continue
+
+                     workbook.workspaceId = workbook_files_metadata[
+                         workbook.workbookId
+                     ].workspaceId
+
+                     workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                     workspace = None
+                     if workbook.workspaceId:
+                         workspace = self.get_workspace(workbook.workspaceId)
+
+                     if workspace:
+                         if self.config.workspace_pattern.allowed(workspace.name):
+                             self.report.workbooks.processed(
+                                 f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                             )
+                             workbook.pages = self.get_workbook_pages(workbook)
+                             workbooks.append(workbook)
+                         else:
+                             self.report.workbooks.dropped(
+                                 f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                             )
+                     elif self.config.ingest_shared_entities:
+                         # If no workspace for workbook we can consider it as shared entity
+                         self.report.workbooks_without_workspace += 1
+                         self.report.workbooks.processed(
+                             f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                         )
+                         workbook.pages = self.get_workbook_pages(workbook)
+                         workbooks.append(workbook)
+                     else:
+                         self.report.workbooks.dropped(
+                             f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                         )

                  if response_dict[Constant.NEXTPAGE]:
                      url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                  else:
                      break
-             self.report.number_of_workbooks = len(workbooks)
              return workbooks
          except Exception as e:
              self._log_http_error(
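The get_workspace rewrite is a cache-first lookup: hits return before any logging or HTTP work, and only misses enter the try/except that does the 403 accounting. The shape of the pattern, reduced to a generic sketch:

    from typing import Callable, Dict, Optional, TypeVar

    V = TypeVar("V")

    def cached_fetch(
        key: str, cache: Dict[str, V], fetch: Callable[[str], V]
    ) -> Optional[V]:
        if key in cache:  # fast path: no logging, no network call
            return cache[key]
        try:
            value = fetch(key)
            cache[key] = value
            return value
        except Exception:
            return None  # the real method logs via _log_http_error here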
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
            # here
            query_id=get_query_fingerprint(
-               res["query_text"], self.identifiers.platform, fast=True
+               res["query_text"],
+               self.identifiers.platform,
+               fast=True,
+               secondary_id=res["query_secondary_fingerprint"],
            ),
            query_text=res["query_text"],
            upstreams=upstreams,
@@ -654,7 +657,17 @@ WITH
  fingerprinted_queries as (
      SELECT *,
          -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-         query_history.query_parameterized_hash as query_fingerprint
+         query_history.query_parameterized_hash as query_fingerprint,
+         -- Optional and additional hash to be used for query deduplication and final query identity
+         CASE
+             WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+             -- Extract project id and hash it
+             THEN CAST(HASH(
+                 REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                 REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+             ) AS VARCHAR)
+             ELSE NULL
+         END as query_secondary_fingerprint
      FROM
          snowflake.account_usage.query_history
      WHERE
@@ -670,11 +683,11 @@ fingerprinted_queries as (
          {time_bucket_size},
          CONVERT_TIMEZONE('UTC', start_time)
      ) AS bucket_start_time,
-     COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+     COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
      FROM
          fingerprinted_queries
      QUALIFY
-     ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+     ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
  )
  , raw_access_history AS (
      SELECT
@@ -714,6 +727,7 @@ fingerprinted_queries as (
      q.bucket_start_time,
      q.query_id,
      q.query_fingerprint,
+     q.query_secondary_fingerprint,
      q.query_count,
      q.session_id AS "SESSION_ID",
      q.start_time AS "QUERY_START_TIME",
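The secondary fingerprint keys on the metadata comment that Hex appends to the queries it issues, so two textually identical queries run from different Hex projects or contexts no longer collapse into one logical query. A Python approximation of the CASE expression, with the comment shape and values inferred from the regexes above (illustrative, not an exact Hex format spec):

    import re
    from typing import Optional

    def secondary_fingerprint(query_text: str) -> Optional[str]:
        if "-- Hex query metadata:" not in query_text:
            return None
        project = re.search(r'"project_id"\s*:\s*"([^"]+)"', query_text)
        context = re.search(r'"context"\s*:\s*"([^"]+)"', query_text)
        # The SQL hashes the two captures with Snowflake's HASH(); any stable
        # hash serves the same purpose for this illustration.
        return str(hash((
            project.group(1) if project else None,
            context.group(1) if context else None,
        )))

    print(secondary_fingerprint(
        'SELECT 1\n-- Hex query metadata: {"project_id": "abc", "context": "APP"}'
    ))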
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED
@@ -1000,4 +1000,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
          from_clause = (
              f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
          )
-         return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+         return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
datahub/sql_parsing/sqlglot_utils.py CHANGED
@@ -257,7 +257,10 @@ def generate_hash(text: str) -> str:


  def get_query_fingerprint_debug(
-     expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr, fast: bool = False
+     expression: sqlglot.exp.ExpOrStr,
+     platform: DialectOrStr,
+     fast: bool = False,
+     secondary_id: Optional[str] = None,
  ) -> Tuple[str, Optional[str]]:
      try:
          if not fast:
@@ -272,16 +275,18 @@ def get_query_fingerprint_debug(
          logger.debug("Failed to generalize query for fingerprinting: %s", e)
          expression_sql = None

-     fingerprint = generate_hash(
-         expression_sql
-         if expression_sql is not None
-         else _expression_to_string(expression, platform=platform)
-     )
+     text = expression_sql or _expression_to_string(expression, platform=platform)
+     if secondary_id:
+         text = text + " -- " + secondary_id
+     fingerprint = generate_hash(text=text)

      return fingerprint, expression_sql


  def get_query_fingerprint(
-     expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr, fast: bool = False
+     expression: sqlglot.exp.ExpOrStr,
+     platform: DialectOrStr,
+     fast: bool = False,
+     secondary_id: Optional[str] = None,
  ) -> str:
      """Get a fingerprint for a SQL query.
@@ -298,12 +303,15 @@ def get_query_fingerprint(
      Args:
          expression: The SQL query to fingerprint.
          platform: The SQL dialect to use.
+         secondary_id: An optional additional id string to included in the final fingerprint.

      Returns:
          The fingerprint for the SQL query.
      """

-     return get_query_fingerprint_debug(expression, platform, fast=fast)[0]
+     return get_query_fingerprint_debug(
+         expression=expression, platform=platform, fast=fast, secondary_id=secondary_id
+     )[0]


  @functools.lru_cache(maxsize=FORMAT_QUERY_CACHE_SIZE)
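A usage sketch of the extended fingerprint API (the secondary_id value is illustrative):

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    fp_plain = get_query_fingerprint("SELECT 1", platform="snowflake", fast=True)
    fp_tagged = get_query_fingerprint(
        "SELECT 1", platform="snowflake", fast=True, secondary_id="hex-project-abc"
    )
    # The secondary id is appended to the canonicalized text before hashing,
    # so the two fingerprints differ.
    assert fp_plain != fp_tagged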