acryl-datahub 1.0.0.2rc5__py3-none-any.whl → 1.0.0.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub was flagged as possibly problematic.

acryl_datahub-1.0.0.2rc5.dist-info/RECORD → acryl_datahub-1.0.0.3rc1.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.2rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.3rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=TEFaI0SngUMeKPXQwQz9bnZDzmSywu7Y6e6m6k--k00,323
+ datahub/_version.py,sha256=R-5q2sde87sdyofKBpzMGjN_yrh8SbPAoOTVYlH3CuU,323
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -151,7 +151,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
  datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
  datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=5jrl7cEyonce-YdWe1Iw6y3Okw5smJosqwOm5e-nvqM,4363
+ datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=fMjPnyWEofIZV52E2AFYU3IgBJwyZvbygXxCJyEtcWI,4442
  datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
  datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -327,7 +327,7 @@ datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvk
  datahub/ingestion/source/grafana/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/grafana/grafana_source.py,sha256=3pU3xodPgS5lmnjuQ_u7F0XPzD_Y8MnPlMxRJ86qz4g,4960
  datahub/ingestion/source/hex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/hex/api.py,sha256=JfFPD8O4z16fwZE_BdX5aCQztEq-tbzxJJ7aofH4DE4,12274
+ datahub/ingestion/source/hex/api.py,sha256=OVQNI_11NJJcNCT6OzSDEtVjNcom0vmes_KkjgzWCcI,11806
  datahub/ingestion/source/hex/constants.py,sha256=8hUTMWyG5keTNfXoLu_Dh413Hw_mGGJX1atiiDZyKtg,271
  datahub/ingestion/source/hex/hex.py,sha256=PIRl8fPkKtlHV7cqR4H8RKVYdTLgEFXHFzc3QAqJLhE,12733
  datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H5IUbb-UxU,13344
@@ -439,10 +439,10 @@ datahub/ingestion/source/schema_inference/json.py,sha256=p5S-3idn65V2uad5T8txs1U
  datahub/ingestion/source/schema_inference/object.py,sha256=dhSOtxVJHbTDY0hWeHwdLYHnOsW07Omk7Y4DPeztie0,5847
  datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
  datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/sigma/config.py,sha256=yfdKQYvI5hKVl8gNAKIcJe-VW3klvdDqYbUP76gJQDI,3812
+ datahub/ingestion/source/sigma/config.py,sha256=xpZXt4f05-sroWFv9SbzVhU1-iBeVfU1ocJKb-fy3aM,6333
  datahub/ingestion/source/sigma/data_classes.py,sha256=YZkkzwftV34mq5c_4jlC2PCSiRKt4hvHjmqikLQhl1I,2012
- datahub/ingestion/source/sigma/sigma.py,sha256=ucODIa5KUGr3WSoo7VgCt8uFaKRbSDlwsdVMAcjPLpQ,24378
- datahub/ingestion/source/sigma/sigma_api.py,sha256=SVvbUs2vjueUdDa-3FzeMsaX5pNpApVI192P7EZzPcI,17870
+ datahub/ingestion/source/sigma/sigma.py,sha256=ZtPj8eu6hcJxyFcWizob4kRaxrpcqsWzh__lmuVZdt8,25212
+ datahub/ingestion/source/sigma/sigma_api.py,sha256=7PK5AQa838hYeaQ5L0dioi4n4bLrpN-r7COKTTNUYw8,19837
  datahub/ingestion/source/slack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/slack/slack.py,sha256=3N7Yp-u9DvBmo536Z6-pQTrJgSJ3i742GePSgjlBOUU,27616
  datahub/ingestion/source/snowflake/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -455,8 +455,8 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDi
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=gX9E1Z_CemAZsuTDmtvqrxY7vBL2da75j7X8Xwhaf8Y,28441
- datahub/ingestion/source/snowflake/snowflake_query.py,sha256=0AMPQ_L7sgQtBizBNEe69-BUM8_wk1m8ystWivwKEMI,40409
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=PY4Wy6i89nqRl92ARwXNqWwm-ifagkKbKKtxYWeswkk,29209
+ datahub/ingestion/source/snowflake/snowflake_query.py,sha256=JtTrfzGqM9mk2Fr-F1X0KXzc_8ot7rD3dD2vPEuzd0E,40411
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=O-465aBA8uaYZ6WepP7i6cgK6Q1jXJPjDA1j9C8klus,6762
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=1yGBbs2aWIdHnrwgeTR7J2lqxbbBsIt8ejCLumIpLEA,27274
  datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=_37-AQyI4uGt4fu-d3v2eAWzQ3uG835ZQxMjFwGYCng,57193
@@ -940,7 +940,7 @@ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=A3_0wSxBJSRowEaslptDpBoKO42
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=l0kT8MuRIg96X7BNJaboMznF54b-yvM2nMTLyF2d0Nw,47446
- datahub/sql_parsing/sqlglot_utils.py,sha256=HP6awSU4ijmwjmTvGA_d0X_RO9O3rbGdkbVAWEhAcck,14667
+ datahub/sql_parsing/sqlglot_utils.py,sha256=5cUiEWLWfVTI7uIxolAfOfNVo50qnklzhj86gxSFWqg,14943
  datahub/sql_parsing/tool_meta_extractor.py,sha256=EV_g7sOchTSUm2p6wluNJqND7-rDYokVTqqFCM7hQ6c,7599
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=TwaQisQlD2Bk0uw__pP6u3Ovz9r-Ip4pCwpnto4r5e0,959
@@ -1045,8 +1045,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.2rc5.dist-info/METADATA,sha256=urp7GO85YeQHY_-wuzs6YWZ6xzfGkunfiD-r-e7CvfY,176853
- acryl_datahub-1.0.0.2rc5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- acryl_datahub-1.0.0.2rc5.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.2rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.2rc5.dist-info/RECORD,,
+ acryl_datahub-1.0.0.3rc1.dist-info/METADATA,sha256=43mPIcmD4ByKfyR6rn8PPgaKNUBSmDmVJnGm1KhBZuo,176855
+ acryl_datahub-1.0.0.3rc1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ acryl_datahub-1.0.0.3rc1.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.3rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.3rc1.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.2rc5"
+ __version__ = "1.0.0.3rc1"


  def is_dev_mode() -> bool:
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py CHANGED
@@ -23,6 +23,7 @@ class EnsureAspectSizeProcessor:
      ):
          self.report = report
          self.payload_constraint = payload_constraint
+         self.schema_size_constraint = int(self.payload_constraint * 0.985)

      def ensure_dataset_profile_size(
          self, dataset_urn: str, profile: DatasetProfileClass
@@ -68,7 +69,7 @@ class EnsureAspectSizeProcessor:
          for field in schema.fields:
              field_size = len(json.dumps(pre_json_transform(field.to_obj())))
              logger.debug(f"Field {field.fieldPath} takes total {field_size}")
-             if total_fields_size + field_size < self.payload_constraint:
+             if total_fields_size + field_size < self.schema_size_constraint:
                  accepted_fields.append(field)
                  total_fields_size += field_size
              else:
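The new schema_size_constraint reserves roughly 1.5% of the payload limit as headroom, so the truncated field list plus the aspect's non-field envelope still fits under the server's payload cap. A minimal sketch of the truncation loop, with illustrative names and an assumed 15 MiB default for INGEST_MAX_PAYLOAD_BYTES:

    import json

    INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # assumed default cap
    payload_constraint = INGEST_MAX_PAYLOAD_BYTES
    # Keep ~1.5% headroom for the parts of the aspect other than the field list.
    schema_size_constraint = int(payload_constraint * 0.985)

    def truncate_fields(fields: list) -> list:
        accepted, total = [], 0
        for field_obj in fields:
            size = len(json.dumps(field_obj))
            if total + size < schema_size_constraint:
                accepted.append(field_obj)
                total += size
            # the real processor records a truncation warning on its report here
        return accepted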
datahub/ingestion/source/hex/api.py CHANGED
@@ -27,6 +27,7 @@ logger = logging.getLogger(__name__)

  # The following models were Claude-generated from Hex API OpenAPI definition https://static.hex.site/openapi.json
  # To be exclusively used internally for the deserialization of the API response
+ # Model is incomplete and fields may have not been mapped if not used in the ingestion


  class HexApiAppViewStats(BaseModel):
@@ -83,20 +84,10 @@ class HexApiUser(BaseModel):
      email: str


- class HexApiAccessType(StrEnum):
-     """Access type enum."""
-
-     NONE = "NONE"
-     VIEW = "VIEW"
-     EDIT = "EDIT"
-     FULL_ACCESS = "FULL_ACCESS"
-
-
  class HexApiUserAccess(BaseModel):
      """User access model."""

      user: HexApiUser
-     access: Optional[HexApiAccessType] = None


  class HexApiCollectionData(BaseModel):
@@ -109,13 +100,6 @@ class HexApiCollectionAccess(BaseModel):
      """Collection access model."""

      collection: HexApiCollectionData
-     access: Optional[HexApiAccessType] = None
-
-
- class HexApiAccessSettings(BaseModel):
-     """Access settings model."""
-
-     access: Optional[HexApiAccessType] = None


  class HexApiWeeklySchedule(BaseModel):
@@ -145,9 +129,6 @@ class HexApiSharing(BaseModel):
      users: Optional[List[HexApiUserAccess]] = []
      collections: Optional[List[HexApiCollectionAccess]] = []
      groups: Optional[List[Any]] = []
-     workspace: Optional[HexApiAccessSettings] = None
-     public_web: Optional[HexApiAccessSettings] = Field(default=None, alias="publicWeb")
-     support: Optional[HexApiAccessSettings] = None

      class Config:
          extra = "ignore"  # Allow extra fields in the JSON
datahub/ingestion/source/sigma/config.py CHANGED
@@ -1,8 +1,9 @@
  import logging
  from dataclasses import dataclass, field
- from typing import Dict, Optional
+ from typing import Dict, List, Optional

  import pydantic
+ from pydantic import BaseModel, Field

  from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.source_common import (
@@ -17,6 +18,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
  from datahub.ingestion.source.state.stateful_ingestion_base import (
      StatefulIngestionConfigBase,
  )
+ from datahub.utilities.lossy_collections import LossyDict

  logger = logging.getLogger(__name__)

@@ -53,15 +55,82 @@ class Constant:
      DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"


+ class WorkspaceCounts(BaseModel):
+     workbooks_count: int = 0
+     datasets_count: int = 0
+     elements_count: int = 0
+     pages_count: int = 0
+
+     def is_empty(self) -> bool:
+         return (
+             self.workbooks_count == 0
+             and self.datasets_count == 0
+             and self.elements_count == 0
+             and self.pages_count == 0
+         )
+
+     def as_obj(self) -> dict:
+         return {
+             "workbooks_count": self.workbooks_count,
+             "datasets_count": self.datasets_count,
+             "elements_count": self.elements_count,
+             "pages_count": self.pages_count,
+         }
+
+
+ class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
+     type: str = "workspace"
+
+     workspace_counts: LossyDict[str, WorkspaceCounts] = Field(
+         default_factory=LossyDict,
+         description="Counts of workbooks, datasets, elements and pages in each workspace.",
+     )
+
+     def increment_workbooks_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].workbooks_count += 1
+
+     def increment_datasets_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].datasets_count += 1
+
+     def increment_elements_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].elements_count += 1
+
+     def increment_pages_count(self, workspace_id: str) -> None:
+         if workspace_id not in self.workspace_counts:
+             self.workspace_counts[workspace_id] = WorkspaceCounts()
+         self.workspace_counts[workspace_id].pages_count += 1
+
+     def as_obj(self) -> dict:
+         return {
+             "filtered": self.dropped_entities.as_obj(),
+             "processed": self.processed_entities.as_obj(),
+             "workspace_counts": {
+                 key: item.as_obj() for key, item in self.workspace_counts.items()
+             },
+         }
+
+
  @dataclass
  class SigmaSourceReport(StaleEntityRemovalSourceReport):
-     workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
-     number_of_workspaces: Optional[int] = None
+     workspaces: SigmaWorkspaceEntityFilterReport = field(
+         default_factory=SigmaWorkspaceEntityFilterReport
+     )
      non_accessible_workspaces_count: int = 0
-     shared_entities_count: int = 0
-     number_of_datasets: int = 0
-     number_of_workbooks: int = 0
+
+     datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
+     datasets_without_workspace: int = 0
+
+     workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
+     workbooks_without_workspace: int = 0
+
      number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
+     empty_workspaces: List[str] = field(default_factory=list)


  class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
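Rough usage of the new report class, assuming the classes from the hunk above (note it constructs with no arguments, which is what the default_factory relies on):

    from datahub.ingestion.source.sigma.config import (
        SigmaWorkspaceEntityFilterReport,
    )

    report = SigmaWorkspaceEntityFilterReport()
    report.increment_workbooks_count("ws-1")
    report.increment_pages_count("ws-1")
    report.increment_datasets_count("ws-2")

    # as_obj() now nests per-workspace counts alongside the usual
    # processed/filtered lists, roughly:
    # {"filtered": [...], "processed": [...],
    #  "workspace_counts": {
    #      "ws-1": {"workbooks_count": 1, "datasets_count": 0,
    #               "elements_count": 0, "pages_count": 1},
    #      "ws-2": {"workbooks_count": 0, "datasets_count": 1,
    #               "elements_count": 0, "pages_count": 0}}}
    print(report.as_obj())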
datahub/ingestion/source/sigma/sigma.py CHANGED
@@ -35,6 +35,7 @@ from datahub.ingestion.source.sigma.config import (
      PlatformDetail,
      SigmaSourceConfig,
      SigmaSourceReport,
+     WorkspaceCounts,
  )
  from datahub.ingestion.source.sigma.data_classes import (
      Element,
@@ -163,7 +164,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
      def _get_allowed_workspaces(self) -> List[Workspace]:
          all_workspaces = self.sigma_api.workspaces.values()
          logger.info(f"Number of workspaces = {len(all_workspaces)}")
-         self.reporter.number_of_workspaces = len(all_workspaces)

          allowed_workspaces = []
          for workspace in all_workspaces:
@@ -285,6 +285,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
              yield self._gen_dataset_properties(dataset_urn, dataset)

          if dataset.workspaceId:
+             self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
              yield from add_entity_to_container(
                  container_key=self._gen_workspace_key(dataset.workspaceId),
                  entity_type="dataset",
@@ -468,6 +469,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
              ).as_workunit()

          if workbook.workspaceId:
+             self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
+
              yield self._gen_entity_browsepath_aspect(
                  entity_urn=chart_urn,
                  parent_entity_urn=builder.make_container_urn(
@@ -525,6 +528,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
          all_input_fields: List[InputFieldClass] = []

          if workbook.workspaceId:
+             self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
              yield self._gen_entity_browsepath_aspect(
                  entity_urn=dashboard_urn,
                  parent_entity_urn=builder.make_container_urn(
@@ -614,6 +618,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

          paths = workbook.path.split("/")[1:]
          if workbook.workspaceId:
+             self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
+
              yield self._gen_entity_browsepath_aspect(
                  entity_urn=dashboard_urn,
                  parent_entity_urn=builder.make_container_urn(
@@ -667,6 +673,15 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                  f"{workspace.name} ({workspace.workspaceId})"
              )
              yield from self._gen_workspace_workunit(workspace)
+             if self.reporter.workspaces.workspace_counts.get(
+                 workspace.workspaceId, WorkspaceCounts()
+             ).is_empty():
+                 logger.warning(
+                     f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
+                 )
+                 self.reporter.empty_workspaces.append(
+                     f"{workspace.name} ({workspace.workspaceId})"
+                 )
          yield from self._gen_sigma_dataset_upstream_lineage_workunit()

      def get_report(self) -> SourceReport:
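The emptiness check above uses dict.get() with a fresh WorkspaceCounts() default, so a workspace that never received any increment is treated the same as one whose counts are all zero. The idiom in isolation:

    from datahub.ingestion.source.sigma.config import WorkspaceCounts

    def workspace_is_empty(workspace_counts: dict, workspace_id: str) -> bool:
        # Missing key -> default WorkspaceCounts() -> is_empty() returns True.
        return workspace_counts.get(workspace_id, WorkspaceCounts()).is_empty()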
datahub/ingestion/source/sigma/sigma_api.py CHANGED
@@ -95,22 +95,22 @@ class SigmaAPI:
          return get_response

      def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+         if workspace_id in self.workspaces:
+             return self.workspaces[workspace_id]
+
          logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
          try:
-             if workspace_id in self.workspaces:
-                 return self.workspaces[workspace_id]
-             else:
-                 response = self._get_api_call(
-                     f"{self.config.api_url}/workspaces/{workspace_id}"
-                 )
-                 if response.status_code == 403:
-                     logger.debug(f"Workspace {workspace_id} not accessible.")
-                     self.report.non_accessible_workspaces_count += 1
-                     return None
-                 response.raise_for_status()
-                 workspace = Workspace.parse_obj(response.json())
-                 self.workspaces[workspace.workspaceId] = workspace
-                 return workspace
+             response = self._get_api_call(
+                 f"{self.config.api_url}/workspaces/{workspace_id}"
+             )
+             if response.status_code == 403:
+                 logger.debug(f"Workspace {workspace_id} not accessible.")
+                 self.report.non_accessible_workspaces_count += 1
+                 return None
+             response.raise_for_status()
+             workspace = Workspace.parse_obj(response.json())
+             self.workspaces[workspace.workspaceId] = workspace
+             return workspace
          except Exception as e:
              self._log_http_error(
                  message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"
@@ -187,7 +187,9 @@ class SigmaAPI:
      @functools.lru_cache
      def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
          logger.debug(f"Fetching file metadata with type {file_type}.")
-         file_url = url = f"{self.config.api_url}/files?typeFilters={file_type}"
+         file_url = url = (
+             f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+         )
          try:
              files_metadata: Dict[str, File] = {}
              while True:
@@ -225,31 +227,50 @@ class SigmaAPI:
                  for dataset_dict in response_dict[Constant.ENTRIES]:
                      dataset = SigmaDataset.parse_obj(dataset_dict)

-                     if dataset.datasetId in dataset_files_metadata:
-                         dataset.path = dataset_files_metadata[dataset.datasetId].path
-                         dataset.badge = dataset_files_metadata[dataset.datasetId].badge
-
-                         workspace_id = dataset_files_metadata[
-                             dataset.datasetId
-                         ].workspaceId
-                         if workspace_id:
-                             dataset.workspaceId = workspace_id
-                             workspace = self.get_workspace(dataset.workspaceId)
-                             if workspace:
-                                 if self.config.workspace_pattern.allowed(
-                                     workspace.name
-                                 ):
-                                     datasets.append(dataset)
-                         elif self.config.ingest_shared_entities:
-                             # If no workspace for dataset we can consider it as shared entity
-                             self.report.shared_entities_count += 1
-                             datasets.append(dataset)
+                     if dataset.datasetId not in dataset_files_metadata:
+                         self.report.datasets.dropped(
+                             f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                         )
+                         continue
+
+                     dataset.workspaceId = dataset_files_metadata[
+                         dataset.datasetId
+                     ].workspaceId
+
+                     dataset.path = dataset_files_metadata[dataset.datasetId].path
+                     dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                     workspace = None
+                     if dataset.workspaceId:
+                         workspace = self.get_workspace(dataset.workspaceId)
+
+                     if workspace:
+                         if self.config.workspace_pattern.allowed(workspace.name):
+                             self.report.datasets.processed(
+                                 f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                             )
+                             datasets.append(dataset)
+                         else:
+                             self.report.datasets.dropped(
+                                 f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                             )
+                     elif self.config.ingest_shared_entities:
+                         # If no workspace for dataset we can consider it as shared entity
+                         self.report.datasets_without_workspace += 1
+                         self.report.datasets.processed(
+                             f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                         )
+                         datasets.append(dataset)
+                     else:
+                         self.report.datasets.dropped(
+                             f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                         )

                  if response_dict[Constant.NEXTPAGE]:
                      url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                  else:
                      break
-             self.report.number_of_datasets = len(datasets)
+
              return datasets
          except Exception as e:
              self._log_http_error(
@@ -381,34 +402,54 @@ class SigmaAPI:
                  for workbook_dict in response_dict[Constant.ENTRIES]:
                      workbook = Workbook.parse_obj(workbook_dict)

-                     if workbook.workbookId in workbook_files_metadata:
-                         workbook.badge = workbook_files_metadata[
-                             workbook.workbookId
-                         ].badge
-
-                         workspace_id = workbook_files_metadata[
-                             workbook.workbookId
-                         ].workspaceId
-                         if workspace_id:
-                             workbook.workspaceId = workspace_id
-                             workspace = self.get_workspace(workbook.workspaceId)
-                             if workspace:
-                                 if self.config.workspace_pattern.allowed(
-                                     workspace.name
-                                 ):
-                                     workbook.pages = self.get_workbook_pages(workbook)
-                                     workbooks.append(workbook)
-                         elif self.config.ingest_shared_entities:
-                             # If no workspace for workbook we can consider it as shared entity
-                             self.report.shared_entities_count += 1
-                             workbook.pages = self.get_workbook_pages(workbook)
-                             workbooks.append(workbook)
+                     if workbook.workbookId not in workbook_files_metadata:
+                         # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                         # return file metadata when the user has access via admin permissions. In
+                         # those cases, the user associated with the token needs to be manually added
+                         # to the workspace.
+                         self.report.workbooks.dropped(
+                             f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                         )
+                         continue
+
+                     workbook.workspaceId = workbook_files_metadata[
+                         workbook.workbookId
+                     ].workspaceId
+
+                     workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                     workspace = None
+                     if workbook.workspaceId:
+                         workspace = self.get_workspace(workbook.workspaceId)
+
+                     if workspace:
+                         if self.config.workspace_pattern.allowed(workspace.name):
+                             self.report.workbooks.processed(
+                                 f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                             )
+                             workbook.pages = self.get_workbook_pages(workbook)
+                             workbooks.append(workbook)
+                         else:
+                             self.report.workbooks.dropped(
+                                 f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                             )
+                     elif self.config.ingest_shared_entities:
+                         # If no workspace for workbook we can consider it as shared entity
+                         self.report.workbooks_without_workspace += 1
+                         self.report.workbooks.processed(
+                             f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                         )
+                         workbook.pages = self.get_workbook_pages(workbook)
+                         workbooks.append(workbook)
+                     else:
+                         self.report.workbooks.dropped(
+                             f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                         )

                  if response_dict[Constant.NEXTPAGE]:
                      url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                  else:
                      break
-             self.report.number_of_workbooks = len(workbooks)
              return workbooks
          except Exception as e:
              self._log_http_error(
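The get_workspace rewrite is a cache-first lookup: hits return before any logging or HTTP work, and only misses enter the try/except that does the 403 accounting. The shape of the pattern, reduced to a generic sketch:

    from typing import Callable, Dict, Optional, TypeVar

    V = TypeVar("V")

    def cached_fetch(
        key: str, cache: Dict[str, V], fetch: Callable[[str], V]
    ) -> Optional[V]:
        if key in cache:  # fast path: no logging, no network call
            return cache[key]
        try:
            value = fetch(key)
            cache[key] = value
            return value
        except Exception:
            return None  # the real method logs via _log_http_error here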
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
            # here
            query_id=get_query_fingerprint(
-               res["query_text"], self.identifiers.platform, fast=True
+               res["query_text"],
+               self.identifiers.platform,
+               fast=True,
+               secondary_id=res["query_secondary_fingerprint"],
            ),
            query_text=res["query_text"],
            upstreams=upstreams,
@@ -654,7 +657,17 @@ WITH
  fingerprinted_queries as (
      SELECT *,
          -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-         query_history.query_parameterized_hash as query_fingerprint
+         query_history.query_parameterized_hash as query_fingerprint,
+         -- Optional and additional hash to be used for query deduplication and final query identity
+         CASE
+             WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+             -- Extract project id and hash it
+             THEN CAST(HASH(
+                 REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                 REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+             ) AS VARCHAR)
+             ELSE NULL
+         END as query_secondary_fingerprint
      FROM
          snowflake.account_usage.query_history
      WHERE
@@ -670,11 +683,11 @@ fingerprinted_queries as (
          {time_bucket_size},
          CONVERT_TIMEZONE('UTC', start_time)
      ) AS bucket_start_time,
-     COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+     COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
      FROM
          fingerprinted_queries
      QUALIFY
-     ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+     ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
  )
  , raw_access_history AS (
      SELECT
@@ -714,6 +727,7 @@ fingerprinted_queries as (
      q.bucket_start_time,
      q.query_id,
      q.query_fingerprint,
+     q.query_secondary_fingerprint,
      q.query_count,
      q.session_id AS "SESSION_ID",
      q.start_time AS "QUERY_START_TIME",
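The secondary fingerprint keys on the metadata comment that Hex appends to the queries it issues, so two textually identical queries run from different Hex projects or contexts no longer collapse into one logical query. A Python approximation of the CASE expression, with the comment shape and values inferred from the regexes above (illustrative, not an exact Hex format spec):

    import re
    from typing import Optional

    def secondary_fingerprint(query_text: str) -> Optional[str]:
        if "-- Hex query metadata:" not in query_text:
            return None
        project = re.search(r'"project_id"\s*:\s*"([^"]+)"', query_text)
        context = re.search(r'"context"\s*:\s*"([^"]+)"', query_text)
        # The SQL hashes the two captures with Snowflake's HASH(); any stable
        # hash serves the same purpose for this illustration.
        return str(hash((
            project.group(1) if project else None,
            context.group(1) if context else None,
        )))

    print(secondary_fingerprint(
        'SELECT 1\n-- Hex query metadata: {"project_id": "abc", "context": "APP"}'
    ))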
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED
@@ -1000,4 +1000,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
          from_clause = (
              f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
          )
-         return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+         return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
datahub/sql_parsing/sqlglot_utils.py CHANGED
@@ -257,7 +257,10 @@ def generate_hash(text: str) -> str:


  def get_query_fingerprint_debug(
-     expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr, fast: bool = False
+     expression: sqlglot.exp.ExpOrStr,
+     platform: DialectOrStr,
+     fast: bool = False,
+     secondary_id: Optional[str] = None,
  ) -> Tuple[str, Optional[str]]:
      try:
          if not fast:
@@ -272,16 +275,18 @@ def get_query_fingerprint_debug(
          logger.debug("Failed to generalize query for fingerprinting: %s", e)
          expression_sql = None

-     fingerprint = generate_hash(
-         expression_sql
-         if expression_sql is not None
-         else _expression_to_string(expression, platform=platform)
-     )
+     text = expression_sql or _expression_to_string(expression, platform=platform)
+     if secondary_id:
+         text = text + " -- " + secondary_id
+     fingerprint = generate_hash(text=text)

      return fingerprint, expression_sql


  def get_query_fingerprint(
-     expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr, fast: bool = False
+     expression: sqlglot.exp.ExpOrStr,
+     platform: DialectOrStr,
+     fast: bool = False,
+     secondary_id: Optional[str] = None,
  ) -> str:
      """Get a fingerprint for a SQL query.
@@ -298,12 +303,15 @@ def get_query_fingerprint(
      Args:
          expression: The SQL query to fingerprint.
          platform: The SQL dialect to use.
+         secondary_id: An optional additional id string to included in the final fingerprint.

      Returns:
          The fingerprint for the SQL query.
      """

-     return get_query_fingerprint_debug(expression, platform, fast=fast)[0]
+     return get_query_fingerprint_debug(
+         expression=expression, platform=platform, fast=fast, secondary_id=secondary_id
+     )[0]


  @functools.lru_cache(maxsize=FORMAT_QUERY_CACHE_SIZE)
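A usage sketch of the extended fingerprint API (the secondary_id value is illustrative):

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    fp_plain = get_query_fingerprint("SELECT 1", platform="snowflake", fast=True)
    fp_tagged = get_query_fingerprint(
        "SELECT 1", platform="snowflake", fast=True, secondary_id="hex-project-abc"
    )
    # The secondary id is appended to the canonicalized text before hashing,
    # so the two fingerprints differ.
    assert fp_plain != fp_tagged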