acryl-datahub 1.0.0.2rc5__py3-none-any.whl → 1.0.0.3rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/METADATA +2529 -2529
- {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/RECORD +15 -15
- datahub/_version.py +1 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/sigma/config.py +75 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +16 -8
- {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
-acryl_datahub-1.0.0.
+acryl_datahub-1.0.0.3rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=
+datahub/_version.py,sha256=R-5q2sde87sdyofKBpzMGjN_yrh8SbPAoOTVYlH3CuU,323
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -151,7 +151,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
 datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
 datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
-datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=
+datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=fMjPnyWEofIZV52E2AFYU3IgBJwyZvbygXxCJyEtcWI,4442
 datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
 datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -327,7 +327,7 @@ datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvk
 datahub/ingestion/source/grafana/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/grafana/grafana_source.py,sha256=3pU3xodPgS5lmnjuQ_u7F0XPzD_Y8MnPlMxRJ86qz4g,4960
 datahub/ingestion/source/hex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/hex/api.py,sha256=
+datahub/ingestion/source/hex/api.py,sha256=OVQNI_11NJJcNCT6OzSDEtVjNcom0vmes_KkjgzWCcI,11806
 datahub/ingestion/source/hex/constants.py,sha256=8hUTMWyG5keTNfXoLu_Dh413Hw_mGGJX1atiiDZyKtg,271
 datahub/ingestion/source/hex/hex.py,sha256=PIRl8fPkKtlHV7cqR4H8RKVYdTLgEFXHFzc3QAqJLhE,12733
 datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H5IUbb-UxU,13344
@@ -439,10 +439,10 @@ datahub/ingestion/source/schema_inference/json.py,sha256=p5S-3idn65V2uad5T8txs1U
 datahub/ingestion/source/schema_inference/object.py,sha256=dhSOtxVJHbTDY0hWeHwdLYHnOsW07Omk7Y4DPeztie0,5847
 datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
 datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/sigma/config.py,sha256=
+datahub/ingestion/source/sigma/config.py,sha256=xpZXt4f05-sroWFv9SbzVhU1-iBeVfU1ocJKb-fy3aM,6333
 datahub/ingestion/source/sigma/data_classes.py,sha256=YZkkzwftV34mq5c_4jlC2PCSiRKt4hvHjmqikLQhl1I,2012
-datahub/ingestion/source/sigma/sigma.py,sha256=
-datahub/ingestion/source/sigma/sigma_api.py,sha256=
+datahub/ingestion/source/sigma/sigma.py,sha256=ZtPj8eu6hcJxyFcWizob4kRaxrpcqsWzh__lmuVZdt8,25212
+datahub/ingestion/source/sigma/sigma_api.py,sha256=7PK5AQa838hYeaQ5L0dioi4n4bLrpN-r7COKTTNUYw8,19837
 datahub/ingestion/source/slack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/slack/slack.py,sha256=3N7Yp-u9DvBmo536Z6-pQTrJgSJ3i742GePSgjlBOUU,27616
 datahub/ingestion/source/snowflake/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -455,8 +455,8 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDi
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=
-datahub/ingestion/source/snowflake/snowflake_query.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=PY4Wy6i89nqRl92ARwXNqWwm-ifagkKbKKtxYWeswkk,29209
+datahub/ingestion/source/snowflake/snowflake_query.py,sha256=JtTrfzGqM9mk2Fr-F1X0KXzc_8ot7rD3dD2vPEuzd0E,40411
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=O-465aBA8uaYZ6WepP7i6cgK6Q1jXJPjDA1j9C8klus,6762
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=1yGBbs2aWIdHnrwgeTR7J2lqxbbBsIt8ejCLumIpLEA,27274
 datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=_37-AQyI4uGt4fu-d3v2eAWzQ3uG835ZQxMjFwGYCng,57193
@@ -940,7 +940,7 @@ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=A3_0wSxBJSRowEaslptDpBoKO42
 datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
 datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
 datahub/sql_parsing/sqlglot_lineage.py,sha256=l0kT8MuRIg96X7BNJaboMznF54b-yvM2nMTLyF2d0Nw,47446
-datahub/sql_parsing/sqlglot_utils.py,sha256=
+datahub/sql_parsing/sqlglot_utils.py,sha256=5cUiEWLWfVTI7uIxolAfOfNVo50qnklzhj86gxSFWqg,14943
 datahub/sql_parsing/tool_meta_extractor.py,sha256=EV_g7sOchTSUm2p6wluNJqND7-rDYokVTqqFCM7hQ6c,7599
 datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/telemetry/stats.py,sha256=TwaQisQlD2Bk0uw__pP6u3Ovz9r-Ip4pCwpnto4r5e0,959
@@ -1045,8 +1045,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
+acryl_datahub-1.0.0.3rc1.dist-info/METADATA,sha256=43mPIcmD4ByKfyR6rn8PPgaKNUBSmDmVJnGm1KhBZuo,176855
+acryl_datahub-1.0.0.3rc1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+acryl_datahub-1.0.0.3rc1.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+acryl_datahub-1.0.0.3rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0.3rc1.dist-info/RECORD,,
datahub/_version.py
CHANGED
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
CHANGED
@@ -23,6 +23,7 @@ class EnsureAspectSizeProcessor:
     ):
         self.report = report
         self.payload_constraint = payload_constraint
+        self.schema_size_constraint = int(self.payload_constraint * 0.985)
 
     def ensure_dataset_profile_size(
         self, dataset_urn: str, profile: DatasetProfileClass
@@ -68,7 +69,7 @@ class EnsureAspectSizeProcessor:
         for field in schema.fields:
            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
-            if total_fields_size + field_size < self.
+            if total_fields_size + field_size < self.schema_size_constraint:
                accepted_fields.append(field)
                total_fields_size += field_size
            else:
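The change above introduces a schema-size constraint that sits slightly below the serialized payload limit, so fields stop being accepted before the hard cap is reached. A minimal standalone sketch of that headroom check follows; the names and the 16 MB cap are illustrative assumptions, not the DataHub processor itself.

```python
# Minimal sketch of the headroom check introduced above.
# PAYLOAD_CONSTRAINT and the field dicts are hypothetical; only the 0.985
# factor and the "accept until the softer limit is hit" logic mirror the diff.
import json

PAYLOAD_CONSTRAINT = 16 * 1024 * 1024                      # assumed hard cap, in bytes
SCHEMA_SIZE_CONSTRAINT = int(PAYLOAD_CONSTRAINT * 0.985)   # keep ~1.5% headroom

def trim_fields(fields: list) -> list:
    accepted, total = [], 0
    for field_obj in fields:
        size = len(json.dumps(field_obj))
        if total + size < SCHEMA_SIZE_CONSTRAINT:
            accepted.append(field_obj)
            total += size
        # fields that would push the aspect over the softer limit are skipped
    return accepted

print(len(trim_fields([{"fieldPath": f"col_{i}"} for i in range(10)])))  # 10
```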
datahub/ingestion/source/hex/api.py
CHANGED
@@ -27,6 +27,7 @@ logger = logging.getLogger(__name__)
 
 # The following models were Claude-generated from Hex API OpenAPI definition https://static.hex.site/openapi.json
 # To be exclusively used internally for the deserialization of the API response
+# Model is incomplete and fields may have not been mapped if not used in the ingestion
 
 
 class HexApiAppViewStats(BaseModel):
@@ -83,20 +84,10 @@ class HexApiUser(BaseModel):
     email: str
 
 
-class HexApiAccessType(StrEnum):
-    """Access type enum."""
-
-    NONE = "NONE"
-    VIEW = "VIEW"
-    EDIT = "EDIT"
-    FULL_ACCESS = "FULL_ACCESS"
-
-
 class HexApiUserAccess(BaseModel):
     """User access model."""
 
     user: HexApiUser
-    access: Optional[HexApiAccessType] = None
 
 
 class HexApiCollectionData(BaseModel):
@@ -109,13 +100,6 @@ class HexApiCollectionAccess(BaseModel):
     """Collection access model."""
 
     collection: HexApiCollectionData
-    access: Optional[HexApiAccessType] = None
-
-
-class HexApiAccessSettings(BaseModel):
-    """Access settings model."""
-
-    access: Optional[HexApiAccessType] = None
 
 
 class HexApiWeeklySchedule(BaseModel):
@@ -145,9 +129,6 @@ class HexApiSharing(BaseModel):
     users: Optional[List[HexApiUserAccess]] = []
     collections: Optional[List[HexApiCollectionAccess]] = []
     groups: Optional[List[Any]] = []
-    workspace: Optional[HexApiAccessSettings] = None
-    public_web: Optional[HexApiAccessSettings] = Field(default=None, alias="publicWeb")
-    support: Optional[HexApiAccessSettings] = None
 
     class Config:
         extra = "ignore"  # Allow extra fields in the JSON
datahub/ingestion/source/sigma/config.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import pydantic
+from pydantic import BaseModel, Field
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
@@ -17,6 +18,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyDict
 
 logger = logging.getLogger(__name__)
 
@@ -53,15 +55,82 @@ class Constant:
     DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"
 
 
+class WorkspaceCounts(BaseModel):
+    workbooks_count: int = 0
+    datasets_count: int = 0
+    elements_count: int = 0
+    pages_count: int = 0
+
+    def is_empty(self) -> bool:
+        return (
+            self.workbooks_count == 0
+            and self.datasets_count == 0
+            and self.elements_count == 0
+            and self.pages_count == 0
+        )
+
+    def as_obj(self) -> dict:
+        return {
+            "workbooks_count": self.workbooks_count,
+            "datasets_count": self.datasets_count,
+            "elements_count": self.elements_count,
+            "pages_count": self.pages_count,
+        }
+
+
+class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
+    type: str = "workspace"
+
+    workspace_counts: LossyDict[str, WorkspaceCounts] = Field(
+        default_factory=LossyDict,
+        description="Counts of workbooks, datasets, elements and pages in each workspace.",
+    )
+
+    def increment_workbooks_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].workbooks_count += 1
+
+    def increment_datasets_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].datasets_count += 1
+
+    def increment_elements_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].elements_count += 1
+
+    def increment_pages_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].pages_count += 1
+
+    def as_obj(self) -> dict:
+        return {
+            "filtered": self.dropped_entities.as_obj(),
+            "processed": self.processed_entities.as_obj(),
+            "workspace_counts": {
+                key: item.as_obj() for key, item in self.workspace_counts.items()
+            },
+        }
+
+
 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-    workspaces:
-
+    workspaces: SigmaWorkspaceEntityFilterReport = field(
+        default_factory=SigmaWorkspaceEntityFilterReport
+    )
     non_accessible_workspaces_count: int = 0
-
-
-
+
+    datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
+    datasets_without_workspace: int = 0
+
+    workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
+    workbooks_without_workspace: int = 0
+
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
+    empty_workspaces: List[str] = field(default_factory=list)
 
 
 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
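WorkspaceCounts is a plain pydantic model with zero defaults, so its helpers can be exercised directly. A quick hedged sanity check, assuming the class exactly as defined in the diff:

```python
# Quick sanity check of the WorkspaceCounts helpers added above; assumes the
# pydantic model exactly as defined in the diff.
counts = WorkspaceCounts()
assert counts.is_empty()          # all four counters default to 0

counts.workbooks_count += 1
counts.pages_count += 1
assert not counts.is_empty()
print(counts.as_obj())
# {'workbooks_count': 1, 'datasets_count': 0, 'elements_count': 0, 'pages_count': 1}
```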
datahub/ingestion/source/sigma/sigma.py
CHANGED
@@ -35,6 +35,7 @@ from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
     SigmaSourceConfig,
     SigmaSourceReport,
+    WorkspaceCounts,
 )
 from datahub.ingestion.source.sigma.data_classes import (
     Element,
@@ -163,7 +164,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.number_of_workspaces = len(all_workspaces)
 
         allowed_workspaces = []
         for workspace in all_workspaces:
@@ -285,6 +285,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         yield self._gen_dataset_properties(dataset_urn, dataset)
 
         if dataset.workspaceId:
+            self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
             yield from add_entity_to_container(
                 container_key=self._gen_workspace_key(dataset.workspaceId),
                 entity_type="dataset",
@@ -468,6 +469,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             ).as_workunit()
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=chart_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -525,6 +528,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         all_input_fields: List[InputFieldClass] = []
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -614,6 +618,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
         paths = workbook.path.split("/")[1:]
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -667,6 +673,15 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                     f"{workspace.name} ({workspace.workspaceId})"
                 )
             yield from self._gen_workspace_workunit(workspace)
+            if self.reporter.workspaces.workspace_counts.get(
+                workspace.workspaceId, WorkspaceCounts()
+            ).is_empty():
+                logger.warning(
+                    f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
+                )
+                self.reporter.empty_workspaces.append(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
     def get_report(self) -> SourceReport:
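The new empty-workspace check relies on dict.get() with a fresh WorkspaceCounts() as the fallback, so workspaces that never received a count are treated as empty. A small hedged illustration follows; the workspace ids and counts are made up, only the get()-with-default pattern mirrors the diff.

```python
# Small illustration of the empty-workspace check wired in above; the workspace
# ids and counts are hypothetical.
workspace_counts = {"ws-1": WorkspaceCounts(workbooks_count=2)}

for ws_id in ("ws-1", "ws-2"):
    if workspace_counts.get(ws_id, WorkspaceCounts()).is_empty():
        print(f"Workspace {ws_id} is empty")   # only ws-2 triggers the warning path
```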
datahub/ingestion/source/sigma/sigma_api.py
CHANGED
@@ -95,22 +95,22 @@ class SigmaAPI:
         return get_response
 
     def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+        if workspace_id in self.workspaces:
+            return self.workspaces[workspace_id]
+
         logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
         try:
-
-
-
-
-
-
-
-
-
-
-
-            workspace = Workspace.parse_obj(response.json())
-            self.workspaces[workspace.workspaceId] = workspace
-            return workspace
+            response = self._get_api_call(
+                f"{self.config.api_url}/workspaces/{workspace_id}"
+            )
+            if response.status_code == 403:
+                logger.debug(f"Workspace {workspace_id} not accessible.")
+                self.report.non_accessible_workspaces_count += 1
+                return None
+            response.raise_for_status()
+            workspace = Workspace.parse_obj(response.json())
+            self.workspaces[workspace.workspaceId] = workspace
+            return workspace
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"
@@ -187,7 +187,9 @@
     @functools.lru_cache
     def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
         logger.debug(f"Fetching file metadata with type {file_type}.")
-        file_url = url =
+        file_url = url = (
+            f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        )
         try:
             files_metadata: Dict[str, File] = {}
             while True:
@@ -225,31 +227,50 @@
                for dataset_dict in response_dict[Constant.ENTRIES]:
                    dataset = SigmaDataset.parse_obj(dataset_dict)
 
-                    if dataset.datasetId in dataset_files_metadata:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if dataset.datasetId not in dataset_files_metadata:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                        )
+                        continue
+
+                    dataset.workspaceId = dataset_files_metadata[
+                        dataset.datasetId
+                    ].workspaceId
+
+                    dataset.path = dataset_files_metadata[dataset.datasetId].path
+                    dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                    workspace = None
+                    if dataset.workspaceId:
+                        workspace = self.get_workspace(dataset.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.datasets.processed(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                            datasets.append(dataset)
+                        else:
+                            self.report.datasets.dropped(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for dataset we can consider it as shared entity
+                        self.report.datasets_without_workspace += 1
+                        self.report.datasets.processed(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
+                        datasets.append(dataset)
+                    else:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
 
                if response_dict[Constant.NEXTPAGE]:
                    url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                else:
                    break
-
+
            return datasets
        except Exception as e:
            self._log_http_error(
@@ -381,34 +402,54 @@
                for workbook_dict in response_dict[Constant.ENTRIES]:
                    workbook = Workbook.parse_obj(workbook_dict)
 
-                    if workbook.workbookId in workbook_files_metadata:
-
-
-
-
-
-                            workbook.workbookId
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if workbook.workbookId not in workbook_files_metadata:
+                        # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                        # return file metadata when the user has access via admin permissions. In
+                        # those cases, the user associated with the token needs to be manually added
+                        # to the workspace.
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                        )
+                        continue
+
+                    workbook.workspaceId = workbook_files_metadata[
+                        workbook.workbookId
+                    ].workspaceId
+
+                    workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                    workspace = None
+                    if workbook.workspaceId:
+                        workspace = self.get_workspace(workbook.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.workbooks.processed(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                            workbook.pages = self.get_workbook_pages(workbook)
+                            workbooks.append(workbook)
+                        else:
+                            self.report.workbooks.dropped(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for workbook we can consider it as shared entity
+                        self.report.workbooks_without_workspace += 1
+                        self.report.workbooks.processed(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
+                        workbook.pages = self.get_workbook_pages(workbook)
+                        workbooks.append(workbook)
+                    else:
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
 
                if response_dict[Constant.NEXTPAGE]:
                    url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                else:
                    break
-            self.report.number_of_workbooks = len(workbooks)
            return workbooks
        except Exception as e:
            self._log_http_error(
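get_workspace() now short-circuits on the in-memory workspaces dict and treats a 403 response as "not accessible" rather than an error. A hedged standalone sketch of that cache-then-fetch shape follows; the helper names and payloads are illustrative, not the SigmaAPI internals.

```python
# Standalone sketch of the cache-then-fetch pattern used by get_workspace() above.
# fetch_json and the payload shape are illustrative assumptions.
from typing import Callable, Optional, Tuple

_workspace_cache: dict = {}

def get_workspace_cached(
    workspace_id: str,
    fetch_json: Callable[[str], Tuple[int, dict]],
) -> Optional[dict]:
    if workspace_id in _workspace_cache:        # repeated lookups hit the cache
        return _workspace_cache[workspace_id]
    status, payload = fetch_json(f"/workspaces/{workspace_id}")
    if status == 403:                           # inaccessible: count and skip, don't raise
        return None
    _workspace_cache[workspace_id] = payload
    return payload

# usage with a stubbed fetcher
print(get_workspace_cached("ws-1", lambda path: (200, {"workspaceId": "ws-1"})))
```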
datahub/ingestion/source/snowflake/snowflake_queries.py
CHANGED
@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-                res["query_text"],
+                res["query_text"],
+                self.identifiers.platform,
+                fast=True,
+                secondary_id=res["query_secondary_fingerprint"],
             ),
             query_text=res["query_text"],
             upstreams=upstreams,
@@ -654,7 +657,17 @@ WITH
 fingerprinted_queries as (
     SELECT *,
         -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+                -- Extract project id and hash it
+                THEN CAST(HASH(
+                    REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                    REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+                ) AS VARCHAR)
+            ELSE NULL
+        END as query_secondary_fingerprint
     FROM
         snowflake.account_usage.query_history
     WHERE
@@ -670,11 +683,11 @@ fingerprinted_queries as (
         {time_bucket_size},
         CONVERT_TIMEZONE('UTC', start_time)
     ) AS bucket_start_time,
-    COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+    COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
 FROM
     fingerprinted_queries
 QUALIFY
-    ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+    ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
 )
 , raw_access_history AS (
     SELECT
@@ -714,6 +727,7 @@ fingerprinted_queries as (
     q.bucket_start_time,
     q.query_id,
     q.query_fingerprint,
+    q.query_secondary_fingerprint,
     q.query_count,
     q.session_id AS "SESSION_ID",
     q.start_time AS "QUERY_START_TIME",
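The new query_secondary_fingerprint folds a hash of the Hex project_id/context found in the query comment into deduplication, so otherwise identical queries issued from different Hex projects are no longer collapsed into one. A rough Python equivalent of that CASE expression is sketched below; the sample query text is made up, and only the '-- Hex query metadata:' marker and the project_id/context keys come from the diff.

```python
# Rough Python equivalent of the secondary-fingerprint CASE expression above.
# The sample query text is hypothetical; hashing details differ from Snowflake's HASH().
import re
from typing import Optional

def secondary_fingerprint(query_text: str) -> Optional[str]:
    if "-- Hex query metadata:" not in query_text:
        return None
    project = re.search(r'"project_id"\s*:\s*"([^"]+)"', query_text)
    context = re.search(r'"context"\s*:\s*"([^"]+)"', query_text)
    return str(hash((
        project.group(1) if project else None,
        context.group(1) if context else None,
    )))

sample = 'SELECT 1 -- Hex query metadata: {"project_id": "abc-123", "context": "SCHEDULED_RUN"}'
print(secondary_fingerprint(sample))       # stable within one process for the same project/context pair
print(secondary_fingerprint("SELECT 1"))   # None: non-Hex queries keep only the primary fingerprint
```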
datahub/ingestion/source/snowflake/snowflake_query.py
CHANGED
@@ -1000,4 +1000,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     from_clause = (
         f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
     )
-    return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+    return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
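Quoting the database identifier keeps the generated SHOW STREAMS statement valid for database names that are case-sensitive or contain characters outside Snowflake's unquoted-identifier rules. A small illustration, with a hypothetical database name:

```python
# Illustration of the quoting change above; db_name is a hypothetical value that
# would break as an unquoted Snowflake identifier.
db_name, limit, from_clause = "my-analytics-db", 10, ""
stmt = f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
print(stmt)  # SHOW STREAMS IN DATABASE "my-analytics-db" LIMIT 10 ;
```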
datahub/sql_parsing/sqlglot_utils.py
CHANGED
@@ -257,7 +257,10 @@ def generate_hash(text: str) -> str:
 
 
 def get_query_fingerprint_debug(
-    expression: sqlglot.exp.ExpOrStr,
+    expression: sqlglot.exp.ExpOrStr,
+    platform: DialectOrStr,
+    fast: bool = False,
+    secondary_id: Optional[str] = None,
 ) -> Tuple[str, Optional[str]]:
     try:
         if not fast:
@@ -272,16 +275,18 @@ def get_query_fingerprint_debug(
         logger.debug("Failed to generalize query for fingerprinting: %s", e)
         expression_sql = None
 
-
-
-
-
-    )
+    text = expression_sql or _expression_to_string(expression, platform=platform)
+    if secondary_id:
+        text = text + " -- " + secondary_id
+    fingerprint = generate_hash(text=text)
     return fingerprint, expression_sql
 
 
 def get_query_fingerprint(
-    expression: sqlglot.exp.ExpOrStr,
+    expression: sqlglot.exp.ExpOrStr,
+    platform: DialectOrStr,
+    fast: bool = False,
+    secondary_id: Optional[str] = None,
 ) -> str:
     """Get a fingerprint for a SQL query.
 
@@ -298,12 +303,15 @@ def get_query_fingerprint(
     Args:
         expression: The SQL query to fingerprint.
         platform: The SQL dialect to use.
+        secondary_id: An optional additional id string to included in the final fingerprint.
 
     Returns:
         The fingerprint for the SQL query.
     """
 
-    return get_query_fingerprint_debug(
+    return get_query_fingerprint_debug(
+        expression=expression, platform=platform, fast=fast, secondary_id=secondary_id
+    )[0]
 
 
 @functools.lru_cache(maxsize=FORMAT_QUERY_CACHE_SIZE)
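With the extended signature, callers pass the dialect explicitly and can mix in a secondary id that changes the resulting fingerprint. A hedged usage sketch, assuming the module import path as shipped in this wheel; the query text and secondary id are made up:

```python
# Hedged usage sketch of the extended fingerprint API above; the query text and
# secondary_id value are hypothetical.
from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

fp_plain = get_query_fingerprint("SELECT * FROM orders", platform="snowflake", fast=True)
fp_tagged = get_query_fingerprint(
    "SELECT * FROM orders",
    platform="snowflake",
    fast=True,
    secondary_id="hex-project-abc-123",  # e.g. derived from Hex query metadata
)
assert fp_plain != fp_tagged  # the secondary id is appended to the hashed text
```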