acryl-datahub 1.0.0.2rc5__py3-none-any.whl → 1.0.0.3rc2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
Files changed (24)
  1. {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/METADATA +2516 -2516
  2. {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/RECORD +24 -23
  3. datahub/_version.py +1 -1
  4. datahub/emitter/mcp.py +5 -1
  5. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  6. datahub/ingestion/source/hex/api.py +1 -20
  7. datahub/ingestion/source/mlflow.py +19 -6
  8. datahub/ingestion/source/powerbi/config.py +12 -0
  9. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  10. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  11. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  12. datahub/ingestion/source/sigma/config.py +75 -6
  13. datahub/ingestion/source/sigma/sigma.py +16 -1
  14. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  15. datahub/ingestion/source/snowflake/snowflake_config.py +1 -0
  16. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  17. datahub/ingestion/source/snowflake/snowflake_query.py +1 -1
  18. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  19. datahub/sql_parsing/sqlglot_utils.py +16 -8
  20. datahub/testing/mcp_diff.py +15 -2
  21. {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/WHEEL +0 -0
  22. {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/entry_points.txt +0 -0
  23. {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/licenses/LICENSE +0 -0
  24. {acryl_datahub-1.0.0.2rc5.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/pattern_handler.py

@@ -29,6 +29,12 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     Lineage,
     ReferencedTable,
 )
+from datahub.ingestion.source.powerbi.m_query.odbc import (
+    extract_dsn,
+    extract_platform,
+    extract_server,
+    normalize_platform_name,
+)
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
 from datahub.sql_parsing.sqlglot_lineage import (

@@ -155,6 +161,7 @@ class AbstractLineage(ABC):
                 tree_function.token_values(arg_list)
             ),
         )
+        logger.debug(f"DB Details: {arguments}")
 
         if len(arguments) < 2:
             logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")

@@ -940,6 +947,147 @@ class NativeQueryLineage(AbstractLineage):
         )
 
 
+class OdbcLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} "
+            f"data-access function detail {data_access_func_detail}"
+        )
+
+        connect_string, _ = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+
+        if not connect_string:
+            self.reporter.warning(
+                title="Can not extract ODBC connect string",
+                message="Can not extract ODBC connect string from data access function. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-access-func-detail={data_access_func_detail}",
+            )
+            return Lineage.empty()
+
+        logger.debug(f"ODBC connect string: {connect_string}")
+        data_platform, powerbi_platform = extract_platform(connect_string)
+        server_name = extract_server(connect_string)
+
+        if not data_platform:
+            dsn = extract_dsn(connect_string)
+            if dsn:
+                logger.debug(f"Extracted DSN: {dsn}")
+                server_name = dsn
+            if dsn and self.config.dsn_to_platform_name:
+                logger.debug(f"Attempting to map DSN {dsn} to platform")
+                name = self.config.dsn_to_platform_name.get(dsn)
+                if name:
+                    logger.debug(f"Found DSN {dsn} mapped to platform {name}")
+                    data_platform, powerbi_platform = normalize_platform_name(name)
+
+        if not data_platform or not powerbi_platform:
+            self.reporter.warning(
+                title="Can not determine ODBC platform",
+                message="Can not determine platform from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+
+        platform_pair: DataPlatformPair = self.create_platform_pair(
+            data_platform, powerbi_platform
+        )
+
+        if not server_name and self.config.server_to_platform_instance:
+            self.reporter.warning(
+                title="Can not determine ODBC server name",
+                message="Can not determine server name with server_to_platform_instance mapping. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+        elif not server_name:
+            server_name = "unknown"
+
+        database_name = None
+        schema_name = None
+        table_name = None
+        qualified_table_name = None
+
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
+
+        while temp_accessor:
+            logger.debug(
+                f"identifier = {temp_accessor.identifier} items = {temp_accessor.items}"
+            )
+            if temp_accessor.items.get("Kind") == "Database":
+                database_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Schema":
+                schema_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Table":
+                table_name = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        if (
+            database_name is not None
+            and schema_name is not None
+            and table_name is not None
+        ):
+            qualified_table_name = f"{database_name}.{schema_name}.{table_name}"
+        elif database_name is not None and table_name is not None:
+            qualified_table_name = f"{database_name}.{table_name}"
+
+        if not qualified_table_name:
+            self.reporter.warning(
+                title="Can not determine qualified table name",
+                message="Can not determine qualified table name for ODBC data source. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-platform={data_platform}",
+            )
+            logger.warning(
+                f"Can not determine qualified table name for ODBC data source {data_platform} "
+                f"table {self.table.full_name}."
+            )
+            return Lineage.empty()
+
+        logger.debug(
+            f"ODBC Platform {data_platform} found qualified table name {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=platform_pair,
+            server=server_name,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=platform_pair,
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    @staticmethod
+    def create_platform_pair(
+        data_platform: str, powerbi_platform: str
+    ) -> DataPlatformPair:
+        return DataPlatformPair(data_platform, powerbi_platform)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ODBC.value
+
+
 class SupportedPattern(Enum):
     DATABRICKS_QUERY = (
         DatabricksLineage,

@@ -991,6 +1139,11 @@ class SupportedPattern(Enum):
         FunctionName.NATIVE_QUERY,
     )
 
+    ODBC = (
+        OdbcLineage,
+        FunctionName.ODBC_DATA_ACCESS,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
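The new OdbcLineage handler relies on helpers from the new m_query/odbc.py module (extract_platform, extract_server, extract_dsn, normalize_platform_name), whose bodies are not shown in this diff. Below is a minimal sketch of how ODBC connect-string parsing of this kind could work, assuming semicolon-separated key=value pairs; these are illustrative stand-ins, not the actual module contents.

# Illustrative sketch only: the real helpers live in
# datahub/ingestion/source/powerbi/m_query/odbc.py and may differ.
from typing import Dict, Optional, Tuple

def _parse_connect_string(connect_string: str) -> Dict[str, str]:
    # ODBC connect strings are semicolon-separated key=value pairs, e.g.
    # "Driver={Snowflake};Server=acme.snowflakecomputing.com;Database=SALES"
    pairs: Dict[str, str] = {}
    for part in connect_string.split(";"):
        if "=" in part:
            key, value = part.split("=", 1)
            pairs[key.strip().lower()] = value.strip().strip("{}")
    return pairs

def extract_server(connect_string: str) -> Optional[str]:
    pairs = _parse_connect_string(connect_string)
    return pairs.get("server") or pairs.get("host")

def extract_dsn(connect_string: str) -> Optional[str]:
    return _parse_connect_string(connect_string).get("dsn")

def extract_platform(connect_string: str) -> Tuple[Optional[str], Optional[str]]:
    # Guess the platform from the Driver keyword; returns a
    # (datahub_platform, powerbi_platform) pair, or (None, None) if unknown.
    driver = _parse_connect_string(connect_string).get("driver", "").lower()
    if "snowflake" in driver:
        return "snowflake", "Snowflake"
    if "postgres" in driver:
        return "postgres", "PostgreSQL"
    return None, None

When no driver-based platform can be determined, create_lineage falls back to the DSN and the PowerBI source's new dsn_to_platform_name mapping (added to powerbi/config.py in this release), so a recipe can map a DSN such as "finance_dw" to a platform name like "snowflake"; the exact DSN and platform values here are hypothetical.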
datahub/ingestion/source/sigma/config.py

@@ -1,8 +1,9 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import pydantic
+from pydantic import BaseModel, Field
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (

@@ -17,6 +18,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyDict
 
 logger = logging.getLogger(__name__)
 

@@ -53,15 +55,82 @@ class Constant:
     DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"
 
 
+class WorkspaceCounts(BaseModel):
+    workbooks_count: int = 0
+    datasets_count: int = 0
+    elements_count: int = 0
+    pages_count: int = 0
+
+    def is_empty(self) -> bool:
+        return (
+            self.workbooks_count == 0
+            and self.datasets_count == 0
+            and self.elements_count == 0
+            and self.pages_count == 0
+        )
+
+    def as_obj(self) -> dict:
+        return {
+            "workbooks_count": self.workbooks_count,
+            "datasets_count": self.datasets_count,
+            "elements_count": self.elements_count,
+            "pages_count": self.pages_count,
+        }
+
+
+class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
+    type: str = "workspace"
+
+    workspace_counts: LossyDict[str, WorkspaceCounts] = Field(
+        default_factory=LossyDict,
+        description="Counts of workbooks, datasets, elements and pages in each workspace.",
+    )
+
+    def increment_workbooks_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].workbooks_count += 1
+
+    def increment_datasets_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].datasets_count += 1
+
+    def increment_elements_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].elements_count += 1
+
+    def increment_pages_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].pages_count += 1
+
+    def as_obj(self) -> dict:
+        return {
+            "filtered": self.dropped_entities.as_obj(),
+            "processed": self.processed_entities.as_obj(),
+            "workspace_counts": {
+                key: item.as_obj() for key, item in self.workspace_counts.items()
+            },
+        }
+
+
 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
-    number_of_workspaces: Optional[int] = None
+    workspaces: SigmaWorkspaceEntityFilterReport = field(
+        default_factory=SigmaWorkspaceEntityFilterReport
+    )
     non_accessible_workspaces_count: int = 0
-    shared_entities_count: int = 0
-    number_of_datasets: int = 0
-    number_of_workbooks: int = 0
+
+    datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
+    datasets_without_workspace: int = 0
+
+    workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
+    workbooks_without_workspace: int = 0
+
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
+    empty_workspaces: List[str] = field(default_factory=list)
 
 
 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
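The new report plumbing tracks per-workspace counts of workbooks, datasets, elements, and pages so that empty workspaces can be flagged at the end of ingestion. Below is a small standalone sketch of the same counting pattern, using a plain dict and dataclasses instead of DataHub's LossyDict and EntityFilterReport (those substitutions are assumptions made to keep the example self-contained).

from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class WorkspaceCounts:
    workbooks_count: int = 0
    datasets_count: int = 0
    elements_count: int = 0
    pages_count: int = 0

    def is_empty(self) -> bool:
        return not (
            self.workbooks_count or self.datasets_count
            or self.elements_count or self.pages_count
        )

@dataclass
class WorkspaceReport:
    # Keyed by workspace id; the real report uses LossyDict to bound memory.
    counts: Dict[str, WorkspaceCounts] = field(
        default_factory=lambda: defaultdict(WorkspaceCounts)
    )

    def increment_workbooks_count(self, workspace_id: str) -> None:
        self.counts[workspace_id].workbooks_count += 1

report = WorkspaceReport()
report.increment_workbooks_count("ws-123")
assert not report.counts["ws-123"].is_empty()
assert report.counts["ws-456"].is_empty()  # an untouched workspace reads as empty

The workspace ids here are placeholders; in the source itself the increments happen as each dataset, workbook, element, and page work unit is generated, as the sigma.py hunks below show.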
datahub/ingestion/source/sigma/sigma.py

@@ -35,6 +35,7 @@ from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
     SigmaSourceConfig,
     SigmaSourceReport,
+    WorkspaceCounts,
 )
 from datahub.ingestion.source.sigma.data_classes import (
     Element,

@@ -163,7 +164,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.number_of_workspaces = len(all_workspaces)
 
         allowed_workspaces = []
         for workspace in all_workspaces:

@@ -285,6 +285,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         yield self._gen_dataset_properties(dataset_urn, dataset)
 
         if dataset.workspaceId:
+            self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
             yield from add_entity_to_container(
                 container_key=self._gen_workspace_key(dataset.workspaceId),
                 entity_type="dataset",

@@ -468,6 +469,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         ).as_workunit()
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=chart_urn,
                 parent_entity_urn=builder.make_container_urn(

@@ -525,6 +528,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         all_input_fields: List[InputFieldClass] = []
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(

@@ -614,6 +618,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
         paths = workbook.path.split("/")[1:]
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(

@@ -667,6 +673,15 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 f"{workspace.name} ({workspace.workspaceId})"
             )
             yield from self._gen_workspace_workunit(workspace)
+            if self.reporter.workspaces.workspace_counts.get(
+                workspace.workspaceId, WorkspaceCounts()
+            ).is_empty():
+                logger.warning(
+                    f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
+                )
+                self.reporter.empty_workspaces.append(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
     def get_report(self) -> SourceReport:
datahub/ingestion/source/sigma/sigma_api.py

@@ -95,22 +95,22 @@ class SigmaAPI:
         return get_response
 
     def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+        if workspace_id in self.workspaces:
+            return self.workspaces[workspace_id]
+
         logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
         try:
-            if workspace_id in self.workspaces:
-                return self.workspaces[workspace_id]
-            else:
-                response = self._get_api_call(
-                    f"{self.config.api_url}/workspaces/{workspace_id}"
-                )
-                if response.status_code == 403:
-                    logger.debug(f"Workspace {workspace_id} not accessible.")
-                    self.report.non_accessible_workspaces_count += 1
-                    return None
-                response.raise_for_status()
-                workspace = Workspace.parse_obj(response.json())
-                self.workspaces[workspace.workspaceId] = workspace
-                return workspace
+            response = self._get_api_call(
+                f"{self.config.api_url}/workspaces/{workspace_id}"
+            )
+            if response.status_code == 403:
+                logger.debug(f"Workspace {workspace_id} not accessible.")
+                self.report.non_accessible_workspaces_count += 1
+                return None
+            response.raise_for_status()
+            workspace = Workspace.parse_obj(response.json())
+            self.workspaces[workspace.workspaceId] = workspace
+            return workspace
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"

@@ -187,7 +187,9 @@ class SigmaAPI:
     @functools.lru_cache
     def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
         logger.debug(f"Fetching file metadata with type {file_type}.")
-        file_url = url = f"{self.config.api_url}/files?typeFilters={file_type}"
+        file_url = url = (
+            f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        )
         try:
             files_metadata: Dict[str, File] = {}
             while True:

@@ -225,31 +227,50 @@ class SigmaAPI:
                 for dataset_dict in response_dict[Constant.ENTRIES]:
                     dataset = SigmaDataset.parse_obj(dataset_dict)
 
-                    if dataset.datasetId in dataset_files_metadata:
-                        dataset.path = dataset_files_metadata[dataset.datasetId].path
-                        dataset.badge = dataset_files_metadata[dataset.datasetId].badge
-
-                        workspace_id = dataset_files_metadata[
-                            dataset.datasetId
-                        ].workspaceId
-                        if workspace_id:
-                            dataset.workspaceId = workspace_id
-                        workspace = self.get_workspace(dataset.workspaceId)
-                        if workspace:
-                            if self.config.workspace_pattern.allowed(
-                                workspace.name
-                            ):
-                                datasets.append(dataset)
-                        elif self.config.ingest_shared_entities:
-                            # If no workspace for dataset we can consider it as shared entity
-                            self.report.shared_entities_count += 1
-                            datasets.append(dataset)
+                    if dataset.datasetId not in dataset_files_metadata:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                        )
+                        continue
+
+                    dataset.workspaceId = dataset_files_metadata[
+                        dataset.datasetId
+                    ].workspaceId
+
+                    dataset.path = dataset_files_metadata[dataset.datasetId].path
+                    dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                    workspace = None
+                    if dataset.workspaceId:
+                        workspace = self.get_workspace(dataset.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.datasets.processed(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                            datasets.append(dataset)
+                        else:
+                            self.report.datasets.dropped(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for dataset we can consider it as shared entity
+                        self.report.datasets_without_workspace += 1
+                        self.report.datasets.processed(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
+                        datasets.append(dataset)
+                    else:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_datasets = len(datasets)
+
            return datasets
         except Exception as e:
             self._log_http_error(

@@ -381,34 +402,54 @@ class SigmaAPI:
                 for workbook_dict in response_dict[Constant.ENTRIES]:
                     workbook = Workbook.parse_obj(workbook_dict)
 
-                    if workbook.workbookId in workbook_files_metadata:
-                        workbook.badge = workbook_files_metadata[
-                            workbook.workbookId
-                        ].badge
-
-                        workspace_id = workbook_files_metadata[
-                            workbook.workbookId
-                        ].workspaceId
-                        if workspace_id:
-                            workbook.workspaceId = workspace_id
-                        workspace = self.get_workspace(workbook.workspaceId)
-                        if workspace:
-                            if self.config.workspace_pattern.allowed(
-                                workspace.name
-                            ):
-                                workbook.pages = self.get_workbook_pages(workbook)
-                                workbooks.append(workbook)
-                        elif self.config.ingest_shared_entities:
-                            # If no workspace for workbook we can consider it as shared entity
-                            self.report.shared_entities_count += 1
-                            workbook.pages = self.get_workbook_pages(workbook)
-                            workbooks.append(workbook)
+                    if workbook.workbookId not in workbook_files_metadata:
+                        # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                        # return file metadata when the user has access via admin permissions. In
+                        # those cases, the user associated with the token needs to be manually added
+                        # to the workspace.
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                        )
+                        continue
+
+                    workbook.workspaceId = workbook_files_metadata[
+                        workbook.workbookId
+                    ].workspaceId
+
+                    workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                    workspace = None
+                    if workbook.workspaceId:
+                        workspace = self.get_workspace(workbook.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.workbooks.processed(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                            workbook.pages = self.get_workbook_pages(workbook)
+                            workbooks.append(workbook)
+                        else:
+                            self.report.workbooks.dropped(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for workbook we can consider it as shared entity
+                        self.report.workbooks_without_workspace += 1
+                        self.report.workbooks.processed(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
+                        workbook.pages = self.get_workbook_pages(workbook)
+                        workbooks.append(workbook)
+                    else:
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_workbooks = len(workbooks)
             return workbooks
         except Exception as e:
             self._log_http_error(
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -301,6 +301,7 @@ class SnowflakeV2Config(
        default=AllowDenyPattern.allow_all(),
        description=(
            "List of regex patterns for structured properties to include in ingestion."
+            " Applied to tags with form `<database>.<schema>.<tag_name>`."
            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
        ),
    )
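Together with the switch from tag.tag_identifier() to tag._id_prefix_as_str() later in this diff, the pattern is matched against the `<database>.<schema>.<tag_name>` prefix rather than an identifier that includes the tag value. A small sketch of what such a pattern looks like, using DataHub's AllowDenyPattern; the database, schema, and tag names are made up:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical pattern: only ingest tags defined under ANALYTICS.GOVERNANCE
# as structured properties; everything else is skipped.
pattern = AllowDenyPattern(allow=[r"ANALYTICS\.GOVERNANCE\..*"])

assert pattern.allowed("ANALYTICS.GOVERNANCE.PII_LEVEL")
assert not pattern.allowed("RAW.STAGING.TEMP_TAG")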
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
            # here
            query_id=get_query_fingerprint(
-                res["query_text"], self.identifiers.platform, fast=True
+                res["query_text"],
+                self.identifiers.platform,
+                fast=True,
+                secondary_id=res["query_secondary_fingerprint"],
            ),
            query_text=res["query_text"],
            upstreams=upstreams,

@@ -654,7 +657,17 @@ WITH
 fingerprinted_queries as (
     SELECT *,
         -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END as query_secondary_fingerprint
     FROM
         snowflake.account_usage.query_history
     WHERE

@@ -670,11 +683,11 @@ fingerprinted_queries as (
            {time_bucket_size},
            CONVERT_TIMEZONE('UTC', start_time)
        ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
     FROM
         fingerprinted_queries
     QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
 )
 , raw_access_history AS (
     SELECT

@@ -714,6 +727,7 @@ fingerprinted_queries as (
        q.bucket_start_time,
        q.query_id,
        q.query_fingerprint,
+        q.query_secondary_fingerprint,
        q.query_count,
        q.session_id AS "SESSION_ID",
        q.start_time AS "QUERY_START_TIME",
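The new query_secondary_fingerprint column hashes the project_id and context values embedded in a trailing "-- Hex query metadata:" comment, so otherwise-identical Hex-issued queries from different projects are no longer deduplicated into a single fingerprint. Below is a minimal Python sketch of the same extraction the REGEXP_SUBSTR calls perform; the sample metadata payload is made up, and the SQL additionally folds the two values through HASH() into a VARCHAR rather than returning them raw.

import re
from typing import Optional, Tuple

# Illustrative Hex-style trailing comment; the real payload format may differ.
query_text = """SELECT * FROM sales.orders
-- Hex query metadata: {"project_id": "abc123", "context": "SCHEDULED_RUN"}"""

def hex_secondary_fingerprint(
    query_text: str,
) -> Optional[Tuple[Optional[str], Optional[str]]]:
    # Mirrors the SQL CASE: only queries carrying the Hex marker get a secondary id.
    if "-- Hex query metadata:" not in query_text:
        return None
    project_id = re.search(r'"project_id"\s*:\s*"([^"]+)"', query_text)
    context = re.search(r'"context"\s*:\s*"([^"]+)"', query_text)
    return (
        project_id.group(1) if project_id else None,
        context.group(1) if context else None,
    )

print(hex_secondary_fingerprint(query_text))  # ('abc123', 'SCHEDULED_RUN')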
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -1000,4 +1000,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
        from_clause = (
            f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
        )
-        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+        return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
datahub/ingestion/source/snowflake/snowflake_tag.py

@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.structured import (
     StructuredPropertyDefinition,
 )
+from datahub.metadata.schema_classes import ChangeTypeClass
 from datahub.metadata.urns import (
     ContainerUrn,
     DatasetUrn,

@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
     def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
         for tag in self.data_dictionary.get_all_tags():
             if not self.config.structured_property_pattern.allowed(
-                tag.tag_identifier()
+                tag._id_prefix_as_str()
             ):
                 continue
             if self.config.extract_tags_as_structured_properties:

@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
            yield MetadataChangeProposalWrapper(
                entityUrn=urn,
                aspect=aspect,
+                changeType=ChangeTypeClass.CREATE,
+                headers={"If-None-Match": "*"},
            ).as_workunit()
 
     def _get_tags_on_object_with_propagation(
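The structured-property template proposals are now emitted with changeType=CREATE plus an If-None-Match: "*" header, which asks the server to create the aspect only if it does not already exist, so a re-run of ingestion does not overwrite definitions that were edited in DataHub. A minimal sketch of emitting such a conditional-create proposal, relying on the headers support touched in datahub/emitter/mcp.py in this release; the urn and aspect are placeholders:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ChangeTypeClass, StatusClass

# Placeholder urn and aspect, purely for illustration.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
    aspect=StatusClass(removed=False),
    changeType=ChangeTypeClass.CREATE,
    headers={"If-None-Match": "*"},  # create only if the aspect is not already present
)
workunit = mcp.as_workunit()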