acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
@@ -95,22 +95,22 @@ class SigmaAPI:
         return get_response
 
     def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+        if workspace_id in self.workspaces:
+            return self.workspaces[workspace_id]
+
         logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
         try:
-            if workspace_id in self.workspaces:
-                return self.workspaces[workspace_id]
-            else:
-                response = self._get_api_call(
-                    f"{self.config.api_url}/workspaces/{workspace_id}"
-                )
-                if response.status_code == 403:
-                    logger.debug(f"Workspace {workspace_id} not accessible.")
-                    self.report.non_accessible_workspaces_count += 1
-                    return None
-                response.raise_for_status()
-                workspace = Workspace.parse_obj(response.json())
-                self.workspaces[workspace.workspaceId] = workspace
-                return workspace
+            response = self._get_api_call(
+                f"{self.config.api_url}/workspaces/{workspace_id}"
+            )
+            if response.status_code == 403:
+                logger.debug(f"Workspace {workspace_id} not accessible.")
+                self.report.non_accessible_workspaces_count += 1
+                return None
+            response.raise_for_status()
+            workspace = Workspace.parse_obj(response.json())
+            self.workspaces[workspace.workspaceId] = workspace
+            return workspace
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"
@@ -187,7 +187,9 @@ class SigmaAPI:
     @functools.lru_cache
     def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
         logger.debug(f"Fetching file metadata with type {file_type}.")
-        file_url = url = f"{self.config.api_url}/files?typeFilters={file_type}"
+        file_url = url = (
+            f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        )
         try:
             files_metadata: Dict[str, File] = {}
             while True:
@@ -225,31 +227,50 @@ class SigmaAPI:
                 for dataset_dict in response_dict[Constant.ENTRIES]:
                     dataset = SigmaDataset.parse_obj(dataset_dict)
 
-                    if dataset.datasetId in dataset_files_metadata:
-                        dataset.path = dataset_files_metadata[dataset.datasetId].path
-                        dataset.badge = dataset_files_metadata[dataset.datasetId].badge
-
-                        workspace_id = dataset_files_metadata[
-                            dataset.datasetId
-                        ].workspaceId
-                        if workspace_id:
-                            dataset.workspaceId = workspace_id
-                            workspace = self.get_workspace(dataset.workspaceId)
-                            if workspace:
-                                if self.config.workspace_pattern.allowed(
-                                    workspace.name
-                                ):
-                                    datasets.append(dataset)
-                        elif self.config.ingest_shared_entities:
-                            # If no workspace for dataset we can consider it as shared entity
-                            self.report.shared_entities_count += 1
-                            datasets.append(dataset)
+                    if dataset.datasetId not in dataset_files_metadata:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                        )
+                        continue
+
+                    dataset.workspaceId = dataset_files_metadata[
+                        dataset.datasetId
+                    ].workspaceId
+
+                    dataset.path = dataset_files_metadata[dataset.datasetId].path
+                    dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                    workspace = None
+                    if dataset.workspaceId:
+                        workspace = self.get_workspace(dataset.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.datasets.processed(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                            datasets.append(dataset)
+                        else:
+                            self.report.datasets.dropped(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for dataset we can consider it as shared entity
+                        self.report.datasets_without_workspace += 1
+                        self.report.datasets.processed(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
+                        datasets.append(dataset)
+                    else:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_datasets = len(datasets)
+
             return datasets
         except Exception as e:
             self._log_http_error(
@@ -381,34 +402,54 @@ class SigmaAPI:
                 for workbook_dict in response_dict[Constant.ENTRIES]:
                     workbook = Workbook.parse_obj(workbook_dict)
 
-                    if workbook.workbookId in workbook_files_metadata:
-                        workbook.badge = workbook_files_metadata[
-                            workbook.workbookId
-                        ].badge
-
-                        workspace_id = workbook_files_metadata[
-                            workbook.workbookId
-                        ].workspaceId
-                        if workspace_id:
-                            workbook.workspaceId = workspace_id
-                            workspace = self.get_workspace(workbook.workspaceId)
-                            if workspace:
-                                if self.config.workspace_pattern.allowed(
-                                    workspace.name
-                                ):
-                                    workbook.pages = self.get_workbook_pages(workbook)
-                                    workbooks.append(workbook)
-                        elif self.config.ingest_shared_entities:
-                            # If no workspace for workbook we can consider it as shared entity
-                            self.report.shared_entities_count += 1
-                            workbook.pages = self.get_workbook_pages(workbook)
-                            workbooks.append(workbook)
+                    if workbook.workbookId not in workbook_files_metadata:
+                        # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                        # return file metadata when the user has access via admin permissions. In
+                        # those cases, the user associated with the token needs to be manually added
+                        # to the workspace.
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                        )
+                        continue
+
+                    workbook.workspaceId = workbook_files_metadata[
+                        workbook.workbookId
+                    ].workspaceId
+
+                    workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                    workspace = None
+                    if workbook.workspaceId:
+                        workspace = self.get_workspace(workbook.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.workbooks.processed(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                            workbook.pages = self.get_workbook_pages(workbook)
+                            workbooks.append(workbook)
+                        else:
+                            self.report.workbooks.dropped(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for workbook we can consider it as shared entity
+                        self.report.workbooks_without_workspace += 1
+                        self.report.workbooks.processed(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
+                        workbook.pages = self.get_workbook_pages(workbook)
+                        workbooks.append(workbook)
+                    else:
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_workbooks = len(workbooks)
             return workbooks
         except Exception as e:
             self._log_http_error(
@@ -1,6 +1,5 @@
 import json
 import logging
-import textwrap
 from dataclasses import dataclass
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
@@ -613,6 +612,10 @@ class SlackSource(StatefulIngestionSourceBase):
             ),
         )
 
+    @retry(
+        wait=wait_exponential(multiplier=2, min=4, max=60),
+        before_sleep=before_sleep_log(logger, logging.ERROR, True),
+    )
     def get_user_to_be_updated(
         self,
     ) -> Iterable[Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
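
The retry, wait_exponential, and before_sleep_log names added above match the tenacity library's API (the removed get_user_to_be_updated_oss method below carried the same decorator). A minimal standalone sketch of the same exponential-backoff pattern, with a hypothetical flaky_call and an added stop condition that the decorator in the diff does not have:

import logging

from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential

logger = logging.getLogger(__name__)

_attempts = {"n": 0}

@retry(
    # Exponential backoff between attempts, clamped to the [4s, 60s] window.
    wait=wait_exponential(multiplier=2, min=4, max=60),
    # Log (with traceback) before each sleep, mirroring the decorator above.
    before_sleep=before_sleep_log(logger, logging.ERROR, True),
    # Added for the example; the decorator in the diff retries indefinitely.
    stop=stop_after_attempt(5),
)
def flaky_call() -> str:
    _attempts["n"] += 1
    if _attempts["n"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

if __name__ == "__main__":
    print(flaky_call())  # succeeds on the third attempt, after two backoff sleeps
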
@@ -634,56 +637,5 @@ class SlackSource(StatefulIngestionSourceBase):
                 if user_obj.email is not None:
                     yield (user_obj, editable_properties)
 
-    @retry(
-        wait=wait_exponential(multiplier=2, min=4, max=60),
-        before_sleep=before_sleep_log(logger, logging.ERROR, True),
-    )
-    def get_user_to_be_updated_oss(self) -> Iterable[CorpUser]:
-        graphql_query = textwrap.dedent(
-            """
-            query listUsers($input: ListUsersInput!) {
-                listUsers(input: $input) {
-                    total
-                    users {
-                        urn
-                        editableProperties {
-                            email
-                            slack
-                        }
-                    }
-                }
-            }
-            """
-        )
-        start = 0
-        count = 10
-        total = count
-
-        assert self.ctx.graph is not None
-
-        while start < total:
-            variables = {"input": {"start": start, "count": count}}
-            response = self.ctx.graph.execute_graphql(
-                query=graphql_query, variables=variables
-            )
-            list_users = response.get("listUsers", {})
-            total = list_users.get("total", 0)
-            users = list_users.get("users", [])
-            for user in users:
-                user_obj = CorpUser()
-                editable_properties = user.get("editableProperties", {})
-                user_obj.urn = user.get("urn")
-                if user_obj.urn is None:
-                    continue
-                if editable_properties is not None:
-                    user_obj.email = editable_properties.get("email")
-                if user_obj.email is None:
-                    urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
-                    if "@" in urn_id:
-                        user_obj.email = urn_id
-                if user_obj.email is not None:
-                    yield user_obj
-            start += count
-
     def get_report(self) -> SourceReport:
         return self.report
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Set
 
 import pydantic
-from pydantic import Field, SecretStr, root_validator, validator
+from pydantic import Field, root_validator, validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -301,6 +301,7 @@ class SnowflakeV2Config(
        default=AllowDenyPattern.allow_all(),
        description=(
            "List of regex patterns for structured properties to include in ingestion."
+           " Applied to tags with form `<database>.<schema>.<tag_name>`."
            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
        ),
    )
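
As the updated description notes, the pattern is matched against the fully qualified `<database>.<schema>.<tag_name>` string. A quick hedged illustration with DataHub's AllowDenyPattern helper; the database, schema, and tag names below are made up:

from datahub.configuration.common import AllowDenyPattern

# Keep only tags defined under ANALYTICS.PUBLIC, drop everything else.
pattern = AllowDenyPattern(allow=[r"ANALYTICS\.PUBLIC\..*"], deny=[])

print(pattern.allowed("ANALYTICS.PUBLIC.PII"))       # True
print(pattern.allowed("RAW.STAGING.INTERNAL_ONLY"))  # False
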
@@ -384,17 +385,6 @@ class SnowflakeV2Config(
 
         return values
 
-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        return SnowflakeConnectionConfig.get_sql_alchemy_url(
-            self, database=database, username=username, password=password, role=role
-        )
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
@@ -1,4 +1,5 @@
 import logging
+import threading
 from typing import Any, Dict, Optional
 
 import pydantic
@@ -27,7 +28,7 @@ from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthIdentityProvider,
 )
 from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
-from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.utilities.config_clean import (
     remove_protocol,
     remove_suffix,
@@ -192,23 +193,11 @@ class SnowflakeConnectionConfig(ConfigModel):
                 "but should be set when using use_certificate false for oauth_config"
             )
 
-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[pydantic.SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        if username is None:
-            username = self.username
-        if password is None:
-            password = self.password
-        if role is None:
-            role = self.role
+    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         return make_sqlalchemy_uri(
             self.scheme,
-            username,
-            password.get_secret_value() if password else None,
+            self.username,
+            self.password.get_secret_value() if self.password else None,
             self.account_id,
             f'"{database}"' if database is not None else database,
             uri_opts={
@@ -217,7 +206,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 for (key, value) in {
                     "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
                     "warehouse": self.warehouse,
-                    "role": role,
+                    "role": self.role,
                     "application": _APPLICATION_NAME,
                 }.items()
                 if value
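
For reference, the kind of URL that get_sql_alchemy_url assembles can be sketched with SQLAlchemy's own URL builder. The account, user, and option values below are placeholders, and this is an illustration of the resulting URI shape rather than DataHub's actual make_sqlalchemy_uri helper:

from sqlalchemy.engine import URL

# Placeholder values; a real config would pull these from SnowflakeConnectionConfig.
url = URL.create(
    drivername="snowflake",
    username="INGEST_USER",
    password="s3cret",  # example only
    host="my_account_id",
    database="MY_DB",
    query={
        "authenticator": "SNOWFLAKE",
        "warehouse": "COMPUTE_WH",
        "role": "INGEST_ROLE",
        "application": "acryl_datahub",
    },
)
print(url.render_as_string(hide_password=False))
# e.g. snowflake://INGEST_USER:s3cret@my_account_id/MY_DB?application=acryl_datahub&...
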
@@ -402,13 +391,30 @@ class SnowflakeConnection(Closeable):
     def __init__(self, connection: NativeSnowflakeConnection):
         self._connection = connection
 
+        self._query_num_lock = threading.Lock()
+        self._query_num = 1
+
     def native_connection(self) -> NativeSnowflakeConnection:
         return self._connection
 
+    def get_query_no(self) -> int:
+        with self._query_num_lock:
+            no = self._query_num
+            self._query_num += 1
+            return no
+
     def query(self, query: str) -> Any:
         try:
-            logger.info(f"Query: {query}", stacklevel=2)
+            # We often run multiple queries in parallel across multiple threads,
+            # so we need to number them to help with log readability.
+            query_num = self.get_query_no()
+            logger.info(f"Query #{query_num}: {query}", stacklevel=2)
             resp = self._connection.cursor(DictCursor).execute(query)
+            if resp is not None and resp.rowcount is not None:
+                logger.info(
+                    f"Query #{query_num} got {resp.rowcount} row(s) back from Snowflake",
+                    stacklevel=2,
+                )
             return resp
 
         except Exception as e:
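
The lock-guarded counter added above is a standard way to hand out unique sequence numbers across threads. A minimal standalone sketch of the same idea; the class and method names are illustrative, not part of the connector:

import threading
from concurrent.futures import ThreadPoolExecutor

class QueryNumberer:
    """Hands out monotonically increasing ids, safe to call from many threads."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._next = 1

    def next_no(self) -> int:
        with self._lock:
            no = self._next
            self._next += 1
            return no

numberer = QueryNumberer()
with ThreadPoolExecutor(max_workers=8) as pool:
    ids = list(pool.map(lambda _: numberer.next_no(), range(100)))

assert len(set(ids)) == 100  # no duplicates even under concurrency
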
@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
     ) -> "DatahubGEProfiler":
         assert db_name
 
-        url = self.config.get_sql_alchemy_url(
-            database=db_name,
-            username=self.config.username,
-            password=self.config.password,
-            role=self.config.role,
-        )
+        url = self.config.get_sql_alchemy_url(database=db_name)
 
         logger.debug(f"sql_alchemy_url={url}")
 
@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
                 # here
                 query_id=get_query_fingerprint(
-                    res["query_text"], self.identifiers.platform, fast=True
+                    res["query_text"],
+                    self.identifiers.platform,
+                    fast=True,
+                    secondary_id=res["query_secondary_fingerprint"],
                 ),
                 query_text=res["query_text"],
                 upstreams=upstreams,
@@ -654,7 +657,17 @@ WITH
 fingerprinted_queries as (
     SELECT *,
         -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END as query_secondary_fingerprint
     FROM
         snowflake.account_usage.query_history
     WHERE
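
The REGEXP_SUBSTR calls above pull fields out of the JSON blob that Hex appends after a '-- Hex query metadata:' comment marker. The same extraction can be sketched in Python for clarity; the sample query text and field values are made up, and this is an illustration, not DataHub's parser:

import re
from typing import Optional

HEX_MARKER = "-- Hex query metadata:"

def extract_hex_field(query_text: str, key: str) -> Optional[str]:
    """Return the string value of `key` from the Hex metadata comment, if present."""
    if HEX_MARKER not in query_text:
        return None
    match = re.search(rf'"{key}"\s*:\s*"([^"]+)"', query_text)
    return match.group(1) if match else None

sample = 'SELECT 1;\n-- Hex query metadata: {"project_id": "abc-123", "context": "SCHEDULED_RUN"}'
print(extract_hex_field(sample, "project_id"))  # abc-123
print(extract_hex_field(sample, "context"))     # SCHEDULED_RUN
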
@@ -670,11 +683,11 @@ fingerprinted_queries as (
             {time_bucket_size},
             CONVERT_TIMEZONE('UTC', start_time)
         ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
     FROM
         fingerprinted_queries
     QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
 )
 , raw_access_history AS (
     SELECT
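
Adding query_secondary_fingerprint to both window partitions means deduplication now happens per (time bucket, fingerprint, secondary fingerprint), keeping only the most recent row of each group. The same keep-latest-per-key logic, sketched in plain Python with made-up rows:

from typing import Dict, List, Tuple

rows: List[dict] = [
    {"bucket": "2025-01-01T00", "fingerprint": "f1", "secondary": None, "start_time": 1},
    {"bucket": "2025-01-01T00", "fingerprint": "f1", "secondary": None, "start_time": 5},
    {"bucket": "2025-01-01T00", "fingerprint": "f1", "secondary": "hex-abc", "start_time": 3},
]

latest: Dict[Tuple, dict] = {}
counts: Dict[Tuple, int] = {}
for row in rows:
    key = (row["bucket"], row["fingerprint"], row["secondary"])
    counts[key] = counts.get(key, 0) + 1
    # Keep the most recent row per key, mirroring ROW_NUMBER() ... ORDER BY start_time DESC = 1.
    if key not in latest or row["start_time"] > latest[key]["start_time"]:
        latest[key] = row

# Two distinct groups survive: the plain query and the Hex-tagged variant.
assert len(latest) == 2
assert counts[("2025-01-01T00", "f1", None)] == 2
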
@@ -714,6 +727,7 @@ fingerprinted_queries as (
     q.bucket_start_time,
     q.query_id,
     q.query_fingerprint,
+    q.query_secondary_fingerprint,
     q.query_count,
     q.session_id AS "SESSION_ID",
     q.start_time AS "QUERY_START_TIME",
@@ -71,14 +71,6 @@ class SnowflakeQuery:
     def current_warehouse() -> str:
         return "select CURRENT_WAREHOUSE()"
 
-    @staticmethod
-    def current_database() -> str:
-        return "select CURRENT_DATABASE()"
-
-    @staticmethod
-    def current_schema() -> str:
-        return "select CURRENT_SCHEMA()"
-
     @staticmethod
     def show_databases() -> str:
         return "show databases"
@@ -107,8 +99,8 @@ class SnowflakeQuery:
         order by database_name"""
 
     @staticmethod
-    def schemas_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def schemas_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
         SELECT schema_name AS "SCHEMA_NAME",
             created AS "CREATED",
@@ -119,8 +111,8 @@ class SnowflakeQuery:
         order by schema_name"""
 
     @staticmethod
-    def tables_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def tables_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
        SELECT table_catalog AS "TABLE_CATALOG",
            table_schema AS "TABLE_SCHEMA",
@@ -142,8 +134,8 @@ class SnowflakeQuery:
         order by table_schema, table_name"""
 
     @staticmethod
-    def tables_for_schema(schema_name: str, db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def tables_for_schema(schema_name: str, db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
        SELECT table_catalog AS "TABLE_CATALOG",
            table_schema AS "TABLE_SCHEMA",
@@ -165,8 +157,8 @@ class SnowflakeQuery:
         order by table_schema, table_name"""
 
     @staticmethod
-    def procedures_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def procedures_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
        SELECT procedure_catalog AS "PROCEDURE_CATALOG",
            procedure_schema AS "PROCEDURE_SCHEMA",
@@ -382,26 +374,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         ORDER BY query_start_time DESC
         ;"""
 
-    @staticmethod
-    def view_dependencies() -> str:
-        return """
-        SELECT
-            concat(
-                referenced_database, '.', referenced_schema,
-                '.', referenced_object_name
-            ) AS "VIEW_UPSTREAM",
-            referenced_object_domain as "REFERENCED_OBJECT_DOMAIN",
-            concat(
-                referencing_database, '.', referencing_schema,
-                '.', referencing_object_name
-            ) AS "DOWNSTREAM_VIEW",
-            referencing_object_domain AS "REFERENCING_OBJECT_DOMAIN"
-        FROM
-            snowflake.account_usage.object_dependencies
-        WHERE
-            referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
-        """
-
     # Note on use of `upstreams_deny_pattern` to ignore temporary tables:
     # Snowflake access history may include temporary tables in DIRECT_OBJECTS_ACCESSED and
     # OBJECTS_MODIFIED->columns->directSources. We do not need these temporary tables and filter these in the query.
@@ -425,32 +397,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
             upstreams_deny_pattern,
         )
 
-    @staticmethod
-    def view_dependencies_v2() -> str:
-        return """
-        SELECT
-            ARRAY_UNIQUE_AGG(
-                OBJECT_CONSTRUCT(
-                    'upstream_object_name', concat(
-                        referenced_database, '.', referenced_schema,
-                        '.', referenced_object_name
-                    ),
-                    'upstream_object_domain', referenced_object_domain
-                )
-            ) as "UPSTREAM_TABLES",
-            concat(
-                referencing_database, '.', referencing_schema,
-                '.', referencing_object_name
-            ) AS "DOWNSTREAM_TABLE_NAME",
-            ANY_VALUE(referencing_object_domain) AS "DOWNSTREAM_TABLE_DOMAIN"
-        FROM
-            snowflake.account_usage.object_dependencies
-        WHERE
-            referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
-        GROUP BY
-            DOWNSTREAM_TABLE_NAME
-        """
-
     @staticmethod
     def show_external_tables() -> str:
         return "show external tables in account"
@@ -1000,4 +946,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         from_clause = (
             f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
         )
-        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+        return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.structured import (
     StructuredPropertyDefinition,
 )
+from datahub.metadata.schema_classes import ChangeTypeClass
 from datahub.metadata.urns import (
     ContainerUrn,
     DatasetUrn,
@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
     def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
         for tag in self.data_dictionary.get_all_tags():
             if not self.config.structured_property_pattern.allowed(
-                tag.tag_identifier()
+                tag._id_prefix_as_str()
             ):
                 continue
             if self.config.extract_tags_as_structured_properties:
@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
             yield MetadataChangeProposalWrapper(
                 entityUrn=urn,
                 aspect=aspect,
+                changeType=ChangeTypeClass.CREATE,
+                headers={"If-None-Match": "*"},
             ).as_workunit()
 
     def _get_tags_on_object_with_propagation(
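
The combination of changeType CREATE and an If-None-Match: * header above asks the server to write the aspect only when it does not already exist. A hedged sketch of emitting such a proposal with the SDK; the GMS endpoint, dataset URN, and properties are placeholders, and exact conditional-write behaviour depends on the DataHub server version:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass

# Placeholder endpoint; requires a reachable DataHub GMS instance.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,MY_DB.MY_SCHEMA.MY_TABLE,PROD)",
    aspect=DatasetPropertiesClass(description="created only if absent"),
    changeType=ChangeTypeClass.CREATE,
    headers={"If-None-Match": "*"},  # ask the server to skip the write if the aspect exists
)
emitter.emit(mcp)
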
@@ -35,13 +35,14 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
 )
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
@@ -145,7 +145,11 @@ class ClickHouseConfig(
     )
     include_materialized_views: Optional[bool] = Field(default=True, description="")
 
-    def get_sql_alchemy_url(self, current_db=None):
+    def get_sql_alchemy_url(
+        self,
+        uri_opts: Optional[Dict[str, Any]] = None,
+        current_db: Optional[str] = None,
+    ) -> str:
         url = make_url(
             super().get_sql_alchemy_url(uri_opts=self.uri_opts, current_db=current_db)
         )
@@ -1,4 +1,6 @@
 # This import verifies that the dependencies are available.
+from typing import Any, Dict, Optional
+
 import pydruid  # noqa: F401
 from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect
@@ -38,8 +40,11 @@ class DruidConfig(BasicSQLAlchemyConfig):
         description="regex patterns for schemas to filter in ingestion.",
     )
 
-    def get_sql_alchemy_url(self):
-        return f"{super().get_sql_alchemy_url()}/druid/v2/sql/"
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        base_url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+        return f"{base_url}/druid/v2/sql/"
 
     """
     The pydruid library already formats the table name correctly, so we do not