acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (120)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py

@@ -84,13 +84,14 @@ class DataResolverBase(ABC):
         tenant_id: str,
         metadata_api_timeout: int,
     ):
-        self.__access_token: Optional[str] = None
-        self.__access_token_expiry_time: Optional[datetime] = None
-        self.__tenant_id = tenant_id
+        self._access_token: Optional[str] = None
+        self._access_token_expiry_time: Optional[datetime] = None
+
+        self._tenant_id = tenant_id
         # Test connection by generating access token
         logger.info(f"Trying to connect to {self._get_authority_url()}")
         # Power-Bi Auth (Service Principal Auth)
-        self.__msal_client = msal.ConfidentialClientApplication(
+        self._msal_client = msal.ConfidentialClientApplication(
             client_id,
             client_credential=client_secret,
             authority=DataResolverBase.AUTHORITY + tenant_id,
@@ -168,18 +169,18 @@ class DataResolverBase(ABC):
         pass

     def _get_authority_url(self):
-        return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}"
+        return f"{DataResolverBase.AUTHORITY}{self._tenant_id}"

     def get_authorization_header(self):
         return {Constant.Authorization: self.get_access_token()}

-    def get_access_token(self):
-        if self.__access_token is not None and not self._is_access_token_expired():
-            return self.__access_token
+    def get_access_token(self) -> str:
+        if self._access_token is not None and not self._is_access_token_expired():
+            return self._access_token

         logger.info("Generating PowerBi access token")

-        auth_response = self.__msal_client.acquire_token_for_client(
+        auth_response = self._msal_client.acquire_token_for_client(
             scopes=[DataResolverBase.SCOPE]
         )

@@ -193,24 +194,24 @@ class DataResolverBase(ABC):

         logger.info("Generated PowerBi access token")

-        self.__access_token = "Bearer {}".format(
+        self._access_token = "Bearer {}".format(
             auth_response.get(Constant.ACCESS_TOKEN)
         )
         safety_gap = 300
-        self.__access_token_expiry_time = datetime.now() + timedelta(
+        self._access_token_expiry_time = datetime.now() + timedelta(
             seconds=(
                 max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0)
             )
         )

-        logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}")
+        logger.debug(f"{Constant.PBIAccessToken}={self._access_token}")

-        return self.__access_token
+        return self._access_token

     def _is_access_token_expired(self) -> bool:
-        if not self.__access_token_expiry_time:
+        if not self._access_token_expiry_time:
             return True
-        return self.__access_token_expiry_time < datetime.now()
+        return self._access_token_expiry_time < datetime.now()

     def get_dashboards(self, workspace: Workspace) -> List[Dashboard]:
         """
datahub/ingestion/source/s3/source.py

@@ -225,7 +225,7 @@ class S3Source(StatefulIngestionSourceBase):
         self.init_spark()

     def init_spark(self):
-        os.environ.setdefault("SPARK_VERSION", "3.3")
+        os.environ.setdefault("SPARK_VERSION", "3.5")
         spark_version = os.environ["SPARK_VERSION"]

         # Importing here to avoid Deequ dependency for non profiling use cases
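Only the default changes here; os.environ.setdefault never overwrites a value the user has already exported. A small sketch of that behavior (values are illustrative):

    import os

    os.environ["SPARK_VERSION"] = "3.3"            # explicit override set before ingestion
    os.environ.setdefault("SPARK_VERSION", "3.5")  # the new default does not clobber it
    print(os.environ["SPARK_VERSION"])             # "3.3"

    del os.environ["SPARK_VERSION"]
    os.environ.setdefault("SPARK_VERSION", "3.5")  # nothing set, so the default applies
    print(os.environ["SPARK_VERSION"])             # "3.5"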
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig(
         description="Whether to convert dataset urns to lowercase.",
     )

-
-class SnowflakeUsageConfig(BaseUsageConfig):
     email_domain: Optional[str] = pydantic.Field(
         default=None,
         description="Email domain of your organization so users can be displayed on UI appropriately.",
     )
+
+    email_as_user_identifier: bool = Field(
+        default=True,
+        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
+        "provided, generates email addresses for snowflake users with unset emails, based on their "
+        "username.",
+    )
+
+
+class SnowflakeUsageConfig(BaseUsageConfig):
     apply_view_usage_to_tables: bool = pydantic.Field(
         default=False,
         description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.",
@@ -163,26 +171,13 @@ class SnowflakeConfig(
         default=True,
         description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.",
     )
-    include_view_lineage: bool = pydantic.Field(
-        default=True,
-        description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.",
-    )
+
+    _include_view_lineage = pydantic_removed_field("include_view_lineage")
+    _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")

     ignore_start_time_lineage: bool = False
     upstream_lineage_in_report: bool = False

-    @pydantic.root_validator(skip_on_failure=True)
-    def validate_include_view_lineage(cls, values):
-        if (
-            "include_table_lineage" in values
-            and not values.get("include_table_lineage")
-            and values.get("include_view_lineage")
-        ):
-            raise ValueError(
-                "include_table_lineage must be True for include_view_lineage to be set."
-            )
-        return values
-

 class SnowflakeV2Config(
     SnowflakeConfig,
@@ -222,11 +217,6 @@ class SnowflakeV2Config(
         description="Populates table->table and view->table column lineage. Requires appropriate grants given to the role and the Snowflake Enterprise Edition or above.",
     )

-    include_view_column_lineage: bool = Field(
-        default=True,
-        description="Populates view->view and table->view column lineage using DataHub's sql parser.",
-    )
-
     use_queries_v2: bool = Field(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
@@ -285,13 +275,6 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

-    email_as_user_identifier: bool = Field(
-        default=True,
-        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
-        "provided, generates email addresses for snowflake users with unset emails, based on their "
-        "username.",
-    )
-
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"
@@ -355,10 +338,6 @@ class SnowflakeV2Config(
             self, database=database, username=username, password=password, role=role
         )

-    @property
-    def parse_view_ddl(self) -> bool:
-        return self.include_view_column_lineage
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
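The snowflake_config.py hunks retire include_view_lineage and include_view_column_lineage as configuration flags and move email_domain plus the new email_as_user_identifier flag into SnowflakeIdentifierConfig. The sketch below is not DataHub's pydantic_removed_field helper; it is a generic pydantic v1-style validator illustrating the usual pattern for tolerating, and warning about, a removed key in existing recipes (hypothetical class name):

    import warnings

    import pydantic


    class SnowflakeLikeConfig(pydantic.BaseModel):
        include_table_lineage: bool = True

        @pydantic.root_validator(pre=True)
        def _drop_removed_fields(cls, values: dict) -> dict:
            # Stand-in for a "removed field" helper: accept the old keys so
            # existing recipes still parse, but ignore them with a warning.
            for removed in ("include_view_lineage", "include_view_column_lineage"):
                if removed in values:
                    warnings.warn(f"{removed} is no longer used and will be ignored")
                    values.pop(removed)
            return values


    cfg = SnowflakeLikeConfig.parse_obj({"include_view_lineage": True})
    print(cfg.include_table_lineage)  # True; the removed flag was dropped, not rejected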
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -4,11 +4,10 @@ from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type

-from pydantic import BaseModel, validator
+from pydantic import BaseModel, Field, validator

 from datahub.configuration.datetimes import parse_absolute_time
 from datahub.ingestion.api.closeable import Closeable
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.snowflake.constants import (
     LINEAGE_PERMISSION_ERROR,
@@ -41,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime

@@ -72,8 +72,8 @@ class ColumnUpstreamJob(BaseModel):


 class ColumnUpstreamLineage(BaseModel):
-    column_name: str
-    upstreams: List[ColumnUpstreamJob]
+    column_name: Optional[str]
+    upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)


 class UpstreamTableNode(BaseModel):
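ColumnUpstreamLineage is loosened so that responses missing column_name or upstreams still parse instead of failing validation. A standalone sketch of the same pydantic pattern (simplified field types, not the real model):

    from typing import List, Optional

    from pydantic import BaseModel, Field


    class ColumnUpstreamLineageSketch(BaseModel):
        # The real model leans on pydantic v1's implicit None default for an
        # un-defaulted Optional field; spelling it out keeps this sketch portable.
        column_name: Optional[str] = None
        # default_factory=list turns a missing "upstreams" key into a fresh,
        # per-instance empty list instead of a validation error.
        upstreams: List[str] = Field(default_factory=list)


    row = ColumnUpstreamLineageSketch.parse_obj({})
    print(row.column_name, row.upstreams)  # None []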
@@ -163,11 +163,11 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             self.config.end_time,
         )

-    def get_workunits(
+    def add_time_based_lineage_to_aggregator(
         self,
         discovered_tables: List[str],
         discovered_views: List[str],
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> None:
         if not self._should_ingest_lineage():
             return

@@ -177,9 +177,7 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         # snowflake view/table -> snowflake table
         self.populate_table_upstreams(discovered_tables)

-        for mcp in self.sql_aggregator.gen_metadata():
-            yield mcp.as_workunit()
-
+    def update_state(self):
         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.
             self.redundant_run_skip_handler.update_state(
@@ -242,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)

         known_lineage = KnownQueryLineageInfo(
+            query_id=get_query_fingerprint(
+                query.query_text, self.identifiers.platform, fast=True
+            ),
             query_text=query.query_text,
             downstream=downstream_table_urn,
             upstreams=self.map_query_result_upstreams(
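KnownQueryLineageInfo entries now carry a query_id computed with get_query_fingerprint(..., fast=True), so the same query text maps to the same lineage record across runs. A rough usage sketch; the positional platform argument and fast keyword mirror the call in this diff, but treat the exact signature as an assumption:

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    query = "SELECT col_a, col_b FROM db.schema.orders WHERE col_a > 10"

    fp1 = get_query_fingerprint(query, "snowflake", fast=True)
    fp2 = get_query_fingerprint(query, "snowflake", fast=True)

    # The fingerprint is deterministic, so re-ingesting the same query text
    # yields the same id instead of a fresh aggregator-generated one.
    print(fp1 == fp2)  # True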
@@ -265,64 +266,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         with PerfTimer() as timer:
             self.report.num_external_table_edges_scanned = 0

-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_copy_history(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-            logger.info(
-                "Done populating external lineage from copy history. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
-
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_show_query(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-
-            logger.info(
-                "Done populating external lineage from show external tables. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
+            for entry in self._get_copy_history_lineage(discovered_tables):
+                self.sql_aggregator.add(entry)
+            logger.info("Done populating external lineage from copy history. ")

         self.report.external_lineage_queries_secs = timer.elapsed_seconds()

-    # Handles the case for explicitly created external tables.
-    # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_show_query(
-        self, discovered_tables: List[str]
-    ) -> Iterable[KnownLineageMapping]:
-        external_tables_query: str = SnowflakeQuery.show_external_tables()
-        try:
-            for db_row in self.connection.query(external_tables_query):
-                key = self.identifiers.get_dataset_identifier(
-                    db_row["name"], db_row["schema_name"], db_row["database_name"]
-                )
-
-                if key not in discovered_tables:
-                    continue
-                if db_row["location"].startswith("s3://"):
-                    yield KnownLineageMapping(
-                        upstream_urn=make_s3_urn_for_lineage(
-                            db_row["location"], self.config.env
-                        ),
-                        downstream_urn=self.identifiers.gen_dataset_urn(key),
-                    )
-                    self.report.num_external_table_edges_scanned += 1
-
-                self.report.num_external_table_edges_scanned += 1
-        except Exception as e:
-            logger.debug(e, exc_info=e)
-            self.structured_reporter.warning(
-                "Error populating external table lineage from Snowflake",
-                exc=e,
-            )
-            self.report_status(EXTERNAL_LINEAGE, False)
-
     # Handles the case where a table is populated from an external stage/s3 location via copy.
     # Eg: copy into category_english from @external_s3_stage;
     # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
     # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_copy_history(
+    def _get_copy_history_lineage(
         self, discovered_tables: List[str]
     ) -> Iterable[KnownLineageMapping]:
         query: str = SnowflakeQuery.copy_lineage_history(
@@ -384,10 +338,6 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             start_time_millis=int(self.start_time.timestamp() * 1000),
             end_time_millis=int(self.end_time.timestamp() * 1000),
             upstreams_deny_pattern=self.config.temporary_tables_pattern,
-            # The self.config.include_view_lineage setting is about fetching upstreams of views.
-            # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False.
-            # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code.
-            include_view_lineage=True,
             include_column_lineage=self.config.include_column_lineage,
         )
         try:
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -61,11 +61,17 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
 from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

+# Define a type alias
+UserName = str
+UserEmail = str
+UsersMapping = Dict[UserName, UserEmail]
+

 class SnowflakeQueriesExtractorConfig(ConfigModel):
@@ -114,11 +120,13 @@ class SnowflakeQueriesSourceConfig(
 class SnowflakeQueriesExtractorReport(Report):
     copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
     query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

     audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
     sql_aggregator: Optional[SqlAggregatorReport] = None

     num_ddl_queries_dropped: int = 0
+    num_users: int = 0


 @dataclass
@@ -225,6 +233,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
+        with self.report.users_fetch_timer:
+            users = self.fetch_users()
+
         # TODO: Add some logic to check if the cached audit log is stale or not.
         audit_log_file = self.local_temp_path / "audit_log.sqlite"
         use_cached_audit_log = audit_log_file.exists()
@@ -247,11 +258,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 for entry in self.fetch_copy_history():
                     queries.append(entry)

-            # TODO: Add "show external tables" lineage to the main schema extractor.
-            # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
-
             with self.report.query_log_fetch_timer:
-                for entry in self.fetch_query_log():
+                for entry in self.fetch_query_log(users):
                     queries.append(entry)

         with self.report.audit_log_load_timer:
@@ -266,6 +274,25 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             shared_connection.close()
             audit_log_file.unlink(missing_ok=True)

+    def fetch_users(self) -> UsersMapping:
+        users: UsersMapping = dict()
+        with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
+            logger.info("Fetching users from Snowflake")
+            query = SnowflakeQuery.get_all_users()
+            resp = self.connection.query(query)
+
+            for row in resp:
+                try:
+                    users[row["NAME"]] = row["EMAIL"]
+                    self.report.num_users += 1
+                except Exception as e:
+                    self.structured_reporter.warning(
+                        "Error parsing user row",
+                        context=f"{row}",
+                        exc=e,
+                    )
+        return users
+
     def fetch_copy_history(self) -> Iterable[KnownLineageMapping]:
         # Derived from _populate_external_lineage_from_copy_history.

@@ -301,7 +328,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 yield result

     def fetch_query_log(
-        self,
+        self, users: UsersMapping
     ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
@@ -322,7 +349,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

             assert isinstance(row, dict)
             try:
-                entry = self._parse_audit_log_row(row)
+                entry = self._parse_audit_log_row(row, users)
             except Exception as e:
                 self.structured_reporter.warning(
                     "Error parsing query log row",
@@ -334,7 +361,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 yield entry

     def _parse_audit_log_row(
-        self, row: Dict[str, Any]
+        self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
@@ -433,9 +460,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )

-        # TODO: Fetch email addresses from Snowflake to map user -> email
-        # TODO: Support email_domain fallback for generating user urns.
-        user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"]))
+        user = CorpUserUrn(
+            self.identifiers.get_user_identifier(
+                res["user_name"], users.get(res["user_name"])
+            )
+        )

         timestamp: datetime = res["query_start_time"]
         timestamp = timestamp.astimezone(timezone.utc)
@@ -447,10 +476,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
-            # job at eliminating redundant / repetitive queries. As such, we don't include the fingerprint
-            # here so that the aggregator auto-generates one.
-            # query_id=res["query_fingerprint"],
-            query_id=None,
+            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
+            # here
+            query_id=get_query_fingerprint(
+                res["query_text"], self.identifiers.platform, fast=True
+            ),
             query_text=res["query_text"],
             upstreams=upstreams,
             downstream=downstream,
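The queries extractor now pre-fetches a UsersMapping (login name to email) and passes it down to _parse_audit_log_row, where identifiers.get_user_identifier resolves each row's user. A simplified, self-contained sketch of the lookup-with-fallback idea; the resolver below is illustrative, not the DataHub implementation:

    from typing import Dict, Optional

    UsersMapping = Dict[str, str]  # login name -> email, mirroring the new type alias


    def resolve_user(user_name: str, email: Optional[str]) -> str:
        # Illustrative fallback: prefer the email pulled from ACCOUNT_USAGE.USERS,
        # otherwise keep the lowercased login name.
        return (email or user_name).lower()


    users: UsersMapping = {"ALICE_W": "alice@example.com"}

    for row_user in ("ALICE_W", "SVC_LOADER"):
        print(f"urn:li:corpuser:{resolve_user(row_user, users.get(row_user))}")
    # urn:li:corpuser:alice@example.com
    # urn:li:corpuser:svc_loader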
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -376,7 +376,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     def table_to_table_lineage_history_v2(
         start_time_millis: int,
         end_time_millis: int,
-        include_view_lineage: bool = True,
         include_column_lineage: bool = True,
         upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS,
     ) -> str:
@@ -385,14 +384,12 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
                 start_time_millis,
                 end_time_millis,
                 upstreams_deny_pattern,
-                include_view_lineage,
             )
         else:
             return SnowflakeQuery.table_upstreams_only(
                 start_time_millis,
                 end_time_millis,
                 upstreams_deny_pattern,
-                include_view_lineage,
             )

     @staticmethod
@@ -677,12 +674,9 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         start_time_millis: int,
         end_time_millis: int,
         upstreams_deny_pattern: List[str],
-        include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
             SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER
-            if include_view_lineage
-            else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )

         upstream_sql_filter = create_deny_regex_sql_filter(
@@ -847,12 +841,9 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         start_time_millis: int,
         end_time_millis: int,
         upstreams_deny_pattern: List[str],
-        include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
             SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER
-            if include_view_lineage
-            else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )

         upstream_sql_filter = create_deny_regex_sql_filter(
@@ -956,4 +947,8 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
        AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}'
        ORDER BY MEASUREMENT_TIME ASC;

-        """
+        """
+
+    @staticmethod
+    def get_all_users() -> str:
+        return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
     classification_workunit_processor,
 )
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
 )
 from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
 from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
+from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
 from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.snowflake.snowflake_schema import (
     SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     get_domain_wu,
 )
 from datahub.ingestion.source_report.ingestion_stage import (
+    EXTERNAL_TABLE_DDL_LINEAGE,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
-from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownLineageMapping,
+    SqlParsingAggregator,
+)
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
-        self.aggregator: Optional[SqlParsingAggregator] = aggregator
+
+        self.aggregator = aggregator

     def get_connection(self) -> SnowflakeConnection:
         return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
                 yield from self._process_database(snowflake_db)

+            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
+            discovered_tables: List[str] = [
+                self.identifiers.get_dataset_identifier(
+                    table_name, schema.name, db.name
+                )
+                for db in self.databases
+                for schema in db.schemas
+                for table_name in schema.tables
+            ]
+            if self.aggregator:
+                for entry in self._external_tables_ddl_lineage(discovered_tables):
+                    self.aggregator.add(entry)
+
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
@@ -415,11 +435,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         )

         if self.config.include_views:
-            if (
-                self.aggregator
-                and self.config.include_view_lineage
-                and self.config.parse_view_ddl
-            ):
+            if self.aggregator:
                 for view in views:
                     view_identifier = self.identifiers.get_dataset_identifier(
                         view.name, schema_name, db_name
@@ -1082,3 +1098,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # Access to table but none of its constraints - is this possible ?
         return constraints.get(table_name, [])
+
+    # Handles the case for explicitly created external tables.
+    # NOTE: Snowflake does not log this information to the access_history table.
+    def _external_tables_ddl_lineage(
+        self, discovered_tables: List[str]
+    ) -> Iterable[KnownLineageMapping]:
+        external_tables_query: str = SnowflakeQuery.show_external_tables()
+        try:
+            for db_row in self.connection.query(external_tables_query):
+                key = self.identifiers.get_dataset_identifier(
+                    db_row["name"], db_row["schema_name"], db_row["database_name"]
+                )
+
+                if key not in discovered_tables:
+                    continue
+                if db_row["location"].startswith("s3://"):
+                    yield KnownLineageMapping(
+                        upstream_urn=make_s3_urn_for_lineage(
+                            db_row["location"], self.config.env
+                        ),
+                        downstream_urn=self.identifiers.gen_dataset_urn(key),
+                    )
+                    self.report.num_external_table_edges_scanned += 1
+
+                self.report.num_external_table_edges_scanned += 1
+        except Exception as e:
+            self.structured_reporter.warning(
+                "External table ddl lineage extraction failed",
+                exc=e,
+            )
datahub/ingestion/source/snowflake/snowflake_shares.py

@@ -72,7 +72,7 @@ class SnowflakeSharesHandler(SnowflakeCommonMixin):
             assert len(sibling_dbs) == 1
             # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy
             # hence this lineage code is not written in SnowflakeLineageExtractor
-            # also this is not governed by configs include_table_lineage and include_view_lineage
+            # also this is not governed by configs include_table_lineage
             yield self.get_upstream_lineage_with_primary_sibling(
                 db.name, schema.name, table_name, sibling_dbs[0]
             )
datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -342,10 +342,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             filtered_user_counts.append(
                 DatasetUserUsageCounts(
                     user=make_user_urn(
-                        self.get_user_identifier(
+                        self.identifiers.get_user_identifier(
                             user_count["user_name"],
                             user_email,
-                            self.config.email_as_user_identifier,
                         )
                     ),
                     count=user_count["total"],
@@ -453,9 +452,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         reported_time: int = int(time.time() * 1000)
         last_updated_timestamp: int = int(start_time.timestamp() * 1000)
         user_urn = make_user_urn(
-            self.get_user_identifier(
-                user_name, user_email, self.config.email_as_user_identifier
-            )
+            self.identifiers.get_user_identifier(user_name, user_email)
        )

         # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect
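Both usage code paths now call identifiers.get_user_identifier without passing email_as_user_identifier; that flag, together with email_domain, now lives on the shared identifier config. A conceptual sketch of the selection order described by the config docstrings (hypothetical helper, not the actual method):

    from typing import Optional


    def pick_user_identifier(
        user_name: str,
        user_email: Optional[str],
        email_as_user_identifier: bool = True,
        email_domain: Optional[str] = None,
    ) -> str:
        # 1. Use the Snowflake-provided email when allowed and present.
        if email_as_user_identifier and user_email:
            return user_email.lower()
        # 2. Otherwise synthesize one from the username and email_domain, if configured.
        if email_as_user_identifier and email_domain:
            return f"{user_name}@{email_domain}".lower()
        # 3. Fall back to the bare username.
        return user_name.lower()


    print(pick_user_identifier("ALICE_W", "alice@example.com"))                  # alice@example.com
    print(pick_user_identifier("SVC_LOADER", None, email_domain="example.com"))  # svc_loader@example.com
    print(pick_user_identifier("SVC_LOADER", None))                              # svc_loader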