acryl-datahub 0.15.0.1rc8__py3-none-any.whl → 0.15.0.1rc10__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -66,6 +66,11 @@ from datahub.utilities.perf_timer import PerfTimer
66
66
 
67
67
  logger = logging.getLogger(__name__)
68
68
 
69
+ # Define a type alias
70
+ UserName = str
71
+ UserEmail = str
72
+ UsersMapping = Dict[UserName, UserEmail]
73
+
69
74
 
70
75
  class SnowflakeQueriesExtractorConfig(ConfigModel):
71
76
  # TODO: Support stateful ingestion for the time windows.
@@ -114,11 +119,13 @@ class SnowflakeQueriesSourceConfig(
114
119
  class SnowflakeQueriesExtractorReport(Report):
115
120
  copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
116
121
  query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
122
+ users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
117
123
 
118
124
  audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
119
125
  sql_aggregator: Optional[SqlAggregatorReport] = None
120
126
 
121
127
  num_ddl_queries_dropped: int = 0
128
+ num_users: int = 0
122
129
 
123
130
 
124
131
  @dataclass
@@ -225,6 +232,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
225
232
  def get_workunits_internal(
226
233
  self,
227
234
  ) -> Iterable[MetadataWorkUnit]:
235
+ with self.report.users_fetch_timer:
236
+ users = self.fetch_users()
237
+
228
238
  # TODO: Add some logic to check if the cached audit log is stale or not.
229
239
  audit_log_file = self.local_temp_path / "audit_log.sqlite"
230
240
  use_cached_audit_log = audit_log_file.exists()
@@ -248,7 +258,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
248
258
  queries.append(entry)
249
259
 
250
260
  with self.report.query_log_fetch_timer:
251
- for entry in self.fetch_query_log():
261
+ for entry in self.fetch_query_log(users):
252
262
  queries.append(entry)
253
263
 
254
264
  with self.report.audit_log_load_timer:
@@ -263,6 +273,25 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
263
273
  shared_connection.close()
264
274
  audit_log_file.unlink(missing_ok=True)
265
275
 
276
+ def fetch_users(self) -> UsersMapping:
277
+ users: UsersMapping = dict()
278
+ with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
279
+ logger.info("Fetching users from Snowflake")
280
+ query = SnowflakeQuery.get_all_users()
281
+ resp = self.connection.query(query)
282
+
283
+ for row in resp:
284
+ try:
285
+ users[row["NAME"]] = row["EMAIL"]
286
+ self.report.num_users += 1
287
+ except Exception as e:
288
+ self.structured_reporter.warning(
289
+ "Error parsing user row",
290
+ context=f"{row}",
291
+ exc=e,
292
+ )
293
+ return users
294
+
266
295
  def fetch_copy_history(self) -> Iterable[KnownLineageMapping]:
267
296
  # Derived from _populate_external_lineage_from_copy_history.
268
297
 
@@ -298,7 +327,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
298
327
  yield result
299
328
 
300
329
  def fetch_query_log(
301
- self,
330
+ self, users: UsersMapping
302
331
  ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
303
332
  query_log_query = _build_enriched_query_log_query(
304
333
  start_time=self.config.window.start_time,
@@ -319,7 +348,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
319
348
 
320
349
  assert isinstance(row, dict)
321
350
  try:
322
- entry = self._parse_audit_log_row(row)
351
+ entry = self._parse_audit_log_row(row, users)
323
352
  except Exception as e:
324
353
  self.structured_reporter.warning(
325
354
  "Error parsing query log row",
@@ -331,7 +360,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
331
360
  yield entry
332
361
 
333
362
  def _parse_audit_log_row(
334
- self, row: Dict[str, Any]
363
+ self, row: Dict[str, Any], users: UsersMapping
335
364
  ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
336
365
  json_fields = {
337
366
  "DIRECT_OBJECTS_ACCESSED",
@@ -430,9 +459,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
430
459
  )
431
460
  )
432
461
 
433
- # TODO: Fetch email addresses from Snowflake to map user -> email
434
- # TODO: Support email_domain fallback for generating user urns.
435
- user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"]))
462
+ user = CorpUserUrn(
463
+ self.identifiers.get_user_identifier(
464
+ res["user_name"], users.get(res["user_name"])
465
+ )
466
+ )
436
467
 
437
468
  timestamp: datetime = res["query_start_time"]
438
469
  timestamp = timestamp.astimezone(timezone.utc)
@@ -947,4 +947,8 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
947
947
  AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}'
948
948
  ORDER BY MEASUREMENT_TIME ASC;
949
949
 
950
- """
950
+ """
951
+
952
+ @staticmethod
953
+ def get_all_users() -> str:
954
+ return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
@@ -342,10 +342,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
342
342
  filtered_user_counts.append(
343
343
  DatasetUserUsageCounts(
344
344
  user=make_user_urn(
345
- self.get_user_identifier(
345
+ self.identifiers.get_user_identifier(
346
346
  user_count["user_name"],
347
347
  user_email,
348
- self.config.email_as_user_identifier,
349
348
  )
350
349
  ),
351
350
  count=user_count["total"],
@@ -453,9 +452,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
453
452
  reported_time: int = int(time.time() * 1000)
454
453
  last_updated_timestamp: int = int(start_time.timestamp() * 1000)
455
454
  user_urn = make_user_urn(
456
- self.get_user_identifier(
457
- user_name, user_email, self.config.email_as_user_identifier
458
- )
455
+ self.identifiers.get_user_identifier(user_name, user_email)
459
456
  )
460
457
 
461
458
  # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect
@@ -300,6 +300,28 @@ class SnowflakeIdentifierBuilder:
300
300
  def get_quoted_identifier_for_table(db_name, schema_name, table_name):
301
301
  return f'"{db_name}"."{schema_name}"."{table_name}"'
302
302
 
303
+ # Note - decide how to construct user urns.
304
+ # Historically urns were created using part before @ from user's email.
305
+ # Users without email were skipped from both user entries as well as aggregates.
306
+ # However email is not mandatory field in snowflake user, user_name is always present.
307
+ def get_user_identifier(
308
+ self,
309
+ user_name: str,
310
+ user_email: Optional[str],
311
+ ) -> str:
312
+ if user_email:
313
+ return self.snowflake_identifier(
314
+ user_email
315
+ if self.identifier_config.email_as_user_identifier is True
316
+ else user_email.split("@")[0]
317
+ )
318
+ return self.snowflake_identifier(
319
+ f"{user_name}@{self.identifier_config.email_domain}"
320
+ if self.identifier_config.email_as_user_identifier is True
321
+ and self.identifier_config.email_domain is not None
322
+ else user_name
323
+ )
324
+
303
325
 
304
326
  class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
305
327
  platform = "snowflake"
@@ -315,24 +337,6 @@ class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
315
337
  def identifiers(self) -> SnowflakeIdentifierBuilder:
316
338
  return SnowflakeIdentifierBuilder(self.config, self.report)
317
339
 
318
- # Note - decide how to construct user urns.
319
- # Historically urns were created using part before @ from user's email.
320
- # Users without email were skipped from both user entries as well as aggregates.
321
- # However email is not mandatory field in snowflake user, user_name is always present.
322
- def get_user_identifier(
323
- self,
324
- user_name: str,
325
- user_email: Optional[str],
326
- email_as_user_identifier: bool,
327
- ) -> str:
328
- if user_email:
329
- return self.identifiers.snowflake_identifier(
330
- user_email
331
- if email_as_user_identifier is True
332
- else user_email.split("@")[0]
333
- )
334
- return self.identifiers.snowflake_identifier(user_name)
335
-
336
340
  # TODO: Revisit this after stateful ingestion can commit checkpoint
337
341
  # for failures that do not affect the checkpoint
338
342
  # TODO: Add additional parameters to match the signature of the .warning and .failure methods
@@ -42,4 +42,5 @@ class IngestionStageReport:
42
42
  self._timer = PerfTimer()
43
43
 
44
44
  self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
45
+ logger.info(f"Stage started: {self.ingestion_stage}")
45
46
  self._timer.start()