acryl-datahub 0.15.0.1rc8__py3-none-any.whl → 0.15.0.1rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. More details were linked from the original page (the link is not preserved in this text extract).
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/METADATA +2492 -2492
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/RECORD +18 -18
- datahub/__init__.py +1 -1
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +18 -5
- datahub/ingestion/source/gc/execution_request_cleanup.py +49 -12
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -9
- datahub/ingestion/source/snowflake/snowflake_queries.py +38 -7
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/top_level.txt +0 -0
|
@@ -66,6 +66,11 @@ from datahub.utilities.perf_timer import PerfTimer
|
|
|
66
66
|
|
|
67
67
|
logger = logging.getLogger(__name__)
|
|
68
68
|
|
|
69
|
+
# Define a type alias
|
|
70
|
+
UserName = str
|
|
71
|
+
UserEmail = str
|
|
72
|
+
UsersMapping = Dict[UserName, UserEmail]
|
|
73
|
+
|
|
69
74
|
|
|
70
75
|
class SnowflakeQueriesExtractorConfig(ConfigModel):
|
|
71
76
|
# TODO: Support stateful ingestion for the time windows.
|
|
@@ -114,11 +119,13 @@ class SnowflakeQueriesSourceConfig(
|
|
|
114
119
|
class SnowflakeQueriesExtractorReport(Report):
|
|
115
120
|
copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
116
121
|
query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
122
|
+
users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
117
123
|
|
|
118
124
|
audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
119
125
|
sql_aggregator: Optional[SqlAggregatorReport] = None
|
|
120
126
|
|
|
121
127
|
num_ddl_queries_dropped: int = 0
|
|
128
|
+
num_users: int = 0
|
|
122
129
|
|
|
123
130
|
|
|
124
131
|
@dataclass
|
|
@@ -225,6 +232,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
225
232
|
def get_workunits_internal(
|
|
226
233
|
self,
|
|
227
234
|
) -> Iterable[MetadataWorkUnit]:
|
|
235
|
+
with self.report.users_fetch_timer:
|
|
236
|
+
users = self.fetch_users()
|
|
237
|
+
|
|
228
238
|
# TODO: Add some logic to check if the cached audit log is stale or not.
|
|
229
239
|
audit_log_file = self.local_temp_path / "audit_log.sqlite"
|
|
230
240
|
use_cached_audit_log = audit_log_file.exists()
|
|
@@ -248,7 +258,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
248
258
|
queries.append(entry)
|
|
249
259
|
|
|
250
260
|
with self.report.query_log_fetch_timer:
|
|
251
|
-
for entry in self.fetch_query_log():
|
|
261
|
+
for entry in self.fetch_query_log(users):
|
|
252
262
|
queries.append(entry)
|
|
253
263
|
|
|
254
264
|
with self.report.audit_log_load_timer:
|
|
@@ -263,6 +273,25 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
263
273
|
shared_connection.close()
|
|
264
274
|
audit_log_file.unlink(missing_ok=True)
|
|
265
275
|
|
|
276
|
+
def fetch_users(self) -> UsersMapping:
|
|
277
|
+
users: UsersMapping = dict()
|
|
278
|
+
with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
|
|
279
|
+
logger.info("Fetching users from Snowflake")
|
|
280
|
+
query = SnowflakeQuery.get_all_users()
|
|
281
|
+
resp = self.connection.query(query)
|
|
282
|
+
|
|
283
|
+
for row in resp:
|
|
284
|
+
try:
|
|
285
|
+
users[row["NAME"]] = row["EMAIL"]
|
|
286
|
+
self.report.num_users += 1
|
|
287
|
+
except Exception as e:
|
|
288
|
+
self.structured_reporter.warning(
|
|
289
|
+
"Error parsing user row",
|
|
290
|
+
context=f"{row}",
|
|
291
|
+
exc=e,
|
|
292
|
+
)
|
|
293
|
+
return users
|
|
294
|
+
|
|
266
295
|
def fetch_copy_history(self) -> Iterable[KnownLineageMapping]:
|
|
267
296
|
# Derived from _populate_external_lineage_from_copy_history.
|
|
268
297
|
|
|
@@ -298,7 +327,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
298
327
|
yield result
|
|
299
328
|
|
|
300
329
|
def fetch_query_log(
|
|
301
|
-
self,
|
|
330
|
+
self, users: UsersMapping
|
|
302
331
|
) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
|
|
303
332
|
query_log_query = _build_enriched_query_log_query(
|
|
304
333
|
start_time=self.config.window.start_time,
|
|
@@ -319,7 +348,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
319
348
|
|
|
320
349
|
assert isinstance(row, dict)
|
|
321
350
|
try:
|
|
322
|
-
entry = self._parse_audit_log_row(row)
|
|
351
|
+
entry = self._parse_audit_log_row(row, users)
|
|
323
352
|
except Exception as e:
|
|
324
353
|
self.structured_reporter.warning(
|
|
325
354
|
"Error parsing query log row",
|
|
@@ -331,7 +360,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
331
360
|
yield entry
|
|
332
361
|
|
|
333
362
|
def _parse_audit_log_row(
|
|
334
|
-
self, row: Dict[str, Any]
|
|
363
|
+
self, row: Dict[str, Any], users: UsersMapping
|
|
335
364
|
) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
|
|
336
365
|
json_fields = {
|
|
337
366
|
"DIRECT_OBJECTS_ACCESSED",
|
|
@@ -430,9 +459,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
430
459
|
)
|
|
431
460
|
)
|
|
432
461
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
462
|
+
user = CorpUserUrn(
|
|
463
|
+
self.identifiers.get_user_identifier(
|
|
464
|
+
res["user_name"], users.get(res["user_name"])
|
|
465
|
+
)
|
|
466
|
+
)
|
|
436
467
|
|
|
437
468
|
timestamp: datetime = res["query_start_time"]
|
|
438
469
|
timestamp = timestamp.astimezone(timezone.utc)
|
|
@@ -947,4 +947,8 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
|
|
|
947
947
|
AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}'
|
|
948
948
|
ORDER BY MEASUREMENT_TIME ASC;
|
|
949
949
|
|
|
950
|
-
"""
|
|
950
|
+
"""
|
|
951
|
+
|
|
952
|
+
@staticmethod
|
|
953
|
+
def get_all_users() -> str:
|
|
954
|
+
return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
|
|
@@ -342,10 +342,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
342
342
|
filtered_user_counts.append(
|
|
343
343
|
DatasetUserUsageCounts(
|
|
344
344
|
user=make_user_urn(
|
|
345
|
-
self.get_user_identifier(
|
|
345
|
+
self.identifiers.get_user_identifier(
|
|
346
346
|
user_count["user_name"],
|
|
347
347
|
user_email,
|
|
348
|
-
self.config.email_as_user_identifier,
|
|
349
348
|
)
|
|
350
349
|
),
|
|
351
350
|
count=user_count["total"],
|
|
@@ -453,9 +452,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
453
452
|
reported_time: int = int(time.time() * 1000)
|
|
454
453
|
last_updated_timestamp: int = int(start_time.timestamp() * 1000)
|
|
455
454
|
user_urn = make_user_urn(
|
|
456
|
-
self.get_user_identifier(
|
|
457
|
-
user_name, user_email, self.config.email_as_user_identifier
|
|
458
|
-
)
|
|
455
|
+
self.identifiers.get_user_identifier(user_name, user_email)
|
|
459
456
|
)
|
|
460
457
|
|
|
461
458
|
# NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect
|
|
@@ -300,6 +300,28 @@ class SnowflakeIdentifierBuilder:
|
|
|
300
300
|
def get_quoted_identifier_for_table(db_name, schema_name, table_name):
|
|
301
301
|
return f'"{db_name}"."{schema_name}"."{table_name}"'
|
|
302
302
|
|
|
303
|
+
# Note - decide how to construct user urns.
|
|
304
|
+
# Historically urns were created using part before @ from user's email.
|
|
305
|
+
# Users without email were skipped from both user entries as well as aggregates.
|
|
306
|
+
# However email is not mandatory field in snowflake user, user_name is always present.
|
|
307
|
+
def get_user_identifier(
|
|
308
|
+
self,
|
|
309
|
+
user_name: str,
|
|
310
|
+
user_email: Optional[str],
|
|
311
|
+
) -> str:
|
|
312
|
+
if user_email:
|
|
313
|
+
return self.snowflake_identifier(
|
|
314
|
+
user_email
|
|
315
|
+
if self.identifier_config.email_as_user_identifier is True
|
|
316
|
+
else user_email.split("@")[0]
|
|
317
|
+
)
|
|
318
|
+
return self.snowflake_identifier(
|
|
319
|
+
f"{user_name}@{self.identifier_config.email_domain}"
|
|
320
|
+
if self.identifier_config.email_as_user_identifier is True
|
|
321
|
+
and self.identifier_config.email_domain is not None
|
|
322
|
+
else user_name
|
|
323
|
+
)
|
|
324
|
+
|
|
303
325
|
|
|
304
326
|
class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
|
|
305
327
|
platform = "snowflake"
|
|
@@ -315,24 +337,6 @@ class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
|
|
|
315
337
|
def identifiers(self) -> SnowflakeIdentifierBuilder:
|
|
316
338
|
return SnowflakeIdentifierBuilder(self.config, self.report)
|
|
317
339
|
|
|
318
|
-
# Note - decide how to construct user urns.
|
|
319
|
-
# Historically urns were created using part before @ from user's email.
|
|
320
|
-
# Users without email were skipped from both user entries as well as aggregates.
|
|
321
|
-
# However email is not mandatory field in snowflake user, user_name is always present.
|
|
322
|
-
def get_user_identifier(
|
|
323
|
-
self,
|
|
324
|
-
user_name: str,
|
|
325
|
-
user_email: Optional[str],
|
|
326
|
-
email_as_user_identifier: bool,
|
|
327
|
-
) -> str:
|
|
328
|
-
if user_email:
|
|
329
|
-
return self.identifiers.snowflake_identifier(
|
|
330
|
-
user_email
|
|
331
|
-
if email_as_user_identifier is True
|
|
332
|
-
else user_email.split("@")[0]
|
|
333
|
-
)
|
|
334
|
-
return self.identifiers.snowflake_identifier(user_name)
|
|
335
|
-
|
|
336
340
|
# TODO: Revisit this after stateful ingestion can commit checkpoint
|
|
337
341
|
# for failures that do not affect the checkpoint
|
|
338
342
|
# TODO: Add additional parameters to match the signature of the .warning and .failure methods
|
|
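{acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/WHEEL
RENAMED
|
File without changes
|
{acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/top_level.txt
RENAMED
|
File without changes
|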