acryl-datahub 1.0.0.3rc5__py3-none-any.whl → 1.0.0.3rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -32,6 +32,7 @@ from datahub.metadata.urns import (
32
32
  SchemaFieldUrn,
33
33
  Urn,
34
34
  )
35
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
35
36
  from datahub.sql_parsing.schema_resolver import (
36
37
  SchemaResolver,
37
38
  SchemaResolverInterface,
@@ -49,7 +50,6 @@ from datahub.sql_parsing.sqlglot_lineage import (
49
50
  )
50
51
  from datahub.sql_parsing.sqlglot_utils import (
51
52
  _parse_statement,
52
- generate_hash,
53
53
  get_query_fingerprint,
54
54
  try_format_query,
55
55
  )
@@ -155,6 +155,47 @@ class QueryMetadata:
155
155
  actor=(self.actor or _DEFAULT_USER_URN).urn(),
156
156
  )
157
157
 
158
def get_subjects(
    self,
    downstream_urn: Optional[str],
    include_fields: bool,
) -> List[UrnStr]:
    """Collect the URNs that should be listed as subjects of this query.

    Every upstream of the query is a subject, followed by the downstream
    (when one is given). With ``include_fields`` set, schema-field URNs are
    added as well: one per used column of each upstream (columns visited in
    sorted order) and one per downstream column found in the column lineage.

    Args:
        downstream_urn: URN of the downstream entity; skipped when falsy.
        include_fields: Whether schema-field URNs are emitted in addition to
            the entity URNs.

    Returns:
        The collected URNs as a list. They are gathered in an ``OrderedSet``,
        so repeated URNs appear only once.
    """
    subjects = OrderedSet[UrnStr]()

    for upstream_urn in self.upstreams:
        subjects.add(upstream_urn)
        if not include_fields:
            continue
        # Sorting keeps the per-upstream field order stable across runs.
        used_columns = sorted(self.column_usage.get(upstream_urn, []))
        for column_name in used_columns:
            field_urn = builder.make_schema_field_urn(upstream_urn, column_name)
            subjects.add(field_urn)

    # Without a downstream there is nothing further to add.
    if not downstream_urn:
        return list(subjects)

    subjects.add(downstream_urn)
    if include_fields:
        for lineage_entry in self.column_lineage:
            field_urn = builder.make_schema_field_urn(
                downstream_urn, lineage_entry.downstream.column
            )
            subjects.add(field_urn)

    return list(subjects)
181
+
182
def make_query_properties(self) -> models.QueryPropertiesClass:
    """Build the ``QueryPropertiesClass`` aspect describing this query.

    The statement carries the formatted query text tagged as SQL, the source
    is always ``SYSTEM``, and the created / last-modified audit stamps come
    from this object's own stamp helpers.

    Returns:
        A ``models.QueryPropertiesClass`` populated from this query's fields,
        including its origin URN when an origin is recorded.
    """
    return models.QueryPropertiesClass(
        statement=models.QueryStatementClass(
            value=self.formatted_query_string,
            language=models.QueryLanguageClass.SQL,
        ),
        source=models.QuerySourceClass.SYSTEM,
        created=self.make_created_audit_stamp(),
        lastModified=self.make_last_modified_audit_stamp(),
        # The inline aspect construction in SqlParsingAggregator forwarded the
        # query's origin; it must keep being emitted now that the logic lives
        # here, otherwise the origin link is silently dropped.
        origin=self.origin.urn() if self.origin else None,
    )
192
+
193
+
194
def make_query_subjects(urns: List[UrnStr]) -> models.QuerySubjectsClass:
    """Wrap a list of URNs into a ``QuerySubjectsClass`` aspect.

    Args:
        urns: URNs to list as query subjects, in the order they should appear.

    Returns:
        A ``models.QuerySubjectsClass`` holding one ``QuerySubjectClass`` per
        input URN, in input order.
    """
    subject_entries = []
    for subject_urn in urns:
        subject_entries.append(models.QuerySubjectClass(entity=subject_urn))
    return models.QuerySubjectsClass(subjects=subject_entries)
198
+
158
199
 
159
200
  @dataclasses.dataclass
160
201
  class KnownQueryLineageInfo:
@@ -1440,42 +1481,15 @@ class SqlParsingAggregator(Closeable):
1440
1481
  self.report.num_queries_skipped_due_to_filters += 1
1441
1482
  return
1442
1483
 
1443
- query_subject_urns = OrderedSet[UrnStr]()
1444
- for upstream in query.upstreams:
1445
- query_subject_urns.add(upstream)
1446
- if self.generate_query_subject_fields:
1447
- for column in sorted(query.column_usage.get(upstream, [])):
1448
- query_subject_urns.add(
1449
- builder.make_schema_field_urn(upstream, column)
1450
- )
1451
- if downstream_urn:
1452
- query_subject_urns.add(downstream_urn)
1453
- if self.generate_query_subject_fields:
1454
- for column_lineage in query.column_lineage:
1455
- query_subject_urns.add(
1456
- builder.make_schema_field_urn(
1457
- downstream_urn, column_lineage.downstream.column
1458
- )
1459
- )
1460
-
1461
1484
  yield from MetadataChangeProposalWrapper.construct_many(
1462
1485
  entityUrn=self._query_urn(query_id),
1463
1486
  aspects=[
1464
- models.QueryPropertiesClass(
1465
- statement=models.QueryStatementClass(
1466
- value=query.formatted_query_string,
1467
- language=models.QueryLanguageClass.SQL,
1468
- ),
1469
- source=models.QuerySourceClass.SYSTEM,
1470
- created=query.make_created_audit_stamp(),
1471
- lastModified=query.make_last_modified_audit_stamp(),
1472
- origin=query.origin.urn() if query.origin else None,
1473
- ),
1474
- models.QuerySubjectsClass(
1475
- subjects=[
1476
- models.QuerySubjectClass(entity=urn)
1477
- for urn in query_subject_urns
1478
- ]
1487
+ query.make_query_properties(),
1488
+ make_query_subjects(
1489
+ query.get_subjects(
1490
+ downstream_urn=downstream_urn,
1491
+ include_fields=self.generate_query_subject_fields,
1492
+ )
1479
1493
  ),
1480
1494
  models.DataPlatformInstanceClass(
1481
1495
  platform=self.platform.urn(),
@@ -1,7 +1,6 @@
1
1
  from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED
2
2
 
3
3
  import functools
4
- import hashlib
5
4
  import logging
6
5
  import re
7
6
  from typing import Dict, Iterable, Optional, Tuple, Union
@@ -10,6 +9,8 @@ import sqlglot
10
9
  import sqlglot.errors
11
10
  import sqlglot.optimizer.eliminate_ctes
12
11
 
12
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
13
+
13
14
  assert SQLGLOT_PATCHED
14
15
 
15
16
  logger = logging.getLogger(__name__)
@@ -251,11 +252,6 @@ def generalize_query(expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr) ->
251
252
  return expression.transform(_strip_expression, copy=True).sql(dialect=dialect)
252
253
 
253
254
 
254
- def generate_hash(text: str) -> str:
255
- # Once we move to Python 3.9+, we can set `usedforsecurity=False`.
256
- return hashlib.sha256(text.encode("utf-8")).hexdigest()
257
-
258
-
259
255
  def get_query_fingerprint_debug(
260
256
  expression: sqlglot.exp.ExpOrStr,
261
257
  platform: DialectOrStr,