acryl-datahub 1.1.0.4rc3__py3-none-any.whl → 1.1.0.5rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as potentially problematic.

Files changed (41)
  1. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5rc2.dist-info}/METADATA +2518 -2518
  2. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5rc2.dist-info}/RECORD +41 -36
  3. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5rc2.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/ingestion/api/decorators.py +1 -0
  6. datahub/ingestion/api/sink.py +3 -0
  7. datahub/ingestion/autogenerated/__init__.py +0 -0
  8. datahub/ingestion/run/pipeline.py +1 -1
  9. datahub/ingestion/sink/datahub_rest.py +12 -0
  10. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  11. datahub/ingestion/source/identity/azure_ad.py +1 -1
  12. datahub/ingestion/source/identity/okta.py +1 -1
  13. datahub/ingestion/source/mock_data/__init__.py +0 -0
  14. datahub/ingestion/source/mock_data/datahub_mock_data.py +389 -0
  15. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  16. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  17. datahub/ingestion/source/preset.py +2 -2
  18. datahub/ingestion/source/snowflake/snowflake_config.py +1 -0
  19. datahub/ingestion/source/snowflake/snowflake_queries.py +42 -31
  20. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  21. datahub/ingestion/source/sql/mssql/source.py +15 -15
  22. datahub/ingestion/source/sql/vertica.py +1 -1
  23. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  24. datahub/ingestion/source/superset.py +1 -1
  25. datahub/ingestion/source/unity/source.py +1 -1
  26. datahub/metadata/_internal_schema_classes.py +3 -0
  27. datahub/metadata/schema.avsc +2 -0
  28. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  29. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  30. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  31. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  32. datahub/metadata/schemas/DatasetKey.avsc +2 -0
  33. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  34. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  35. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  36. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  37. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  38. datahub/sql_parsing/sql_parsing_aggregator.py +5 -2
  39. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5rc2.dist-info}/WHEEL +0 -0
  40. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5rc2.dist-info}/licenses/LICENSE +0 -0
  41. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5rc2.dist-info}/top_level.txt +0 -0
@@ -63,7 +63,10 @@ from datahub.sql_parsing.sqlglot_lineage import (
     DownstreamColumnRef,
 )
 from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
-from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+from datahub.utilities.file_backed_collections import (
+    ConnectionWrapper,
+    FileBackedList,
+)
 from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
@@ -243,6 +246,12 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         audit_log_file = self.local_temp_path / "audit_log.sqlite"
         use_cached_audit_log = audit_log_file.exists()
 
+        if self.config.local_temp_path is None:
+            self._exit_stack.callback(lambda: audit_log_file.unlink(missing_ok=True))
+
+        shared_connection = self._exit_stack.enter_context(
+            ConnectionWrapper(audit_log_file)
+        )
         queries: FileBackedList[
             Union[
                 KnownLineageMapping,
@@ -251,27 +260,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 TableSwap,
                 ObservedQuery,
             ]
-        ]
+        ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
+
         if use_cached_audit_log:
-            logger.info("Using cached audit log")
-            shared_connection = ConnectionWrapper(audit_log_file)
-            queries = FileBackedList(shared_connection)
+            logger.info(f"Using cached audit log at {audit_log_file}")
         else:
-            audit_log_file.unlink(missing_ok=True)
-
-            shared_connection = ConnectionWrapper(audit_log_file)
-            queries = FileBackedList(shared_connection)
-            entry: Union[
-                KnownLineageMapping,
-                PreparsedQuery,
-                TableRename,
-                TableSwap,
-                ObservedQuery,
-            ]
+            logger.info(f"Fetching audit log into {audit_log_file}")
 
         with self.report.copy_history_fetch_timer:
-            for entry in self.fetch_copy_history():
-                queries.append(entry)
+            for copy_entry in self.fetch_copy_history():
+                queries.append(copy_entry)
 
         with self.report.query_log_fetch_timer:
             for entry in self.fetch_query_log(users):
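
Note: the hunks above move lifecycle management of the shared SQLite connection and the temporary audit-log file into the extractor's _exit_stack, replacing the manual close()/unlink() teardown removed in the next hunk; the unlink callback is registered only when no explicit local_temp_path is configured. A minimal sketch of the pattern using only the standard library (the Extractor class and keep_local_copy flag here are illustrative, not DataHub's API):

import contextlib
import pathlib
import sqlite3
import tempfile

class Extractor:
    def __init__(self, keep_local_copy: bool = False) -> None:
        self._exit_stack = contextlib.ExitStack()
        self.keep_local_copy = keep_local_copy

    def run(self) -> None:
        db_file = pathlib.Path(tempfile.mkdtemp()) / "audit_log.sqlite"
        if not self.keep_local_copy:
            # Callbacks run LIFO on close, so this unlink fires only after
            # the connection registered below has been closed.
            self._exit_stack.callback(lambda: db_file.unlink(missing_ok=True))
        conn = self._exit_stack.enter_context(
            contextlib.closing(sqlite3.connect(db_file))
        )
        conn.execute("CREATE TABLE IF NOT EXISTS queries (text TEXT)")

    def close(self) -> None:
        # Close the connection first, then remove the file.
        self._exit_stack.close()

extractor = Extractor()
extractor.run()
extractor.close()
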
@@ -281,13 +279,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         for i, query in enumerate(queries):
             if i % 1000 == 0:
                 logger.info(f"Added {i} query log entries to SQL aggregator")
+
             self.aggregator.add(query)
 
         yield from auto_workunit(self.aggregator.gen_metadata())
-        if not use_cached_audit_log:
-            queries.close()
-            shared_connection.close()
-            audit_log_file.unlink(missing_ok=True)
 
     def fetch_users(self) -> UsersMapping:
         users: UsersMapping = dict()
@@ -403,8 +398,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
         # TODO need to map snowflake query types to ours
         query_text: str = res["query_text"]
+        snowflake_query_type: str = res["query_type"]
         query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
+            snowflake_query_type, QueryType.UNKNOWN
         )
 
         direct_objects_accessed = res["direct_objects_accessed"]
@@ -421,7 +417,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             res["session_id"],
             timestamp,
             object_modified_by_ddl,
-            res["query_type"],
+            snowflake_query_type,
         )
         if known_ddl_entry:
             return known_ddl_entry
@@ -436,6 +432,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 res["user_name"], users.get(res["user_name"])
             )
         )
+        extra_info = {
+            "snowflake_query_id": res["query_id"],
+            "snowflake_root_query_id": res["root_query_id"],
+            "snowflake_query_type": res["query_type"],
+            "snowflake_role_name": res["role_name"],
+            "query_duration": res["query_duration"],
+            "rows_inserted": res["rows_inserted"],
+            "rows_updated": res["rows_updated"],
+            "rows_deleted": res["rows_deleted"],
+        }
 
         # There are a couple cases when we'd want to prefer our own SQL parsing
         # over Snowflake's metadata.
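
The extra_info dict assembled above is carried through the rest of this release's changes: it is passed to the query entries constructed in the following hunks and copied onto QueryMetadata by sql_parsing_aggregator.py. A simplified sketch of the pass-through, with illustrative stub classes rather than DataHub's real ones:

import dataclasses
from typing import Optional

@dataclasses.dataclass
class PreparsedQuery:
    query_text: str
    # Additional key-value information about the query, for debugging only.
    extra_info: Optional[dict] = None

@dataclasses.dataclass
class QueryMetadata:
    query_text: str
    extra_info: Optional[dict] = None

def to_metadata(parsed: PreparsedQuery) -> QueryMetadata:
    # The aggregator copies extra_info verbatim; it does not affect lineage.
    return QueryMetadata(query_text=parsed.query_text, extra_info=parsed.extra_info)

meta = to_metadata(PreparsedQuery("SELECT 1", {"snowflake_query_id": "abc"}))
print(meta.extra_info)  # {'snowflake_query_id': 'abc'}
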
@@ -470,6 +476,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             query_hash=get_query_fingerprint(
                 query_text, self.identifiers.platform, fast=True
             ),
+            extra_info=extra_info,
         )
 
         upstreams = []
@@ -556,6 +563,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             timestamp=timestamp,
             session_id=res["session_id"],
             query_type=query_type,
+            extra_info=extra_info,
         )
         return entry
 
@@ -667,7 +675,7 @@ def _build_enriched_query_log_query(
     start_time_millis = int(start_time.timestamp() * 1000)
     end_time_millis = int(end_time.timestamp() * 1000)
 
-    users_filter = ""
+    users_filter = "TRUE"
     if deny_usernames:
         user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
         users_filter = f"user_name NOT IN ({user_not_in})"
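
Defaulting users_filter to the SQL literal TRUE (instead of the empty string) makes it a neutral predicate, so the template below can interpolate {users_filter} directly rather than repeating the {users_filter or "TRUE"} guard at every call site. A small sketch of the idiom, with an illustrative table name:

def build_users_filter(deny_usernames: list) -> str:
    users_filter = "TRUE"  # neutral predicate: "AND TRUE" filters nothing
    if deny_usernames:
        user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
        users_filter = f"user_name NOT IN ({user_not_in})"
    return users_filter

query = f"""\
SELECT * FROM query_history
WHERE execution_status = 'SUCCESS'
  AND {build_users_filter([])}"""
print(query)
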
@@ -694,10 +702,10 @@ fingerprinted_queries as (
     FROM
         snowflake.account_usage.query_history
     WHERE
-        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
-        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
+        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
+        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
         AND execution_status = 'SUCCESS'
-        AND {users_filter or "TRUE"}
+        AND {users_filter}
     )
 , deduplicated_queries as (
     SELECT
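
Appending the ISO-8601 rendering of each bound as a SQL comment leaves the query's semantics untouched but makes the epoch-millisecond literals legible wherever the rendered SQL is logged or shows up in Snowflake's query history. For example (values assumed for illustration):

from datetime import datetime, timezone

start_time = datetime(2024, 1, 1, tzinfo=timezone.utc)
start_time_millis = int(start_time.timestamp() * 1000)

predicate = (
    f"start_time >= to_timestamp_ltz({start_time_millis}, 3)"
    f" -- {start_time.isoformat()}"
)
print(predicate)
# start_time >= to_timestamp_ltz(1704067200000, 3) -- 2024-01-01T00:00:00+00:00
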
@@ -715,6 +723,7 @@ fingerprinted_queries as (
 , raw_access_history AS (
     SELECT
         query_id,
+        root_query_id,
         query_start_time,
         user_name,
         direct_objects_accessed,
@@ -723,9 +732,9 @@ fingerprinted_queries as (
     FROM
         snowflake.account_usage.access_history
     WHERE
-        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
-        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or "TRUE"}
+        query_start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
+        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
+        AND {users_filter}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
@@ -734,6 +743,7 @@ fingerprinted_queries as (
     -- TODO: Add table filter clause.
     SELECT
         query_id,
+        root_query_id,
         query_start_time,
         ARRAY_SLICE(
             FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
@@ -764,6 +774,7 @@ fingerprinted_queries as (
         q.rows_deleted AS "ROWS_DELETED",
         q.user_name AS "USER_NAME",
         q.role_name AS "ROLE_NAME",
+        a.root_query_id,
         a.direct_objects_accessed,
         a.objects_modified,
         a.object_modified_by_ddl
@@ -118,7 +118,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 @capability(
@@ -936,25 +936,25 @@ class SQLServerSource(SQLAlchemySource):
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
-        with engine.connect() as conn:
-            if self.config.database and self.config.database != "":
-                inspector = inspect(conn)
-                yield inspector
-            else:
+
+        if self.config.database and self.config.database != "":
+            inspector = inspect(engine)
+            yield inspector
+        else:
+            with engine.begin() as conn:
                 databases = conn.execute(
                     "SELECT name FROM master.sys.databases WHERE name NOT IN \
                 ('master', 'model', 'msdb', 'tempdb', 'Resource', \
                 'distribution' , 'reportserver', 'reportservertempdb'); "
-                )
-                for db in databases:
-                    if self.config.database_pattern.allowed(db["name"]):
-                        url = self.config.get_sql_alchemy_url(current_db=db["name"])
-                        with create_engine(
-                            url, **self.config.options
-                        ).connect() as conn:
-                            inspector = inspect(conn)
-                            self.current_database = db["name"]
-                            yield inspector
+                ).fetchall()
+
+            for db in databases:
+                if self.config.database_pattern.allowed(db["name"]):
+                    url = self.config.get_sql_alchemy_url(current_db=db["name"])
+                    engine = create_engine(url, **self.config.options)
+                    inspector = inspect(engine)
+                    self.current_database = db["name"]
+                    yield inspector
 
     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
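
The rewrite passes the Engine rather than a held-open Connection to SQLAlchemy's inspect(), and materializes the database list with fetchall() before leaving the with block, so no connection stays checked out while the generator later yields inspectors. A minimal sketch of both points against a throwaway SQLite database (the MSSQL URL and system-database query are not reproduced here):

from sqlalchemy import create_engine, inspect, text

engine = create_engine("sqlite:///:memory:")

# inspect(engine) acquires pooled connections per reflection call instead of
# pinning one connection for the inspector's whole lifetime, as inspect(conn)
# inside "with engine.connect()" effectively did.
inspector = inspect(engine)
print(inspector.get_table_names())  # [] for an empty database

# Materializing the result before the block exits, as the diff does with
# .fetchall(), avoids reading from an already-closed connection later.
with engine.begin() as conn:
    rows = conn.execute(text("SELECT 1 AS one")).fetchall()
print(rows)  # [(1,)]
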
@@ -116,7 +116,7 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):
@@ -179,7 +179,7 @@ class StatefulIngestionReport(SourceReport):
 
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class StatefulIngestionSourceBase(Source):
@@ -272,7 +272,7 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
@@ -159,7 +159,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
@@ -4599,6 +4599,9 @@ class FabricTypeClass(object):
     SIT = "SIT"
     """System Integration Testing"""
 
+    SBX = "SBX"
+    """Alternative spelling for sandbox"""
+
     SANDBOX = "SANDBOX"
     """Designates sandbox fabrics"""
 
@@ -9508,6 +9508,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -9531,6 +9532,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -99,6 +99,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -122,6 +123,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -153,6 +153,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -176,6 +177,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -219,6 +219,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -242,6 +243,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -52,6 +52,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -75,6 +76,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -89,6 +89,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -112,6 +113,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -64,6 +64,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -87,6 +88,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -60,6 +60,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -83,6 +84,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -67,6 +67,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -90,6 +91,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -81,6 +81,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -104,6 +105,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -2430,6 +2430,7 @@
         "QA": "Designates quality assurance fabrics",
         "RVW": "Designates review fabrics",
         "SANDBOX": "Designates sandbox fabrics",
+        "SBX": "Alternative spelling for sandbox",
         "SIT": "System Integration Testing",
         "STG": "Designates staging fabrics",
         "TEST": "Designates testing fabrics",
@@ -2453,6 +2454,7 @@
         "PRD",
         "TST",
         "SIT",
+        "SBX",
         "SANDBOX"
       ],
       "doc": "Fabric group type"
@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
     ToolMetaExtractorReport,
 )
 from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
+from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,
@@ -140,6 +141,7 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
     def make_created_audit_stamp(self) -> models.AuditStampClass:
@@ -263,7 +265,7 @@ class PreparsedQuery:
     query_type_props: QueryTypeProps = dataclasses.field(
         default_factory=lambda: QueryTypeProps()
    )
-    # Use this to store addtitional key-value information about query for debugging
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
@@ -948,6 +950,7 @@ class SqlParsingAggregator(Closeable):
                         column_usage=parsed.column_usage or {},
                         confidence_score=parsed.confidence_score,
                         used_temp_tables=session_has_temp_tables,
+                        extra_info=parsed.extra_info,
                         origin=parsed.origin,
                     )
                 )
@@ -1706,7 +1709,7 @@ class SqlParsingAggregator(Closeable):
         )
 
         merged_query_text = ";\n\n".join(
-            [q.formatted_query_string for q in ordered_queries]
+            deduplicate_list([q.formatted_query_string for q in ordered_queries])
        )
 
        resolved_query = dataclasses.replace(
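
Merged composite queries can contain the same formatted statement more than once (for example, an identical statement re-observed across sessions); deduplicating before joining keeps the merged text compact. DataHub imports deduplicate_list from datahub.utilities.dedup_list; the sketch below shows the standard order-preserving idiom, which may differ from the library's exact implementation:

from typing import Iterable, List, TypeVar

T = TypeVar("T")

def deduplicate_list(iterable: Iterable[T]) -> List[T]:
    # dict keys preserve first-seen insertion order in Python 3.7+.
    return list(dict.fromkeys(iterable))

queries = ["SELECT 1", "SELECT 2", "SELECT 1"]
print(";\n\n".join(deduplicate_list(queries)))
# SELECT 1;
#
# SELECT 2
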