acryl-datahub 1.1.0.5rc8__py3-none-any.whl → 1.1.0.5rc10__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.

This version of acryl-datahub has been flagged as potentially problematic.

Files changed (47)
  1. {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/METADATA +2465 -2465
  2. {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/RECORD +47 -47
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +45 -1
  5. datahub/cli/cli_utils.py +0 -10
  6. datahub/cli/container_cli.py +5 -0
  7. datahub/cli/delete_cli.py +5 -0
  8. datahub/cli/docker_cli.py +2 -0
  9. datahub/cli/exists_cli.py +2 -0
  10. datahub/cli/get_cli.py +2 -0
  11. datahub/cli/iceberg_cli.py +5 -0
  12. datahub/cli/ingest_cli.py +7 -0
  13. datahub/cli/migrate.py +2 -0
  14. datahub/cli/put_cli.py +3 -0
  15. datahub/cli/specific/assertions_cli.py +2 -0
  16. datahub/cli/specific/datacontract_cli.py +3 -0
  17. datahub/cli/specific/dataproduct_cli.py +11 -0
  18. datahub/cli/specific/dataset_cli.py +4 -0
  19. datahub/cli/specific/forms_cli.py +2 -0
  20. datahub/cli/specific/group_cli.py +2 -0
  21. datahub/cli/specific/structuredproperties_cli.py +4 -0
  22. datahub/cli/specific/user_cli.py +2 -0
  23. datahub/cli/state_cli.py +2 -0
  24. datahub/cli/timeline_cli.py +2 -0
  25. datahub/emitter/rest_emitter.py +24 -8
  26. datahub/ingestion/api/report.py +72 -12
  27. datahub/ingestion/autogenerated/capability_summary.json +19 -1
  28. datahub/ingestion/autogenerated/lineage_helper.py +101 -19
  29. datahub/ingestion/source/common/subtypes.py +2 -0
  30. datahub/ingestion/source/dremio/dremio_api.py +38 -27
  31. datahub/ingestion/source/mlflow.py +11 -1
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +127 -0
  33. datahub/ingestion/source/sql/sql_common.py +4 -0
  34. datahub/ingestion/source/sql/teradata.py +993 -234
  35. datahub/ingestion/source/tableau/tableau.py +11 -2
  36. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  37. datahub/metadata/_internal_schema_classes.py +528 -529
  38. datahub/metadata/_urns/urn_defs.py +1803 -1803
  39. datahub/metadata/schema.avsc +16720 -17109
  40. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +1 -3
  41. datahub/sdk/main_client.py +14 -2
  42. datahub/sdk/search_client.py +4 -3
  43. datahub/telemetry/telemetry.py +17 -11
  44. {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/licenses/LICENSE +0 -0
  47. {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,11 @@
1
1
  import logging
2
+ import time
2
3
  from collections import defaultdict
3
- from dataclasses import dataclass
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from dataclasses import dataclass, field
4
6
  from datetime import datetime
5
7
  from functools import lru_cache
8
+ from threading import Lock
6
9
  from typing import (
7
10
  Any,
8
11
  Dict,
@@ -10,7 +13,6 @@ from typing import (
10
13
  List,
11
14
  MutableMapping,
12
15
  Optional,
13
- Set,
14
16
  Tuple,
15
17
  Union,
16
18
  )
@@ -29,7 +31,6 @@ from teradatasqlalchemy.options import configure
29
31
 
30
32
  from datahub.configuration.common import AllowDenyPattern
31
33
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
32
- from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
33
34
  from datahub.ingestion.api.common import PipelineContext
34
35
  from datahub.ingestion.api.decorators import (
35
36
  SourceCapability,
@@ -39,10 +40,9 @@ from datahub.ingestion.api.decorators import (
39
40
  platform_name,
40
41
  support_status,
41
42
  )
42
- from datahub.ingestion.api.source_helpers import auto_lowercase_urns
43
43
  from datahub.ingestion.api.workunit import MetadataWorkUnit
44
44
  from datahub.ingestion.graph.client import DataHubGraph
45
- from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
45
+ from datahub.ingestion.source.sql.sql_common import register_custom_type
46
46
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
47
47
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
48
48
  from datahub.ingestion.source.sql.two_tier_sql_source import (
@@ -56,13 +56,64 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
56
56
  BytesTypeClass,
57
57
  TimeTypeClass,
58
58
  )
59
- from datahub.metadata.schema_classes import SchemaMetadataClass
59
+ from datahub.metadata.urns import CorpUserUrn
60
60
  from datahub.sql_parsing.schema_resolver import SchemaResolver
61
- from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
61
+ from datahub.sql_parsing.sql_parsing_aggregator import (
62
+ ObservedQuery,
63
+ SqlParsingAggregator,
64
+ )
62
65
  from datahub.utilities.groupby import groupby_unsorted
66
+ from datahub.utilities.stats_collections import TopKDict
63
67
 
64
68
  logger: logging.Logger = logging.getLogger(__name__)
65
69
 
70
+ # Common excluded databases used in multiple places
71
+ EXCLUDED_DATABASES = [
72
+ "All",
73
+ "Crashdumps",
74
+ "Default",
75
+ "DemoNow_Monitor",
76
+ "EXTUSER",
77
+ "External_AP",
78
+ "GLOBAL_FUNCTIONS",
79
+ "LockLogShredder",
80
+ "PUBLIC",
81
+ "SQLJ",
82
+ "SYSBAR",
83
+ "SYSJDBC",
84
+ "SYSLIB",
85
+ "SYSSPATIAL",
86
+ "SYSUDTLIB",
87
+ "SYSUIF",
88
+ "SysAdmin",
89
+ "Sys_Calendar",
90
+ "SystemFe",
91
+ "TDBCMgmt",
92
+ "TDMaps",
93
+ "TDPUSER",
94
+ "TDQCD",
95
+ "TDStats",
96
+ "TD_ANALYTICS_DB",
97
+ "TD_SERVER_DB",
98
+ "TD_SYSFNLIB",
99
+ "TD_SYSGPL",
100
+ "TD_SYSXML",
101
+ "TDaaS_BAR",
102
+ "TDaaS_DB",
103
+ "TDaaS_Maint",
104
+ "TDaaS_Monitor",
105
+ "TDaaS_Support",
106
+ "TDaaS_TDBCMgmt1",
107
+ "TDaaS_TDBCMgmt2",
108
+ "dbcmngr",
109
+ "mldb",
110
+ "system",
111
+ "tapidb",
112
+ "tdwm",
113
+ "val",
114
+ "dbc",
115
+ ]
116
+
66
117
  register_custom_type(custom_types.JSON, BytesTypeClass)
67
118
  register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass)
68
119
  register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass)
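The module-level EXCLUDED_DATABASES constant introduced above replaces two hard-coded copies of the same list further down in this file (the database_pattern default and the inline filter in TABLES_AND_VIEWS_QUERY). A minimal sketch of both usages, mirroring how the rest of this diff consumes the constant; the abbreviated query text below is illustrative, not the package's query:

    from datahub.configuration.common import AllowDenyPattern

    # Default deny pattern for the source config, as in the database_pattern field below.
    default_database_pattern = AllowDenyPattern(deny=EXCLUDED_DATABASES)

    # Inline SQL filter built from the same list, as in TABLES_AND_VIEWS_QUERY.
    not_in_clause = ",".join(f"'{db}'" for db in EXCLUDED_DATABASES)
    tables_query = f"SELECT t.DataBaseName, t.TableName FROM dbc.TablesV t WHERE DataBaseName NOT IN ({not_in_clause})"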
@@ -99,14 +150,16 @@ class TeradataTable:
99
150
  request_text: Optional[str]
100
151
 
101
152
 
102
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
153
+ # Cache size of 1 is sufficient since schemas are processed sequentially
154
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
103
155
  @lru_cache(maxsize=1)
104
156
  def get_schema_columns(
105
157
  self: Any, connection: Connection, dbc_columns: str, schema: str
106
158
  ) -> Dict[str, List[Any]]:
159
+ start_time = time.time()
107
160
  columns: Dict[str, List[Any]] = {}
108
- columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) order by TableName, ColumnId"
109
- rows = connection.execute(text(columns_query)).fetchall()
161
+ columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) order by TableName, ColumnId"
162
+ rows = connection.execute(text(columns_query), {"schema": schema}).fetchall()
110
163
  for row in rows:
111
164
  row_mapping = row._mapping
112
165
  if row_mapping.TableName not in columns:
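The two dbc queries above switch from interpolating the schema name into the SQL text to SQLAlchemy bound parameters. A self-contained sketch of the same pattern; the DSN and schema value are placeholders, not values from this package:

    from sqlalchemy import create_engine, text

    engine = create_engine("teradatasql://user:password@host")  # placeholder DSN

    columns_query = text(
        "select * from dbc.ColumnsV "
        "where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) "
        "order by TableName, ColumnId"
    )

    with engine.connect() as conn:
        # :schema reaches the driver as a bound parameter rather than a spliced literal.
        rows = conn.execute(columns_query, {"schema": "my_database"}).fetchall()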
@@ -114,18 +167,29 @@ def get_schema_columns(
114
167
 
115
168
  columns[row_mapping.TableName].append(row_mapping)
116
169
 
170
+ end_time = time.time()
171
+ extraction_time = end_time - start_time
172
+ logger.info(
173
+ f"Column extraction for schema '{schema}' completed in {extraction_time:.2f} seconds"
174
+ )
175
+
176
+ # Update report if available
177
+ if hasattr(self, "report"):
178
+ self.report.column_extraction_duration_seconds += extraction_time
179
+
117
180
  return columns
118
181
 
119
182
 
120
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
183
+ # Cache size of 1 is sufficient since schemas are processed sequentially
184
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
121
185
  @lru_cache(maxsize=1)
122
186
  def get_schema_pk_constraints(
123
187
  self: Any, connection: Connection, schema: str
124
188
  ) -> Dict[str, List[Any]]:
125
189
  dbc_indices = "IndicesV" + "X" if configure.usexviews else "IndicesV"
126
190
  primary_keys: Dict[str, List[Any]] = {}
127
- stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
128
- rows = connection.execute(text(stmt)).fetchall()
191
+ stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
192
+ rows = connection.execute(text(stmt), {"schema": schema}).fetchall()
129
193
  for row in rows:
130
194
  row_mapping = row._mapping
131
195
  if row_mapping.TableName not in primary_keys:
@@ -172,6 +236,10 @@ def optimized_get_pk_constraint(
172
236
  index_column.IndexName
173
237
  ) # There should be just one IndexName
174
238
 
239
+ # Update counter if available
240
+ if hasattr(self, "report"):
241
+ self.report.num_primary_keys_processed += 1
242
+
175
243
  return {"constrained_columns": index_columns, "name": index_name}
176
244
 
177
245
 
@@ -228,23 +296,55 @@ def optimized_get_columns(
228
296
  table_name, []
229
297
  )
230
298
 
299
+ start_time = time.time()
300
+
231
301
  final_column_info = []
232
302
  # Don't care about ART tables now
233
303
  # Ignore the non-functional column in a PTI table
234
304
  for row in res:
235
- col_info = self._get_column_info(row)
236
- if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
237
- if (
238
- col_info["ColumnName"] == "TD_TIMEBUCKET"
239
- and col_info["TSColumnType"].strip() == "TB"
305
+ try:
306
+ col_info = self._get_column_info(row)
307
+
308
+ # Add CommentString as comment field for column description
309
+ if hasattr(row, "CommentString") and row.CommentString:
310
+ col_info["comment"] = row.CommentString.strip()
311
+ elif (
312
+ isinstance(row, dict)
313
+ and "CommentString" in row
314
+ and row["CommentString"]
240
315
  ):
241
- continue
242
- final_column_info.append(col_info)
316
+ col_info["comment"] = row["CommentString"].strip()
317
+
318
+ if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
319
+ if (
320
+ col_info["ColumnName"] == "TD_TIMEBUCKET"
321
+ and col_info["TSColumnType"].strip() == "TB"
322
+ ):
323
+ continue
324
+ final_column_info.append(col_info)
325
+
326
+ # Update counter - access report through self from the connection context
327
+ if hasattr(self, "report"):
328
+ self.report.num_columns_processed += 1
329
+
330
+ except Exception as e:
331
+ logger.error(
332
+ f"Failed to process column {getattr(row, 'ColumnName', 'unknown')}: {e}"
333
+ )
334
+ if hasattr(self, "report"):
335
+ self.report.num_column_extraction_failures += 1
336
+ continue
337
+
338
+ # Update timing
339
+ if hasattr(self, "report"):
340
+ end_time = time.time()
341
+ self.report.column_extraction_duration_seconds += end_time - start_time
243
342
 
244
343
  return final_column_info
245
344
 
246
345
 
247
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
346
+ # Cache size of 1 is sufficient since schemas are processed sequentially
347
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
248
348
  @lru_cache(maxsize=1)
249
349
  def get_schema_foreign_keys(
250
350
  self: Any, connection: Connection, schema: str
@@ -334,9 +434,24 @@ def optimized_get_view_definition(
334
434
 
335
435
  @dataclass
336
436
  class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
337
- num_queries_parsed: int = 0
338
- num_view_ddl_parsed: int = 0
339
- num_table_parse_failures: int = 0
437
+ # View processing metrics (actively used)
438
+ num_views_processed: int = 0
439
+ num_view_processing_failures: int = 0
440
+ view_extraction_total_time_seconds: float = 0.0
441
+ view_extraction_average_time_seconds: float = 0.0
442
+ slowest_view_processing_time_seconds: float = 0.0
443
+ slowest_view_name: TopKDict[str, float] = field(default_factory=TopKDict)
444
+
445
+ # Connection pool performance metrics (actively used)
446
+ connection_pool_wait_time_seconds: float = 0.0
447
+ connection_pool_max_wait_time_seconds: float = 0.0
448
+
449
+ # Database-level metrics similar to BigQuery's approach (actively used)
450
+ num_database_tables_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
451
+ num_database_views_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
452
+
453
+ # Global metadata extraction timing (single query for all databases)
454
+ metadata_extraction_total_sec: float = 0.0
340
455
 
341
456
 
342
457
  class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
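The reworked TeradataReport above keeps its per-view and per-database metrics in TopKDict fields created via field(default_factory=...). A minimal sketch of that dataclass pattern, assuming only that TopKDict accepts writes like a regular dict; the class and values below are illustrative, not part of the package:

    from dataclasses import dataclass, field

    from datahub.utilities.stats_collections import TopKDict


    @dataclass
    class _TimingReport:
        # default_factory gives every report instance its own container;
        # a plain mutable default would be shared across instances.
        slowest_view_name: TopKDict[str, float] = field(default_factory=TopKDict)
        view_extraction_total_time_seconds: float = 0.0


    report = _TimingReport()
    report.slowest_view_name["analytics.orders_view"] = 12.4  # seconds, illustrative
    report.view_extraction_total_time_seconds += 12.4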
@@ -353,53 +468,7 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
353
468
  )
354
469
 
355
470
  database_pattern = Field(
356
- default=AllowDenyPattern(
357
- deny=[
358
- "All",
359
- "Crashdumps",
360
- "Default",
361
- "DemoNow_Monitor",
362
- "EXTUSER",
363
- "External_AP",
364
- "GLOBAL_FUNCTIONS",
365
- "LockLogShredder",
366
- "PUBLIC",
367
- "SQLJ",
368
- "SYSBAR",
369
- "SYSJDBC",
370
- "SYSLIB",
371
- "SYSSPATIAL",
372
- "SYSUDTLIB",
373
- "SYSUIF",
374
- "SysAdmin",
375
- "Sys_Calendar",
376
- "SystemFe",
377
- "TDBCMgmt",
378
- "TDMaps",
379
- "TDPUSER",
380
- "TDQCD",
381
- "TDStats",
382
- "TD_ANALYTICS_DB",
383
- "TD_SERVER_DB",
384
- "TD_SYSFNLIB",
385
- "TD_SYSGPL",
386
- "TD_SYSXML",
387
- "TDaaS_BAR",
388
- "TDaaS_DB",
389
- "TDaaS_Maint",
390
- "TDaaS_Monitor",
391
- "TDaaS_Support",
392
- "TDaaS_TDBCMgmt1",
393
- "TDaaS_TDBCMgmt2",
394
- "dbcmngr",
395
- "mldb",
396
- "system",
397
- "tapidb",
398
- "tdwm",
399
- "val",
400
- "dbc",
401
- ]
402
- ),
471
+ default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
403
472
  description="Regex patterns for databases to filter in ingestion.",
404
473
  )
405
474
  include_table_lineage = Field(
@@ -413,6 +482,13 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
413
482
  description="Whether to include view lineage in the ingestion. "
414
483
  "This requires to have the view lineage feature enabled.",
415
484
  )
485
+
486
+ include_queries = Field(
487
+ default=True,
488
+ description="Whether to generate query entities for SQL queries. "
489
+ "Query entities provide metadata about individual SQL queries including "
490
+ "execution timestamps, user information, and query text.",
491
+ )
416
492
  usage: BaseUsageConfig = Field(
417
493
  description="The usage config to use when generating usage statistics",
418
494
  default=BaseUsageConfig(),
@@ -438,6 +514,26 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
438
514
  description="Whether to use QVCI to get column information. This is faster but requires to have QVCI enabled.",
439
515
  )
440
516
 
517
+ include_historical_lineage: bool = Field(
518
+ default=False,
519
+ description="Whether to include historical lineage data from PDCRINFO.DBQLSqlTbl_Hst in addition to current DBC.QryLogV data. "
520
+ "This provides access to historical query logs that may have been archived. "
521
+ "The historical table existence is checked automatically and gracefully falls back to current data only if not available.",
522
+ )
523
+
524
+ use_server_side_cursors: bool = Field(
525
+ default=True,
526
+ description="Enable server-side cursors for large result sets using SQLAlchemy's stream_results. "
527
+ "This reduces memory usage by streaming results from the database server. "
528
+ "Automatically falls back to client-side batching if server-side cursors are not supported.",
529
+ )
530
+
531
+ max_workers: int = Field(
532
+ default=10,
533
+ description="Maximum number of worker threads to use for parallel processing. "
534
+ "Controls the level of concurrency for operations like view processing.",
535
+ )
536
+
441
537
 
442
538
  @platform_name("Teradata")
443
539
  @config_class(TeradataConfig)
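Together with include_queries from the previous hunk, the options added here (include_historical_lineage, use_server_side_cursors, max_workers) are set in the source recipe. An illustrative recipe fragment: the option names come from this diff, while the connection details, sink, and Pipeline entry point are standard DataHub usage with placeholder values:

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "teradata",
                "config": {
                    "host_port": "teradata.example.com:1025",  # placeholder
                    "username": "dbc",
                    "password": "dbc",
                    "include_table_lineage": True,
                    "include_queries": True,              # emit query entities
                    "include_historical_lineage": False,  # also read PDCRINFO.DBQLSqlTbl_Hst when available
                    "use_server_side_cursors": True,      # stream large audit-log result sets
                    "max_workers": 10,                    # parallel view processing
                },
            },
            "sink": {"type": "console"},
        }
    )
    pipeline.run()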
@@ -464,13 +560,7 @@ class TeradataSource(TwoTierSQLAlchemySource):
464
560
 
465
561
  config: TeradataConfig
466
562
 
467
- LINEAGE_QUERY_DATABASE_FILTER: str = """and default_database IN ({databases})"""
468
-
469
- LINEAGE_TIMESTAMP_BOUND_QUERY: str = """
470
- SELECT MIN(CollectTimeStamp) as "min_ts", MAX(CollectTimeStamp) as "max_ts" from DBC.QryLogV
471
- """.strip()
472
-
473
- QUERY_TEXT_QUERY: str = """
563
+ QUERY_TEXT_CURRENT_QUERIES: str = """
474
564
  SELECT
475
565
  s.QueryID as "query_id",
476
566
  UserName as "user",
@@ -503,10 +593,89 @@ class TeradataSource(TwoTierSQLAlchemySource):
503
593
  and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
504
594
  and default_database not in ('DEMONOW_MONITOR')
505
595
  {databases_filter}
506
- ORDER BY "query_id", "row_no"
596
+ ORDER BY "timestamp", "query_id", "row_no"
597
+ """.strip()
598
+
599
+ QUERY_TEXT_HISTORICAL_UNION: str = """
600
+ SELECT
601
+ "query_id",
602
+ "user",
603
+ "timestamp",
604
+ default_database,
605
+ "query_text",
606
+ "row_no"
607
+ FROM (
608
+ SELECT
609
+ h.QueryID as "query_id",
610
+ h.UserName as "user",
611
+ h.StartTime AT TIME ZONE 'GMT' as "timestamp",
612
+ h.DefaultDatabase as default_database,
613
+ h.SqlTextInfo as "query_text",
614
+ h.SqlRowNo as "row_no"
615
+ FROM "PDCRINFO".DBQLSqlTbl_Hst as h
616
+ WHERE
617
+ h.ErrorCode = 0
618
+ AND h.statementtype not in (
619
+ 'Unrecognized type',
620
+ 'Create Database/User',
621
+ 'Help',
622
+ 'Modify Database',
623
+ 'Drop Table',
624
+ 'Show',
625
+ 'Not Applicable',
626
+ 'Grant',
627
+ 'Abort',
628
+ 'Database',
629
+ 'Flush Query Logging',
630
+ 'Null',
631
+ 'Begin/End DBQL',
632
+ 'Revoke'
633
+ )
634
+ and h.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
635
+ and h.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
636
+ and h.CollectTimeStamp >= TIMESTAMP '{start_time}'
637
+ and h.DefaultDatabase not in ('DEMONOW_MONITOR')
638
+ {databases_filter_history}
639
+
640
+ UNION
641
+
642
+ SELECT
643
+ s.QueryID as "query_id",
644
+ l.UserName as "user",
645
+ l.StartTime AT TIME ZONE 'GMT' as "timestamp",
646
+ l.DefaultDatabase as default_database,
647
+ s.SqlTextInfo as "query_text",
648
+ s.SqlRowNo as "row_no"
649
+ FROM "DBC".QryLogV as l
650
+ JOIN "DBC".QryLogSqlV as s on s.QueryID = l.QueryID
651
+ WHERE
652
+ l.ErrorCode = 0
653
+ AND l.statementtype not in (
654
+ 'Unrecognized type',
655
+ 'Create Database/User',
656
+ 'Help',
657
+ 'Modify Database',
658
+ 'Drop Table',
659
+ 'Show',
660
+ 'Not Applicable',
661
+ 'Grant',
662
+ 'Abort',
663
+ 'Database',
664
+ 'Flush Query Logging',
665
+ 'Null',
666
+ 'Begin/End DBQL',
667
+ 'Revoke'
668
+ )
669
+ and l.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
670
+ and l.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
671
+ and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
672
+ and l.DefaultDatabase not in ('DEMONOW_MONITOR')
673
+ {databases_filter}
674
+ ) as combined_results
675
+ ORDER BY "timestamp", "query_id", "row_no"
507
676
  """.strip()
508
677
 
509
- TABLES_AND_VIEWS_QUERY: str = """
678
+ TABLES_AND_VIEWS_QUERY: str = f"""
510
679
  SELECT
511
680
  t.DataBaseName,
512
681
  t.TableName as name,
@@ -524,77 +693,51 @@ SELECT
524
693
  t.LastAlterTimeStamp,
525
694
  t.RequestText
526
695
  FROM dbc.TablesV t
527
- WHERE DataBaseName NOT IN (
528
- 'All',
529
- 'Crashdumps',
530
- 'Default',
531
- 'DemoNow_Monitor',
532
- 'EXTUSER',
533
- 'External_AP',
534
- 'GLOBAL_FUNCTIONS',
535
- 'LockLogShredder',
536
- 'PUBLIC',
537
- 'SQLJ',
538
- 'SYSBAR',
539
- 'SYSJDBC',
540
- 'SYSLIB',
541
- 'SYSSPATIAL',
542
- 'SYSUDTLIB',
543
- 'SYSUIF',
544
- 'SysAdmin',
545
- 'Sys_Calendar',
546
- 'SystemFe',
547
- 'TDBCMgmt',
548
- 'TDMaps',
549
- 'TDPUSER',
550
- 'TDQCD',
551
- 'TDStats',
552
- 'TD_ANALYTICS_DB',
553
- 'TD_SERVER_DB',
554
- 'TD_SYSFNLIB',
555
- 'TD_SYSGPL',
556
- 'TD_SYSXML',
557
- 'TDaaS_BAR',
558
- 'TDaaS_DB',
559
- 'TDaaS_Maint',
560
- 'TDaaS_Monitor',
561
- 'TDaaS_Support',
562
- 'TDaaS_TDBCMgmt1',
563
- 'TDaaS_TDBCMgmt2',
564
- 'dbcmngr',
565
- 'mldb',
566
- 'system',
567
- 'tapidb',
568
- 'tdwm',
569
- 'val',
570
- 'dbc'
571
- )
696
+ WHERE DataBaseName NOT IN ({",".join([f"'{db}'" for db in EXCLUDED_DATABASES])})
572
697
  AND t.TableKind in ('T', 'V', 'Q', 'O')
573
698
  ORDER by DataBaseName, TableName;
574
699
  """.strip()
575
700
 
576
701
  _tables_cache: MutableMapping[str, List[TeradataTable]] = defaultdict(list)
702
+ _tables_cache_lock = Lock() # Protect shared cache from concurrent access
703
+ _pooled_engine: Optional[Engine] = None # Reusable pooled engine
704
+ _pooled_engine_lock = Lock() # Protect engine creation
577
705
 
578
706
  def __init__(self, config: TeradataConfig, ctx: PipelineContext):
579
707
  super().__init__(config, ctx, "teradata")
580
708
 
581
709
  self.report: TeradataReport = TeradataReport()
582
710
  self.graph: Optional[DataHubGraph] = ctx.graph
711
+ self._report_lock = Lock() # Thread safety for report counters
712
+
713
+ self.schema_resolver = self._init_schema_resolver()
583
714
 
584
- self.builder: SqlParsingBuilder = SqlParsingBuilder(
585
- usage_config=(
586
- self.config.usage if self.config.include_usage_statistics else None
587
- ),
588
- generate_lineage=True,
715
+ # Initialize SqlParsingAggregator for modern lineage processing
716
+ logger.info("Initializing SqlParsingAggregator for enhanced lineage processing")
717
+ self.aggregator = SqlParsingAggregator(
718
+ platform="teradata",
719
+ platform_instance=self.config.platform_instance,
720
+ env=self.config.env,
721
+ schema_resolver=self.schema_resolver,
722
+ graph=self.ctx.graph,
723
+ generate_lineage=self.include_lineage,
724
+ generate_queries=self.config.include_queries,
589
725
  generate_usage_statistics=self.config.include_usage_statistics,
590
- generate_operations=self.config.usage.include_operational_stats,
726
+ generate_query_usage_statistics=self.config.include_usage_statistics,
727
+ generate_operations=self.config.usage.include_operational_stats
728
+ if self.config.include_usage_statistics
729
+ else False,
730
+ usage_config=self.config.usage
731
+ if self.config.include_usage_statistics
732
+ else None,
733
+ eager_graph_load=False,
591
734
  )
592
-
593
- self.schema_resolver = self._init_schema_resolver()
735
+ self.report.sql_aggregator = self.aggregator.report
594
736
 
595
737
  if self.config.include_tables or self.config.include_views:
596
- self.cache_tables_and_views()
597
- logger.info(f"Found {len(self._tables_cache)} tables and views")
738
+ with self.report.new_stage("Table and view discovery"):
739
+ self.cache_tables_and_views()
740
+ logger.info(f"Found {len(self._tables_cache)} tables and views")
598
741
  setattr(self, "loop_tables", self.cached_loop_tables) # noqa: B010
599
742
  setattr(self, "loop_views", self.cached_loop_views) # noqa: B010
600
743
  setattr( # noqa: B010
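The constructor above replaces the removed SqlParsingBuilder with a SqlParsingAggregator, and the audit-log code later in this diff feeds it ObservedQuery objects before asking it for metadata. The flow reduced to its essentials, using only calls that appear in this diff; the constructor defaults shown and the sample query are illustrative:

    from datahub.metadata.urns import CorpUserUrn
    from datahub.sql_parsing.sql_parsing_aggregator import (
        ObservedQuery,
        SqlParsingAggregator,
    )

    aggregator = SqlParsingAggregator(
        platform="teradata",
        generate_lineage=True,
        generate_queries=True,
        generate_usage_statistics=False,
        generate_operations=False,
    )

    # Each reconstructed audit-log query is registered with the aggregator ...
    aggregator.add(
        ObservedQuery(
            query="INSERT INTO sales.daily_summary SELECT * FROM sales.raw_orders",
            default_db="sales",
            default_schema="sales",
            user=CorpUserUrn("etl_user"),
        )
    )

    # ... and lineage, usage, and query aspects are emitted at the end.
    workunits = [mcp.as_workunit() for mcp in aggregator.gen_metadata()]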
@@ -724,6 +867,8 @@ ORDER by DataBaseName, TableName;
724
867
 
725
868
  logger.debug(f"sql_alchemy_url={url}")
726
869
  engine = create_engine(url, **self.config.options)
870
+
871
+ # Get list of databases first
727
872
  with engine.connect() as conn:
728
873
  inspector = inspect(conn)
729
874
  if self.config.database and self.config.database != "":
@@ -732,13 +877,14 @@ ORDER by DataBaseName, TableName;
732
877
  databases = self.config.databases
733
878
  else:
734
879
  databases = inspector.get_schema_names()
735
- for db in databases:
736
- if self.config.database_pattern.allowed(db):
737
- # url = self.config.get_sql_alchemy_url(current_db=db)
738
- # with create_engine(url, **self.config.options).connect() as conn:
739
- # inspector = inspect(conn)
740
- inspector._datahub_database = db
741
- yield inspector
880
+
881
+ # Create separate connections for each database to avoid connection lifecycle issues
882
+ for db in databases:
883
+ if self.config.database_pattern.allowed(db):
884
+ with engine.connect() as conn:
885
+ db_inspector = inspect(conn)
886
+ db_inspector._datahub_database = db
887
+ yield db_inspector
742
888
 
743
889
  def get_db_name(self, inspector: Inspector) -> str:
744
890
  if hasattr(inspector, "_datahub_database"):
@@ -756,14 +902,15 @@ ORDER by DataBaseName, TableName;
756
902
  inspector: Inspector,
757
903
  schema: str,
758
904
  sql_config: SQLCommonConfig,
759
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
905
+ ) -> Iterable[MetadataWorkUnit]:
760
906
  setattr( # noqa: B010
761
907
  inspector,
762
908
  "get_table_names",
763
909
  lambda schema: [
764
910
  i.name
765
911
  for i in filter(
766
- lambda t: t.object_type != "View", self._tables_cache[schema]
912
+ lambda t: t.object_type != "View",
913
+ self._tables_cache.get(schema, []),
767
914
  )
768
915
  ],
769
916
  )
@@ -779,7 +926,8 @@ ORDER by DataBaseName, TableName;
779
926
  # this method and provide a location.
780
927
  location: Optional[str] = None
781
928
 
782
- for entry in self._tables_cache[schema]:
929
+ cache_entries = self._tables_cache.get(schema, [])
930
+ for entry in cache_entries:
783
931
  if entry.name == table:
784
932
  description = entry.description
785
933
  if entry.object_type == "View" and entry.request_text:
@@ -792,123 +940,734 @@ ORDER by DataBaseName, TableName;
792
940
  inspector: Inspector,
793
941
  schema: str,
794
942
  sql_config: SQLCommonConfig,
795
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
796
- setattr( # noqa: B010
797
- inspector,
798
- "get_view_names",
799
- lambda schema: [
800
- i.name
801
- for i in filter(
802
- lambda t: t.object_type == "View", self._tables_cache[schema]
943
+ ) -> Iterable[MetadataWorkUnit]:
944
+ start_time = time.time()
945
+
946
+ # Get view names from cache
947
+ view_names = [
948
+ i.name
949
+ for i in filter(
950
+ lambda t: t.object_type == "View", self._tables_cache.get(schema, [])
951
+ )
952
+ ]
953
+ actual_view_count = len(view_names)
954
+
955
+ if actual_view_count == 0:
956
+ end_time = time.time()
957
+ processing_time = end_time - start_time
958
+ logger.info(
959
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds (0 views, 0 work units)"
960
+ )
961
+ return
962
+
963
+ # Use custom threading implementation with connection pooling
964
+ work_unit_count = 0
965
+
966
+ for work_unit in self._loop_views_with_connection_pool(
967
+ view_names, schema, sql_config
968
+ ):
969
+ work_unit_count += 1
970
+ yield work_unit
971
+
972
+ end_time = time.time()
973
+ processing_time = end_time - start_time
974
+
975
+ logger.info(
976
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds ({actual_view_count} views, {work_unit_count} work units)"
977
+ )
978
+
979
+ # Update report timing metrics
980
+ if hasattr(self, "report"):
981
+ self.report.view_extraction_total_time_seconds += processing_time
982
+ self.report.num_views_processed += actual_view_count
983
+
984
+ # Track slowest view processing at view level (will be updated by individual view processing)
985
+ # Note: slowest_view_name now tracks individual views, not schemas
986
+
987
+ # Calculate average processing time per view
988
+ if self.report.num_views_processed > 0:
989
+ self.report.view_extraction_average_time_seconds = (
990
+ self.report.view_extraction_total_time_seconds
991
+ / self.report.num_views_processed
803
992
  )
804
- ],
993
+
994
+ def _loop_views_with_connection_pool(
995
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
996
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
997
+ """
998
+ Process views using individual database connections per thread for true parallelization.
999
+
1000
+ Each thread gets its own connection from a QueuePool, enabling true concurrent processing.
1001
+ """
1002
+ if self.config.max_workers == 1:
1003
+ # Single-threaded processing - no need for complexity
1004
+ yield from self._process_views_single_threaded(
1005
+ view_names, schema, sql_config
1006
+ )
1007
+ return
1008
+
1009
+ logger.info(
1010
+ f"Processing {len(view_names)} views with {self.config.max_workers} worker threads"
805
1011
  )
806
- yield from super().loop_views(inspector, schema, sql_config)
807
1012
 
808
- def cache_tables_and_views(self) -> None:
1013
+ # Get or create reusable pooled engine
1014
+ engine = self._get_or_create_pooled_engine()
1015
+
1016
+ try:
1017
+ # Thread-safe result collection
1018
+ report_lock = Lock()
1019
+
1020
+ def process_single_view(
1021
+ view_name: str,
1022
+ ) -> List[Union[MetadataWorkUnit, Any]]:
1023
+ """Process a single view with its own database connection."""
1024
+ results: List[Union[MetadataWorkUnit, Any]] = []
1025
+
1026
+ # Detailed timing measurements for bottleneck analysis
1027
+ timings = {
1028
+ "connection_acquire": 0.0,
1029
+ "view_processing": 0.0,
1030
+ "work_unit_generation": 0.0,
1031
+ "total": 0.0,
1032
+ }
1033
+
1034
+ total_start = time.time()
1035
+ try:
1036
+ # Measure connection acquisition time
1037
+ conn_start = time.time()
1038
+ with engine.connect() as conn:
1039
+ timings["connection_acquire"] = time.time() - conn_start
1040
+
1041
+ # Update connection pool metrics
1042
+ with report_lock:
1043
+ pool_wait_time = timings["connection_acquire"]
1044
+ self.report.connection_pool_wait_time_seconds += (
1045
+ pool_wait_time
1046
+ )
1047
+ if (
1048
+ pool_wait_time
1049
+ > self.report.connection_pool_max_wait_time_seconds
1050
+ ):
1051
+ self.report.connection_pool_max_wait_time_seconds = (
1052
+ pool_wait_time
1053
+ )
1054
+
1055
+ # Measure view processing setup
1056
+ processing_start = time.time()
1057
+ thread_inspector = inspect(conn)
1058
+ # Inherit database information for Teradata two-tier architecture
1059
+ thread_inspector._datahub_database = schema # type: ignore
1060
+
1061
+ dataset_name = self.get_identifier(
1062
+ schema=schema, entity=view_name, inspector=thread_inspector
1063
+ )
1064
+
1065
+ # Thread-safe reporting
1066
+ with report_lock:
1067
+ self.report.report_entity_scanned(
1068
+ dataset_name, ent_type="view"
1069
+ )
1070
+
1071
+ if not sql_config.view_pattern.allowed(dataset_name):
1072
+ with report_lock:
1073
+ self.report.report_dropped(dataset_name)
1074
+ return results
1075
+
1076
+ timings["view_processing"] = time.time() - processing_start
1077
+
1078
+ # Measure work unit generation
1079
+ wu_start = time.time()
1080
+ for work_unit in self._process_view(
1081
+ dataset_name=dataset_name,
1082
+ inspector=thread_inspector,
1083
+ schema=schema,
1084
+ view=view_name,
1085
+ sql_config=sql_config,
1086
+ ):
1087
+ results.append(work_unit)
1088
+ timings["work_unit_generation"] = time.time() - wu_start
1089
+
1090
+ # Track individual view timing
1091
+ timings["total"] = time.time() - total_start
1092
+
1093
+ with report_lock:
1094
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1095
+ timings["total"]
1096
+ )
1097
+
1098
+ except Exception as e:
1099
+ with report_lock:
1100
+ self.report.num_view_processing_failures += 1
1101
+ # Log full exception details for debugging
1102
+ import traceback
1103
+
1104
+ full_traceback = traceback.format_exc()
1105
+ logger.error(
1106
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1107
+ )
1108
+ logger.error(f"Full traceback: {full_traceback}")
1109
+ self.report.warning(
1110
+ f"Error processing view {schema}.{view_name}",
1111
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1112
+ exc=e,
1113
+ )
1114
+
1115
+ return results
1116
+
1117
+ # Use ThreadPoolExecutor for concurrent processing
1118
+ with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
1119
+ # Submit all view processing tasks
1120
+ future_to_view = {
1121
+ executor.submit(process_single_view, view_name): view_name
1122
+ for view_name in view_names
1123
+ }
1124
+
1125
+ # Process completed tasks as they finish
1126
+ for future in as_completed(future_to_view):
1127
+ view_name = future_to_view[future]
1128
+ try:
1129
+ results = future.result()
1130
+ # Yield all results from this view
1131
+ for result in results:
1132
+ yield result
1133
+ except Exception as e:
1134
+ with report_lock:
1135
+ self.report.warning(
1136
+ "Error in thread processing view",
1137
+ context=f"{schema}.{view_name}",
1138
+ exc=e,
1139
+ )
1140
+
1141
+ finally:
1142
+ # Don't dispose the reusable engine here - it will be cleaned up in close()
1143
+ pass
1144
+
1145
+ def _process_views_single_threaded(
1146
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
1147
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
1148
+ """Process views sequentially with a single connection."""
809
1149
  engine = self.get_metadata_engine()
810
- for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
811
- table = TeradataTable(
812
- database=entry.DataBaseName.strip(),
813
- name=entry.name.strip(),
814
- description=entry.description.strip() if entry.description else None,
815
- object_type=entry.object_type,
816
- create_timestamp=entry.CreateTimeStamp,
817
- last_alter_name=entry.LastAlterName,
818
- last_alter_timestamp=entry.LastAlterTimeStamp,
819
- request_text=(
820
- entry.RequestText.strip()
821
- if entry.object_type == "View" and entry.RequestText
822
- else None
823
- ),
1150
+
1151
+ try:
1152
+ with engine.connect() as conn:
1153
+ inspector = inspect(conn)
1154
+
1155
+ for view_name in view_names:
1156
+ view_start_time = time.time()
1157
+ try:
1158
+ dataset_name = self.get_identifier(
1159
+ schema=schema, entity=view_name, inspector=inspector
1160
+ )
1161
+
1162
+ self.report.report_entity_scanned(dataset_name, ent_type="view")
1163
+
1164
+ if not sql_config.view_pattern.allowed(dataset_name):
1165
+ self.report.report_dropped(dataset_name)
1166
+ continue
1167
+
1168
+ # Process the view and yield results
1169
+ for work_unit in self._process_view(
1170
+ dataset_name=dataset_name,
1171
+ inspector=inspector,
1172
+ schema=schema,
1173
+ view=view_name,
1174
+ sql_config=sql_config,
1175
+ ):
1176
+ yield work_unit
1177
+
1178
+ # Track individual view timing
1179
+ view_end_time = time.time()
1180
+ view_processing_time = view_end_time - view_start_time
1181
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1182
+ view_processing_time
1183
+ )
1184
+
1185
+ except Exception as e:
1186
+ # Log full exception details for debugging
1187
+ import traceback
1188
+
1189
+ full_traceback = traceback.format_exc()
1190
+ logger.error(
1191
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1192
+ )
1193
+ logger.error(f"Full traceback: {full_traceback}")
1194
+ self.report.warning(
1195
+ f"Error processing view {schema}.{view_name}",
1196
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1197
+ exc=e,
1198
+ )
1199
+
1200
+ finally:
1201
+ engine.dispose()
1202
+
1203
+ def _get_or_create_pooled_engine(self) -> Engine:
1204
+ """Get or create a reusable SQLAlchemy engine with QueuePool for concurrent connections."""
1205
+ with self._pooled_engine_lock:
1206
+ if self._pooled_engine is None:
1207
+ url = self.config.get_sql_alchemy_url()
1208
+
1209
+ # Optimal connection pool sizing to match max_workers exactly
1210
+ # Teradata driver can be sensitive to high connection counts, so cap at reasonable limit
1211
+ max_safe_connections = (
1212
+ 13 # Conservative limit: 8 base + 5 overflow for Teradata stability
1213
+ )
1214
+
1215
+ # Adjust max_workers to match available connection pool capacity
1216
+ effective_max_workers = min(
1217
+ self.config.max_workers, max_safe_connections
1218
+ )
1219
+
1220
+ # Set pool size to match effective workers for optimal performance
1221
+ base_connections = min(
1222
+ effective_max_workers, 8
1223
+ ) # Reasonable base connections
1224
+ max_overflow = (
1225
+ effective_max_workers - base_connections
1226
+ ) # Remaining as overflow
1227
+
1228
+ # Log adjustment if max_workers was reduced
1229
+ if effective_max_workers < self.config.max_workers:
1230
+ logger.warning(
1231
+ f"Reduced max_workers from {self.config.max_workers} to {effective_max_workers} to match Teradata connection pool capacity"
1232
+ )
1233
+
1234
+ # Update the config to reflect the effective value used
1235
+ self.config.max_workers = effective_max_workers
1236
+
1237
+ pool_options = {
1238
+ **self.config.options,
1239
+ "poolclass": QueuePool,
1240
+ "pool_size": base_connections,
1241
+ "max_overflow": max_overflow,
1242
+ "pool_pre_ping": True, # Validate connections
1243
+ "pool_recycle": 1800, # Recycle connections after 30 mins (more frequent)
1244
+ "pool_timeout": 60, # Longer timeout for connection acquisition
1245
+ "pool_reset_on_return": "rollback", # Explicit rollback on connection return
1246
+ }
1247
+
1248
+ # Add Teradata-specific connection options for stability
1249
+ if "connect_args" not in pool_options:
1250
+ pool_options["connect_args"] = {}
1251
+
1252
+ # Teradata-specific connection arguments for better stability
1253
+ pool_options["connect_args"].update(
1254
+ {
1255
+ "connect_timeout": "30000", # Connection timeout in ms (30 seconds)
1256
+ "request_timeout": "120000", # Request timeout in ms (2 minutes)
1257
+ }
1258
+ )
1259
+
1260
+ self._pooled_engine = create_engine(url, **pool_options)
1261
+ logger.info(
1262
+ f"Created optimized Teradata connection pool: {base_connections} base + {max_overflow} overflow = {base_connections + max_overflow} max connections (matching {effective_max_workers} workers)"
1263
+ )
1264
+
1265
+ return self._pooled_engine
1266
+
1267
+ def cache_tables_and_views(self) -> None:
1268
+ with self.report.new_stage("Cache tables and views"):
1269
+ engine = self.get_metadata_engine()
1270
+ try:
1271
+ database_counts: Dict[str, Dict[str, int]] = defaultdict(
1272
+ lambda: {"tables": 0, "views": 0}
1273
+ )
1274
+
1275
+ for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
1276
+ table = TeradataTable(
1277
+ database=entry.DataBaseName.strip(),
1278
+ name=entry.name.strip(),
1279
+ description=entry.description.strip()
1280
+ if entry.description
1281
+ else None,
1282
+ object_type=entry.object_type,
1283
+ create_timestamp=entry.CreateTimeStamp,
1284
+ last_alter_name=entry.LastAlterName,
1285
+ last_alter_timestamp=entry.LastAlterTimeStamp,
1286
+ request_text=(
1287
+ entry.RequestText.strip()
1288
+ if entry.object_type == "View" and entry.RequestText
1289
+ else None
1290
+ ),
1291
+ )
1292
+
1293
+ # Count objects per database for metrics
1294
+ if table.object_type == "View":
1295
+ database_counts[table.database]["views"] += 1
1296
+ else:
1297
+ database_counts[table.database]["tables"] += 1
1298
+
1299
+ with self._tables_cache_lock:
1300
+ if table.database not in self._tables_cache:
1301
+ self._tables_cache[table.database] = []
1302
+ self._tables_cache[table.database].append(table)
1303
+
1304
+ for database, counts in database_counts.items():
1305
+ self.report.num_database_tables_to_scan[database] = counts["tables"]
1306
+ self.report.num_database_views_to_scan[database] = counts["views"]
1307
+
1308
+ finally:
1309
+ engine.dispose()
1310
+
1311
+ def _reconstruct_queries_streaming(
1312
+ self, entries: Iterable[Any]
1313
+ ) -> Iterable[ObservedQuery]:
1314
+ """Reconstruct complete queries from database entries in streaming fashion.
1315
+
1316
+ This method processes entries in order and reconstructs multi-row queries
1317
+ by concatenating rows with the same query_id.
1318
+ """
1319
+ current_query_id = None
1320
+ current_query_parts = []
1321
+ current_query_metadata = None
1322
+
1323
+ for entry in entries:
1324
+ query_id = getattr(entry, "query_id", None)
1325
+ query_text = str(getattr(entry, "query_text", ""))
1326
+
1327
+ if query_id != current_query_id:
1328
+ # New query started - yield the previous one if it exists
1329
+ if current_query_id is not None and current_query_parts:
1330
+ yield self._create_observed_query_from_parts(
1331
+ current_query_parts, current_query_metadata
1332
+ )
1333
+
1334
+ # Start new query
1335
+ current_query_id = query_id
1336
+ current_query_parts = [query_text] if query_text else []
1337
+ current_query_metadata = entry
1338
+ else:
1339
+ # Same query - append the text
1340
+ if query_text:
1341
+ current_query_parts.append(query_text)
1342
+
1343
+ # Yield the last query if it exists
1344
+ if current_query_id is not None and current_query_parts:
1345
+ yield self._create_observed_query_from_parts(
1346
+ current_query_parts, current_query_metadata
824
1347
  )
825
- if table.database not in self._tables_cache:
826
- self._tables_cache[table.database] = []
827
1348
 
828
- self._tables_cache[table.database].append(table)
1349
+ def _create_observed_query_from_parts(
1350
+ self, query_parts: List[str], metadata_entry: Any
1351
+ ) -> ObservedQuery:
1352
+ """Create ObservedQuery from reconstructed query parts and metadata."""
1353
+ # Join all parts to form the complete query
1354
+ # Teradata fragments are split at fixed lengths without artificial breaks
1355
+ full_query_text = "".join(query_parts)
1356
+
1357
+ # Extract metadata
1358
+ session_id = getattr(metadata_entry, "session_id", None)
1359
+ timestamp = getattr(metadata_entry, "timestamp", None)
1360
+ user = getattr(metadata_entry, "user", None)
1361
+ default_database = getattr(metadata_entry, "default_database", None)
1362
+
1363
+ # Apply Teradata-specific query transformations
1364
+ cleaned_query = full_query_text.replace("(NOT CASESPECIFIC)", "")
1365
+
1366
+ return ObservedQuery(
1367
+ query=cleaned_query,
1368
+ session_id=session_id,
1369
+ timestamp=timestamp,
1370
+ user=CorpUserUrn(user) if user else None,
1371
+ default_db=default_database,
1372
+ default_schema=default_database, # Teradata uses database as schema
1373
+ )
1374
+
1375
+ def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
1376
+ """Convert database query entry to ObservedQuery for SqlParsingAggregator.
1377
+
1378
+ DEPRECATED: This method is deprecated in favor of _reconstruct_queries_streaming
1379
+ which properly handles multi-row queries. This method does not handle queries
1380
+ that span multiple rows correctly and should not be used.
1381
+ """
1382
+ # Extract fields from database result
1383
+ query_text = str(entry.query_text).strip()
1384
+ session_id = getattr(entry, "session_id", None)
1385
+ timestamp = getattr(entry, "timestamp", None)
1386
+ user = getattr(entry, "user", None)
1387
+ default_database = getattr(entry, "default_database", None)
1388
+
1389
+ # Apply Teradata-specific query transformations
1390
+ cleaned_query = query_text.replace("(NOT CASESPECIFIC)", "")
1391
+
1392
+ return ObservedQuery(
1393
+ query=cleaned_query,
1394
+ session_id=session_id,
1395
+ timestamp=timestamp,
1396
+ user=CorpUserUrn(user) if user else None,
1397
+ default_db=default_database,
1398
+ default_schema=default_database, # Teradata uses database as schema
1399
+ )
1400
+
1401
+ def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
1402
+ """Fetch lineage entries using server-side cursor to handle large result sets efficiently."""
1403
+ queries = self._make_lineage_queries()
1404
+
1405
+ fetch_engine = self.get_metadata_engine()
1406
+ try:
1407
+ with fetch_engine.connect() as conn:
1408
+ cursor_type = (
1409
+ "server-side"
1410
+ if self.config.use_server_side_cursors
1411
+ else "client-side"
1412
+ )
1413
+
1414
+ total_count_all_queries = 0
1415
+
1416
+ for query_index, query in enumerate(queries, 1):
1417
+ logger.info(
1418
+ f"Executing lineage query {query_index}/{len(queries)} with {cursor_type} cursor..."
1419
+ )
1420
+
1421
+ # Use helper method to try server-side cursor with fallback
1422
+ result = self._execute_with_cursor_fallback(conn, query)
1423
+
1424
+ # Stream results in batches to avoid memory issues
1425
+ batch_size = 5000
1426
+ batch_count = 0
1427
+ query_total_count = 0
1428
+
1429
+ while True:
1430
+ # Fetch a batch of rows
1431
+ batch = result.fetchmany(batch_size)
1432
+ if not batch:
1433
+ break
1434
+
1435
+ batch_count += 1
1436
+ query_total_count += len(batch)
1437
+ total_count_all_queries += len(batch)
1438
+
1439
+ logger.info(
1440
+ f"Query {query_index} - Fetched batch {batch_count}: {len(batch)} lineage entries (query total: {query_total_count})"
1441
+ )
1442
+ yield from batch
1443
+
1444
+ logger.info(
1445
+ f"Completed query {query_index}: {query_total_count} lineage entries in {batch_count} batches"
1446
+ )
1447
+
1448
+ logger.info(
1449
+ f"Completed fetching all queries: {total_count_all_queries} total lineage entries from {len(queries)} queries"
1450
+ )
829
1451
 
830
- def get_audit_log_mcps(self, urns: Set[str]) -> Iterable[MetadataWorkUnit]:
1452
+ except Exception as e:
1453
+ logger.error(f"Error fetching lineage entries: {e}")
1454
+ raise
1455
+ finally:
1456
+ fetch_engine.dispose()
1457
+
1458
+ def _check_historical_table_exists(self) -> bool:
1459
+ """
1460
+ Check if the PDCRINFO.DBQLSqlTbl_Hst table exists and is accessible.
1461
+ DBQL rows are periodically moved to the history table, so older audit queries may no longer exist in DBC.
1462
+ There is no guarantee that the historical table exists, so we need to check it.
1463
+
1464
+ Returns:
1465
+ bool: True if the historical table exists and is accessible, False otherwise.
1466
+ """
831
1467
  engine = self.get_metadata_engine()
832
- for entry in engine.execute(self._make_lineage_query()):
833
- self.report.num_queries_parsed += 1
834
- if self.report.num_queries_parsed % 1000 == 0:
835
- logger.info(f"Parsed {self.report.num_queries_parsed} queries")
836
-
837
- yield from self.gen_lineage_from_query(
838
- query=entry.query_text,
839
- default_database=entry.default_database,
840
- timestamp=entry.timestamp,
841
- user=entry.user,
842
- urns=urns,
1468
+ try:
1469
+ # Use a simple query to check if the table exists and is accessible
1470
+ check_query = """
1471
+ SELECT TOP 1 QueryID
1472
+ FROM PDCRINFO.DBQLSqlTbl_Hst
1473
+ WHERE 1=0
1474
+ """
1475
+ with engine.connect() as conn:
1476
+ conn.execute(text(check_query))
1477
+ logger.info(
1478
+ "Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is available"
1479
+ )
1480
+ return True
1481
+ except Exception as e:
1482
+ logger.info(
1483
+ f"Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is not available: {e}"
843
1484
  )
1485
+ return False
1486
+ finally:
1487
+ engine.dispose()
844
1488
 
845
- def _make_lineage_query(self) -> str:
1489
+ def _make_lineage_queries(self) -> List[str]:
846
1490
  databases_filter = (
847
1491
  ""
848
1492
  if not self.config.databases
849
- else "and default_database in ({databases})".format(
1493
+ else "and l.DefaultDatabase in ({databases})".format(
850
1494
  databases=",".join([f"'{db}'" for db in self.config.databases])
851
1495
  )
852
1496
  )
853
1497
 
854
- query = self.QUERY_TEXT_QUERY.format(
855
- start_time=self.config.start_time,
856
- end_time=self.config.end_time,
857
- databases_filter=databases_filter,
858
- )
859
- return query
1498
+ queries = []
860
1499
 
861
- def gen_lineage_from_query(
862
- self,
863
- query: str,
864
- default_database: Optional[str] = None,
865
- timestamp: Optional[datetime] = None,
866
- user: Optional[str] = None,
867
- view_urn: Optional[str] = None,
868
- urns: Optional[Set[str]] = None,
869
- ) -> Iterable[MetadataWorkUnit]:
870
- result = sqlglot_lineage(
871
- # With this clever hack we can make the query parser to not fail on queries with CASESPECIFIC
872
- sql=query.replace("(NOT CASESPECIFIC)", ""),
873
- schema_resolver=self.schema_resolver,
874
- default_db=None,
875
- default_schema=(
876
- default_database if default_database else self.config.default_db
877
- ),
878
- )
879
- if result.debug_info.table_error:
880
- logger.debug(
881
- f"Error parsing table lineage ({view_urn}):\n{result.debug_info.table_error}"
1500
+ # Check if historical lineage is configured and available
1501
+ if (
1502
+ self.config.include_historical_lineage
1503
+ and self._check_historical_table_exists()
1504
+ ):
1505
+ logger.info(
1506
+ "Using UNION query to combine historical and current lineage data to avoid duplicates"
882
1507
  )
883
- self.report.num_table_parse_failures += 1
1508
+ # For historical query, we need the database filter for historical part
1509
+ databases_filter_history = (
1510
+ databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase")
1511
+ if databases_filter
1512
+ else ""
1513
+ )
1514
+
1515
+ union_query = self.QUERY_TEXT_HISTORICAL_UNION.format(
1516
+ start_time=self.config.start_time,
1517
+ end_time=self.config.end_time,
1518
+ databases_filter=databases_filter,
1519
+ databases_filter_history=databases_filter_history,
1520
+ )
1521
+ queries.append(union_query)
884
1522
  else:
885
- yield from self.builder.process_sql_parsing_result(
886
- result,
887
- query=query,
888
- is_view_ddl=view_urn is not None,
889
- query_timestamp=timestamp,
890
- user=f"urn:li:corpuser:{user}",
891
- include_urns=urns,
1523
+ if self.config.include_historical_lineage:
1524
+ logger.warning(
1525
+ "Historical lineage was requested but PDCRINFO.DBQLSqlTbl_Hst table is not available. Falling back to current data only."
1526
+ )
1527
+
1528
+ # Use current-only query when historical data is not available
1529
+ current_query = self.QUERY_TEXT_CURRENT_QUERIES.format(
1530
+ start_time=self.config.start_time,
1531
+ end_time=self.config.end_time,
1532
+ databases_filter=databases_filter,
892
1533
  )
1534
+ queries.append(current_query)
1535
+
1536
+ return queries
893
1537
 
894
1538
  def get_metadata_engine(self) -> Engine:
895
1539
  url = self.config.get_sql_alchemy_url()
896
1540
  logger.debug(f"sql_alchemy_url={url}")
897
1541
  return create_engine(url, **self.config.options)
898
1542
 
899
- def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
1543
+ def _execute_with_cursor_fallback(
1544
+ self, connection: Connection, query: str, params: Optional[Dict] = None
1545
+ ) -> Any:
1546
+ """
1547
+ Execute query with server-side cursor if enabled and supported, otherwise fall back to regular execution.
1548
+
1549
+ Args:
1550
+ connection: Database connection
1551
+ query: SQL query to execute
1552
+ params: Query parameters
1553
+
1554
+ Returns:
1555
+ Query result object
1556
+ """
1557
+ if self.config.use_server_side_cursors:
1558
+ try:
1559
+ # Try server-side cursor first
1560
+ if params:
1561
+ result = connection.execution_options(stream_results=True).execute(
1562
+ text(query), params
1563
+ )
1564
+ else:
1565
+ result = connection.execution_options(stream_results=True).execute(
1566
+ text(query)
1567
+ )
1568
+
1569
+ logger.debug(
1570
+ "Successfully using server-side cursor for query execution"
1571
+ )
1572
+ return result
1573
+
1574
+ except Exception as e:
1575
+ logger.warning(
1576
+ f"Server-side cursor failed, falling back to client-side execution: {e}"
1577
+ )
1578
+ # Fall through to regular execution
1579
+
1580
+ # Regular execution (client-side)
1581
+ if params:
1582
+ return connection.execute(text(query), params)
1583
+ else:
1584
+ return connection.execute(text(query))
1585
+
1586
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
1587
+ logger.info("Starting Teradata metadata extraction")
1588
+
900
1589
  # Add all schemas to the schema resolver
901
1590
  # Sql parser operates on lowercase urns so we need to lowercase the urns
902
- for wu in auto_lowercase_urns(super().get_workunits_internal()):
903
- urn = wu.get_urn()
904
- schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
905
- if schema_metadata:
906
- self.schema_resolver.add_schema_metadata(urn, schema_metadata)
907
- yield wu
908
-
909
- urns = self.schema_resolver.get_urns()
1591
+ with self.report.new_stage("Schema metadata extraction"):
1592
+ yield from super().get_workunits_internal()
1593
+ logger.info("Completed schema metadata extraction")
1594
+
1595
+ with self.report.new_stage("Audit log extraction"):
1596
+ yield from self._get_audit_log_mcps_with_aggregator()
1597
+
1598
+ # SqlParsingAggregator handles its own work unit generation internally
1599
+ logger.info("Lineage processing completed by SqlParsingAggregator")
1600
+
1601
+ def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
1602
+ """Override base class to skip aggregator gen_metadata() call.
1603
+
1604
+ Teradata handles aggregator processing after adding audit log queries,
1605
+ so we skip the base class call to prevent duplicate processing.
1606
+ """
1607
+ # Return empty iterator - Teradata will handle aggregator processing
1608
+ # after adding audit log queries in _get_audit_log_mcps_with_aggregator()
1609
+ return iter([])
1610
+
1611
+ def _get_audit_log_mcps_with_aggregator(self) -> Iterable[MetadataWorkUnit]:
1612
+ """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
1613
+ logger.info(
1614
+ "Fetching queries from Teradata audit logs for SqlParsingAggregator"
1615
+ )
1616
+
910
1617
  if self.config.include_table_lineage or self.config.include_usage_statistics:
911
- with self.report.new_stage("Audit log extraction"):
912
- yield from self.get_audit_log_mcps(urns=urns)
1618
+ # Step 1: Stream query entries from database with memory-efficient processing
1619
+ with self.report.new_stage("Fetching lineage entries from Audit Logs"):
1620
+ queries_processed = 0
1621
+ entries_processed = False
1622
+
1623
+ # Use streaming query reconstruction for memory efficiency
1624
+ for observed_query in self._reconstruct_queries_streaming(
1625
+ self._fetch_lineage_entries_chunked()
1626
+ ):
1627
+ entries_processed = True
1628
+ self.aggregator.add(observed_query)
1629
+
1630
+ queries_processed += 1
1631
+ if queries_processed % 10000 == 0:
1632
+ logger.info(
1633
+ f"Processed {queries_processed} queries to aggregator"
1634
+ )
1635
+
1636
+ if not entries_processed:
1637
+ logger.info("No lineage entries found")
1638
+ return
1639
+
1640
+ logger.info(
1641
+ f"Completed adding {queries_processed} queries to SqlParsingAggregator"
1642
+ )
1643
+
1644
+ # Step 2: Generate work units from aggregator
1645
+ with self.report.new_stage("SqlParsingAggregator metadata generation"):
1646
+ logger.info("Generating metadata work units from SqlParsingAggregator")
1647
+ work_unit_count = 0
1648
+ for mcp in self.aggregator.gen_metadata():
1649
+ work_unit_count += 1
1650
+ if work_unit_count % 10000 == 0:
1651
+ logger.info(
1652
+ f"Generated {work_unit_count} work units from aggregator"
1653
+ )
1654
+ yield mcp.as_workunit()
1655
+
1656
+ logger.info(
1657
+ f"Completed SqlParsingAggregator processing: {work_unit_count} work units generated"
1658
+ )
1659
+
1660
+ def close(self) -> None:
1661
+ """Clean up resources when source is closed."""
1662
+ logger.info("Closing SqlParsingAggregator")
1663
+ self.aggregator.close()
1664
+
1665
+ # Clean up pooled engine
1666
+ with self._pooled_engine_lock:
1667
+ if self._pooled_engine is not None:
1668
+ logger.info("Disposing pooled engine")
1669
+ self._pooled_engine.dispose()
1670
+ self._pooled_engine = None
913
1671
 
914
- yield from self.builder.gen_workunits()
1672
+ # Report failed views summary
1673
+ super().close()
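For orientation, the per-view threading added in this version boils down to the pattern below: one QueuePool-backed engine shared by all workers, one connection checked out per task, and results gathered with as_completed. This is a simplified sketch of the approach rather than the package code; the DSN, pool sizes, and the work done per view are placeholders.

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import List

    from sqlalchemy import create_engine, inspect
    from sqlalchemy.pool import QueuePool

    engine = create_engine(
        "teradatasql://user:password@host",  # placeholder DSN
        poolclass=QueuePool,
        pool_size=8,        # base connections, mirroring the "base + overflow" sizing above
        max_overflow=5,
        pool_pre_ping=True,
    )

    def process_view(view_name: str) -> List[str]:
        # Each task checks out its own connection, so views are inspected concurrently.
        with engine.connect() as conn:
            inspector = inspect(conn)
            return [f"{view_name}: {len(inspector.get_columns(view_name))} columns"]

    view_names = ["orders_view", "daily_sales_view"]
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(process_view, name): name for name in view_names}
        for future in as_completed(futures):
            for line in future.result():
                print(line)

    engine.dispose()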