acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (226) hide show
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,11 @@
1
1
  import logging
2
+ import time
2
3
  from collections import defaultdict
3
- from dataclasses import dataclass
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from dataclasses import dataclass, field
4
6
  from datetime import datetime
5
7
  from functools import lru_cache
8
+ from threading import Lock
6
9
  from typing import (
7
10
  Any,
8
11
  Dict,
@@ -10,7 +13,6 @@ from typing import (
10
13
  List,
11
14
  MutableMapping,
12
15
  Optional,
13
- Set,
14
16
  Tuple,
15
17
  Union,
16
18
  )
@@ -29,7 +31,6 @@ from teradatasqlalchemy.options import configure
29
31
 
30
32
  from datahub.configuration.common import AllowDenyPattern
31
33
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
32
- from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
33
34
  from datahub.ingestion.api.common import PipelineContext
34
35
  from datahub.ingestion.api.decorators import (
35
36
  SourceCapability,
@@ -39,10 +40,9 @@ from datahub.ingestion.api.decorators import (
39
40
  platform_name,
40
41
  support_status,
41
42
  )
42
- from datahub.ingestion.api.source_helpers import auto_lowercase_urns
43
43
  from datahub.ingestion.api.workunit import MetadataWorkUnit
44
44
  from datahub.ingestion.graph.client import DataHubGraph
45
- from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
45
+ from datahub.ingestion.source.sql.sql_common import register_custom_type
46
46
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
47
47
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
48
48
  from datahub.ingestion.source.sql.two_tier_sql_source import (
@@ -56,13 +56,64 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
56
56
  BytesTypeClass,
57
57
  TimeTypeClass,
58
58
  )
59
- from datahub.metadata.schema_classes import SchemaMetadataClass
59
+ from datahub.metadata.urns import CorpUserUrn
60
60
  from datahub.sql_parsing.schema_resolver import SchemaResolver
61
- from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
61
+ from datahub.sql_parsing.sql_parsing_aggregator import (
62
+ ObservedQuery,
63
+ SqlParsingAggregator,
64
+ )
62
65
  from datahub.utilities.groupby import groupby_unsorted
66
+ from datahub.utilities.stats_collections import TopKDict
63
67
 
64
68
  logger: logging.Logger = logging.getLogger(__name__)
65
69
 
70
+ # Common excluded databases used in multiple places
71
+ EXCLUDED_DATABASES = [
72
+ "All",
73
+ "Crashdumps",
74
+ "Default",
75
+ "DemoNow_Monitor",
76
+ "EXTUSER",
77
+ "External_AP",
78
+ "GLOBAL_FUNCTIONS",
79
+ "LockLogShredder",
80
+ "PUBLIC",
81
+ "SQLJ",
82
+ "SYSBAR",
83
+ "SYSJDBC",
84
+ "SYSLIB",
85
+ "SYSSPATIAL",
86
+ "SYSUDTLIB",
87
+ "SYSUIF",
88
+ "SysAdmin",
89
+ "Sys_Calendar",
90
+ "SystemFe",
91
+ "TDBCMgmt",
92
+ "TDMaps",
93
+ "TDPUSER",
94
+ "TDQCD",
95
+ "TDStats",
96
+ "TD_ANALYTICS_DB",
97
+ "TD_SERVER_DB",
98
+ "TD_SYSFNLIB",
99
+ "TD_SYSGPL",
100
+ "TD_SYSXML",
101
+ "TDaaS_BAR",
102
+ "TDaaS_DB",
103
+ "TDaaS_Maint",
104
+ "TDaaS_Monitor",
105
+ "TDaaS_Support",
106
+ "TDaaS_TDBCMgmt1",
107
+ "TDaaS_TDBCMgmt2",
108
+ "dbcmngr",
109
+ "mldb",
110
+ "system",
111
+ "tapidb",
112
+ "tdwm",
113
+ "val",
114
+ "dbc",
115
+ ]
116
+
66
117
  register_custom_type(custom_types.JSON, BytesTypeClass)
67
118
  register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass)
68
119
  register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass)
@@ -99,14 +150,16 @@ class TeradataTable:
99
150
  request_text: Optional[str]
100
151
 
101
152
 
102
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
153
+ # Cache size of 1 is sufficient since schemas are processed sequentially
154
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
103
155
  @lru_cache(maxsize=1)
104
156
  def get_schema_columns(
105
157
  self: Any, connection: Connection, dbc_columns: str, schema: str
106
158
  ) -> Dict[str, List[Any]]:
159
+ start_time = time.time()
107
160
  columns: Dict[str, List[Any]] = {}
108
- columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) order by TableName, ColumnId"
109
- rows = connection.execute(text(columns_query)).fetchall()
161
+ columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) order by TableName, ColumnId"
162
+ rows = connection.execute(text(columns_query), {"schema": schema}).fetchall()
110
163
  for row in rows:
111
164
  row_mapping = row._mapping
112
165
  if row_mapping.TableName not in columns:
@@ -114,18 +167,29 @@ def get_schema_columns(
114
167
 
115
168
  columns[row_mapping.TableName].append(row_mapping)
116
169
 
170
+ end_time = time.time()
171
+ extraction_time = end_time - start_time
172
+ logger.info(
173
+ f"Column extraction for schema '{schema}' completed in {extraction_time:.2f} seconds"
174
+ )
175
+
176
+ # Update report if available
177
+ if hasattr(self, "report"):
178
+ self.report.column_extraction_duration_seconds += extraction_time
179
+
117
180
  return columns
118
181
 
119
182
 
120
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
183
+ # Cache size of 1 is sufficient since schemas are processed sequentially
184
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
121
185
  @lru_cache(maxsize=1)
122
186
  def get_schema_pk_constraints(
123
187
  self: Any, connection: Connection, schema: str
124
188
  ) -> Dict[str, List[Any]]:
125
189
  dbc_indices = "IndicesV" + "X" if configure.usexviews else "IndicesV"
126
190
  primary_keys: Dict[str, List[Any]] = {}
127
- stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
128
- rows = connection.execute(text(stmt)).fetchall()
191
+ stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
192
+ rows = connection.execute(text(stmt), {"schema": schema}).fetchall()
129
193
  for row in rows:
130
194
  row_mapping = row._mapping
131
195
  if row_mapping.TableName not in primary_keys:
@@ -172,6 +236,10 @@ def optimized_get_pk_constraint(
172
236
  index_column.IndexName
173
237
  ) # There should be just one IndexName
174
238
 
239
+ # Update counter if available
240
+ if hasattr(self, "report"):
241
+ self.report.num_primary_keys_processed += 1
242
+
175
243
  return {"constrained_columns": index_columns, "name": index_name}
176
244
 
177
245
 
@@ -228,23 +296,55 @@ def optimized_get_columns(
228
296
  table_name, []
229
297
  )
230
298
 
299
+ start_time = time.time()
300
+
231
301
  final_column_info = []
232
302
  # Don't care about ART tables now
233
303
  # Ignore the non-functional column in a PTI table
234
304
  for row in res:
235
- col_info = self._get_column_info(row)
236
- if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
237
- if (
238
- col_info["ColumnName"] == "TD_TIMEBUCKET"
239
- and col_info["TSColumnType"].strip() == "TB"
305
+ try:
306
+ col_info = self._get_column_info(row)
307
+
308
+ # Add CommentString as comment field for column description
309
+ if hasattr(row, "CommentString") and row.CommentString:
310
+ col_info["comment"] = row.CommentString.strip()
311
+ elif (
312
+ isinstance(row, dict)
313
+ and "CommentString" in row
314
+ and row["CommentString"]
240
315
  ):
241
- continue
242
- final_column_info.append(col_info)
316
+ col_info["comment"] = row["CommentString"].strip()
317
+
318
+ if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
319
+ if (
320
+ col_info["ColumnName"] == "TD_TIMEBUCKET"
321
+ and col_info["TSColumnType"].strip() == "TB"
322
+ ):
323
+ continue
324
+ final_column_info.append(col_info)
325
+
326
+ # Update counter - access report through self from the connection context
327
+ if hasattr(self, "report"):
328
+ self.report.num_columns_processed += 1
329
+
330
+ except Exception as e:
331
+ logger.error(
332
+ f"Failed to process column {getattr(row, 'ColumnName', 'unknown')}: {e}"
333
+ )
334
+ if hasattr(self, "report"):
335
+ self.report.num_column_extraction_failures += 1
336
+ continue
337
+
338
+ # Update timing
339
+ if hasattr(self, "report"):
340
+ end_time = time.time()
341
+ self.report.column_extraction_duration_seconds += end_time - start_time
243
342
 
244
343
  return final_column_info
245
344
 
246
345
 
247
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
346
+ # Cache size of 1 is sufficient since schemas are processed sequentially
347
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
248
348
  @lru_cache(maxsize=1)
249
349
  def get_schema_foreign_keys(
250
350
  self: Any, connection: Connection, schema: str
@@ -334,9 +434,24 @@ def optimized_get_view_definition(
334
434
 
335
435
  @dataclass
336
436
  class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
337
- num_queries_parsed: int = 0
338
- num_view_ddl_parsed: int = 0
339
- num_table_parse_failures: int = 0
437
+ # View processing metrics (actively used)
438
+ num_views_processed: int = 0
439
+ num_view_processing_failures: int = 0
440
+ view_extraction_total_time_seconds: float = 0.0
441
+ view_extraction_average_time_seconds: float = 0.0
442
+ slowest_view_processing_time_seconds: float = 0.0
443
+ slowest_view_name: TopKDict[str, float] = field(default_factory=TopKDict)
444
+
445
+ # Connection pool performance metrics (actively used)
446
+ connection_pool_wait_time_seconds: float = 0.0
447
+ connection_pool_max_wait_time_seconds: float = 0.0
448
+
449
+ # Database-level metrics similar to BigQuery's approach (actively used)
450
+ num_database_tables_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
451
+ num_database_views_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
452
+
453
+ # Global metadata extraction timing (single query for all databases)
454
+ metadata_extraction_total_sec: float = 0.0
340
455
 
341
456
 
342
457
  class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
@@ -353,53 +468,7 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
353
468
  )
354
469
 
355
470
  database_pattern = Field(
356
- default=AllowDenyPattern(
357
- deny=[
358
- "All",
359
- "Crashdumps",
360
- "Default",
361
- "DemoNow_Monitor",
362
- "EXTUSER",
363
- "External_AP",
364
- "GLOBAL_FUNCTIONS",
365
- "LockLogShredder",
366
- "PUBLIC",
367
- "SQLJ",
368
- "SYSBAR",
369
- "SYSJDBC",
370
- "SYSLIB",
371
- "SYSSPATIAL",
372
- "SYSUDTLIB",
373
- "SYSUIF",
374
- "SysAdmin",
375
- "Sys_Calendar",
376
- "SystemFe",
377
- "TDBCMgmt",
378
- "TDMaps",
379
- "TDPUSER",
380
- "TDQCD",
381
- "TDStats",
382
- "TD_ANALYTICS_DB",
383
- "TD_SERVER_DB",
384
- "TD_SYSFNLIB",
385
- "TD_SYSGPL",
386
- "TD_SYSXML",
387
- "TDaaS_BAR",
388
- "TDaaS_DB",
389
- "TDaaS_Maint",
390
- "TDaaS_Monitor",
391
- "TDaaS_Support",
392
- "TDaaS_TDBCMgmt1",
393
- "TDaaS_TDBCMgmt2",
394
- "dbcmngr",
395
- "mldb",
396
- "system",
397
- "tapidb",
398
- "tdwm",
399
- "val",
400
- "dbc",
401
- ]
402
- ),
471
+ default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
403
472
  description="Regex patterns for databases to filter in ingestion.",
404
473
  )
405
474
  include_table_lineage = Field(
@@ -413,6 +482,13 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
413
482
  description="Whether to include view lineage in the ingestion. "
414
483
  "This requires to have the view lineage feature enabled.",
415
484
  )
485
+
486
+ include_queries = Field(
487
+ default=True,
488
+ description="Whether to generate query entities for SQL queries. "
489
+ "Query entities provide metadata about individual SQL queries including "
490
+ "execution timestamps, user information, and query text.",
491
+ )
416
492
  usage: BaseUsageConfig = Field(
417
493
  description="The usage config to use when generating usage statistics",
418
494
  default=BaseUsageConfig(),
@@ -438,6 +514,26 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
438
514
  description="Whether to use QVCI to get column information. This is faster but requires to have QVCI enabled.",
439
515
  )
440
516
 
517
+ include_historical_lineage: bool = Field(
518
+ default=False,
519
+ description="Whether to include historical lineage data from PDCRINFO.DBQLSqlTbl_Hst in addition to current DBC.QryLogV data. "
520
+ "This provides access to historical query logs that may have been archived. "
521
+ "The historical table existence is checked automatically and gracefully falls back to current data only if not available.",
522
+ )
523
+
524
+ use_server_side_cursors: bool = Field(
525
+ default=True,
526
+ description="Enable server-side cursors for large result sets using SQLAlchemy's stream_results. "
527
+ "This reduces memory usage by streaming results from the database server. "
528
+ "Automatically falls back to client-side batching if server-side cursors are not supported.",
529
+ )
530
+
531
+ max_workers: int = Field(
532
+ default=10,
533
+ description="Maximum number of worker threads to use for parallel processing. "
534
+ "Controls the level of concurrency for operations like view processing.",
535
+ )
536
+
441
537
 
442
538
  @platform_name("Teradata")
443
539
  @config_class(TeradataConfig)
@@ -445,7 +541,10 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
445
541
  @capability(SourceCapability.DOMAINS, "Enabled by default")
446
542
  @capability(SourceCapability.CONTAINERS, "Enabled by default")
447
543
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
448
- @capability(SourceCapability.DELETION_DETECTION, "Optionally enabled via configuration")
544
+ @capability(
545
+ SourceCapability.DELETION_DETECTION,
546
+ "Enabled by default when stateful ingestion is turned on",
547
+ )
449
548
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
450
549
  @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
451
550
  @capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration")
@@ -461,13 +560,7 @@ class TeradataSource(TwoTierSQLAlchemySource):
461
560
 
462
561
  config: TeradataConfig
463
562
 
464
- LINEAGE_QUERY_DATABASE_FILTER: str = """and default_database IN ({databases})"""
465
-
466
- LINEAGE_TIMESTAMP_BOUND_QUERY: str = """
467
- SELECT MIN(CollectTimeStamp) as "min_ts", MAX(CollectTimeStamp) as "max_ts" from DBC.QryLogV
468
- """.strip()
469
-
470
- QUERY_TEXT_QUERY: str = """
563
+ QUERY_TEXT_CURRENT_QUERIES: str = """
471
564
  SELECT
472
565
  s.QueryID as "query_id",
473
566
  UserName as "user",
@@ -500,10 +593,89 @@ class TeradataSource(TwoTierSQLAlchemySource):
500
593
  and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
501
594
  and default_database not in ('DEMONOW_MONITOR')
502
595
  {databases_filter}
503
- ORDER BY "query_id", "row_no"
596
+ ORDER BY "timestamp", "query_id", "row_no"
597
+ """.strip()
598
+
599
+ QUERY_TEXT_HISTORICAL_UNION: str = """
600
+ SELECT
601
+ "query_id",
602
+ "user",
603
+ "timestamp",
604
+ default_database,
605
+ "query_text",
606
+ "row_no"
607
+ FROM (
608
+ SELECT
609
+ h.QueryID as "query_id",
610
+ h.UserName as "user",
611
+ h.StartTime AT TIME ZONE 'GMT' as "timestamp",
612
+ h.DefaultDatabase as default_database,
613
+ h.SqlTextInfo as "query_text",
614
+ h.SqlRowNo as "row_no"
615
+ FROM "PDCRINFO".DBQLSqlTbl_Hst as h
616
+ WHERE
617
+ h.ErrorCode = 0
618
+ AND h.statementtype not in (
619
+ 'Unrecognized type',
620
+ 'Create Database/User',
621
+ 'Help',
622
+ 'Modify Database',
623
+ 'Drop Table',
624
+ 'Show',
625
+ 'Not Applicable',
626
+ 'Grant',
627
+ 'Abort',
628
+ 'Database',
629
+ 'Flush Query Logging',
630
+ 'Null',
631
+ 'Begin/End DBQL',
632
+ 'Revoke'
633
+ )
634
+ and h.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
635
+ and h.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
636
+ and h.CollectTimeStamp >= TIMESTAMP '{start_time}'
637
+ and h.DefaultDatabase not in ('DEMONOW_MONITOR')
638
+ {databases_filter_history}
639
+
640
+ UNION
641
+
642
+ SELECT
643
+ s.QueryID as "query_id",
644
+ l.UserName as "user",
645
+ l.StartTime AT TIME ZONE 'GMT' as "timestamp",
646
+ l.DefaultDatabase as default_database,
647
+ s.SqlTextInfo as "query_text",
648
+ s.SqlRowNo as "row_no"
649
+ FROM "DBC".QryLogV as l
650
+ JOIN "DBC".QryLogSqlV as s on s.QueryID = l.QueryID
651
+ WHERE
652
+ l.ErrorCode = 0
653
+ AND l.statementtype not in (
654
+ 'Unrecognized type',
655
+ 'Create Database/User',
656
+ 'Help',
657
+ 'Modify Database',
658
+ 'Drop Table',
659
+ 'Show',
660
+ 'Not Applicable',
661
+ 'Grant',
662
+ 'Abort',
663
+ 'Database',
664
+ 'Flush Query Logging',
665
+ 'Null',
666
+ 'Begin/End DBQL',
667
+ 'Revoke'
668
+ )
669
+ and l.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
670
+ and l.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
671
+ and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
672
+ and l.DefaultDatabase not in ('DEMONOW_MONITOR')
673
+ {databases_filter}
674
+ ) as combined_results
675
+ ORDER BY "timestamp", "query_id", "row_no"
504
676
  """.strip()
505
677
 
506
- TABLES_AND_VIEWS_QUERY: str = """
678
+ TABLES_AND_VIEWS_QUERY: str = f"""
507
679
  SELECT
508
680
  t.DataBaseName,
509
681
  t.TableName as name,
@@ -521,77 +693,51 @@ SELECT
521
693
  t.LastAlterTimeStamp,
522
694
  t.RequestText
523
695
  FROM dbc.TablesV t
524
- WHERE DataBaseName NOT IN (
525
- 'All',
526
- 'Crashdumps',
527
- 'Default',
528
- 'DemoNow_Monitor',
529
- 'EXTUSER',
530
- 'External_AP',
531
- 'GLOBAL_FUNCTIONS',
532
- 'LockLogShredder',
533
- 'PUBLIC',
534
- 'SQLJ',
535
- 'SYSBAR',
536
- 'SYSJDBC',
537
- 'SYSLIB',
538
- 'SYSSPATIAL',
539
- 'SYSUDTLIB',
540
- 'SYSUIF',
541
- 'SysAdmin',
542
- 'Sys_Calendar',
543
- 'SystemFe',
544
- 'TDBCMgmt',
545
- 'TDMaps',
546
- 'TDPUSER',
547
- 'TDQCD',
548
- 'TDStats',
549
- 'TD_ANALYTICS_DB',
550
- 'TD_SERVER_DB',
551
- 'TD_SYSFNLIB',
552
- 'TD_SYSGPL',
553
- 'TD_SYSXML',
554
- 'TDaaS_BAR',
555
- 'TDaaS_DB',
556
- 'TDaaS_Maint',
557
- 'TDaaS_Monitor',
558
- 'TDaaS_Support',
559
- 'TDaaS_TDBCMgmt1',
560
- 'TDaaS_TDBCMgmt2',
561
- 'dbcmngr',
562
- 'mldb',
563
- 'system',
564
- 'tapidb',
565
- 'tdwm',
566
- 'val',
567
- 'dbc'
568
- )
696
+ WHERE DataBaseName NOT IN ({",".join([f"'{db}'" for db in EXCLUDED_DATABASES])})
569
697
  AND t.TableKind in ('T', 'V', 'Q', 'O')
570
698
  ORDER by DataBaseName, TableName;
571
699
  """.strip()
572
700
 
573
701
  _tables_cache: MutableMapping[str, List[TeradataTable]] = defaultdict(list)
702
+ _tables_cache_lock = Lock() # Protect shared cache from concurrent access
703
+ _pooled_engine: Optional[Engine] = None # Reusable pooled engine
704
+ _pooled_engine_lock = Lock() # Protect engine creation
574
705
 
575
706
  def __init__(self, config: TeradataConfig, ctx: PipelineContext):
576
707
  super().__init__(config, ctx, "teradata")
577
708
 
578
709
  self.report: TeradataReport = TeradataReport()
579
710
  self.graph: Optional[DataHubGraph] = ctx.graph
711
+ self._report_lock = Lock() # Thread safety for report counters
580
712
 
581
- self.builder: SqlParsingBuilder = SqlParsingBuilder(
582
- usage_config=(
583
- self.config.usage if self.config.include_usage_statistics else None
584
- ),
585
- generate_lineage=True,
713
+ self.schema_resolver = self._init_schema_resolver()
714
+
715
+ # Initialize SqlParsingAggregator for modern lineage processing
716
+ logger.info("Initializing SqlParsingAggregator for enhanced lineage processing")
717
+ self.aggregator = SqlParsingAggregator(
718
+ platform="teradata",
719
+ platform_instance=self.config.platform_instance,
720
+ env=self.config.env,
721
+ schema_resolver=self.schema_resolver,
722
+ graph=self.ctx.graph,
723
+ generate_lineage=self.include_lineage,
724
+ generate_queries=self.config.include_queries,
586
725
  generate_usage_statistics=self.config.include_usage_statistics,
587
- generate_operations=self.config.usage.include_operational_stats,
726
+ generate_query_usage_statistics=self.config.include_usage_statistics,
727
+ generate_operations=self.config.usage.include_operational_stats
728
+ if self.config.include_usage_statistics
729
+ else False,
730
+ usage_config=self.config.usage
731
+ if self.config.include_usage_statistics
732
+ else None,
733
+ eager_graph_load=False,
588
734
  )
589
-
590
- self.schema_resolver = self._init_schema_resolver()
735
+ self.report.sql_aggregator = self.aggregator.report
591
736
 
592
737
  if self.config.include_tables or self.config.include_views:
593
- self.cache_tables_and_views()
594
- logger.info(f"Found {len(self._tables_cache)} tables and views")
738
+ with self.report.new_stage("Table and view discovery"):
739
+ self.cache_tables_and_views()
740
+ logger.info(f"Found {len(self._tables_cache)} tables and views")
595
741
  setattr(self, "loop_tables", self.cached_loop_tables) # noqa: B010
596
742
  setattr(self, "loop_views", self.cached_loop_views) # noqa: B010
597
743
  setattr( # noqa: B010
@@ -721,6 +867,8 @@ ORDER by DataBaseName, TableName;
721
867
 
722
868
  logger.debug(f"sql_alchemy_url={url}")
723
869
  engine = create_engine(url, **self.config.options)
870
+
871
+ # Get list of databases first
724
872
  with engine.connect() as conn:
725
873
  inspector = inspect(conn)
726
874
  if self.config.database and self.config.database != "":
@@ -729,13 +877,14 @@ ORDER by DataBaseName, TableName;
729
877
  databases = self.config.databases
730
878
  else:
731
879
  databases = inspector.get_schema_names()
732
- for db in databases:
733
- if self.config.database_pattern.allowed(db):
734
- # url = self.config.get_sql_alchemy_url(current_db=db)
735
- # with create_engine(url, **self.config.options).connect() as conn:
736
- # inspector = inspect(conn)
737
- inspector._datahub_database = db
738
- yield inspector
880
+
881
+ # Create separate connections for each database to avoid connection lifecycle issues
882
+ for db in databases:
883
+ if self.config.database_pattern.allowed(db):
884
+ with engine.connect() as conn:
885
+ db_inspector = inspect(conn)
886
+ db_inspector._datahub_database = db
887
+ yield db_inspector
739
888
 
740
889
  def get_db_name(self, inspector: Inspector) -> str:
741
890
  if hasattr(inspector, "_datahub_database"):
@@ -753,14 +902,15 @@ ORDER by DataBaseName, TableName;
753
902
  inspector: Inspector,
754
903
  schema: str,
755
904
  sql_config: SQLCommonConfig,
756
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
905
+ ) -> Iterable[MetadataWorkUnit]:
757
906
  setattr( # noqa: B010
758
907
  inspector,
759
908
  "get_table_names",
760
909
  lambda schema: [
761
910
  i.name
762
911
  for i in filter(
763
- lambda t: t.object_type != "View", self._tables_cache[schema]
912
+ lambda t: t.object_type != "View",
913
+ self._tables_cache.get(schema, []),
764
914
  )
765
915
  ],
766
916
  )
@@ -776,7 +926,8 @@ ORDER by DataBaseName, TableName;
776
926
  # this method and provide a location.
777
927
  location: Optional[str] = None
778
928
 
779
- for entry in self._tables_cache[schema]:
929
+ cache_entries = self._tables_cache.get(schema, [])
930
+ for entry in cache_entries:
780
931
  if entry.name == table:
781
932
  description = entry.description
782
933
  if entry.object_type == "View" and entry.request_text:
@@ -789,123 +940,734 @@ ORDER by DataBaseName, TableName;
789
940
  inspector: Inspector,
790
941
  schema: str,
791
942
  sql_config: SQLCommonConfig,
792
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
793
- setattr( # noqa: B010
794
- inspector,
795
- "get_view_names",
796
- lambda schema: [
797
- i.name
798
- for i in filter(
799
- lambda t: t.object_type == "View", self._tables_cache[schema]
943
+ ) -> Iterable[MetadataWorkUnit]:
944
+ start_time = time.time()
945
+
946
+ # Get view names from cache
947
+ view_names = [
948
+ i.name
949
+ for i in filter(
950
+ lambda t: t.object_type == "View", self._tables_cache.get(schema, [])
951
+ )
952
+ ]
953
+ actual_view_count = len(view_names)
954
+
955
+ if actual_view_count == 0:
956
+ end_time = time.time()
957
+ processing_time = end_time - start_time
958
+ logger.info(
959
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds (0 views, 0 work units)"
960
+ )
961
+ return
962
+
963
+ # Use custom threading implementation with connection pooling
964
+ work_unit_count = 0
965
+
966
+ for work_unit in self._loop_views_with_connection_pool(
967
+ view_names, schema, sql_config
968
+ ):
969
+ work_unit_count += 1
970
+ yield work_unit
971
+
972
+ end_time = time.time()
973
+ processing_time = end_time - start_time
974
+
975
+ logger.info(
976
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds ({actual_view_count} views, {work_unit_count} work units)"
977
+ )
978
+
979
+ # Update report timing metrics
980
+ if hasattr(self, "report"):
981
+ self.report.view_extraction_total_time_seconds += processing_time
982
+ self.report.num_views_processed += actual_view_count
983
+
984
+ # Track slowest view processing at view level (will be updated by individual view processing)
985
+ # Note: slowest_view_name now tracks individual views, not schemas
986
+
987
+ # Calculate average processing time per view
988
+ if self.report.num_views_processed > 0:
989
+ self.report.view_extraction_average_time_seconds = (
990
+ self.report.view_extraction_total_time_seconds
991
+ / self.report.num_views_processed
800
992
  )
801
- ],
993
+
994
+ def _loop_views_with_connection_pool(
995
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
996
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
997
+ """
998
+ Process views using individual database connections per thread for true parallelization.
999
+
1000
+ Each thread gets its own connection from a QueuePool, enabling true concurrent processing.
1001
+ """
1002
+ if self.config.max_workers == 1:
1003
+ # Single-threaded processing - no need for complexity
1004
+ yield from self._process_views_single_threaded(
1005
+ view_names, schema, sql_config
1006
+ )
1007
+ return
1008
+
1009
+ logger.info(
1010
+ f"Processing {len(view_names)} views with {self.config.max_workers} worker threads"
802
1011
  )
803
- yield from super().loop_views(inspector, schema, sql_config)
804
1012
 
805
- def cache_tables_and_views(self) -> None:
1013
+ # Get or create reusable pooled engine
1014
+ engine = self._get_or_create_pooled_engine()
1015
+
1016
+ try:
1017
+ # Thread-safe result collection
1018
+ report_lock = Lock()
1019
+
1020
+ def process_single_view(
1021
+ view_name: str,
1022
+ ) -> List[Union[MetadataWorkUnit, Any]]:
1023
+ """Process a single view with its own database connection."""
1024
+ results: List[Union[MetadataWorkUnit, Any]] = []
1025
+
1026
+ # Detailed timing measurements for bottleneck analysis
1027
+ timings = {
1028
+ "connection_acquire": 0.0,
1029
+ "view_processing": 0.0,
1030
+ "work_unit_generation": 0.0,
1031
+ "total": 0.0,
1032
+ }
1033
+
1034
+ total_start = time.time()
1035
+ try:
1036
+ # Measure connection acquisition time
1037
+ conn_start = time.time()
1038
+ with engine.connect() as conn:
1039
+ timings["connection_acquire"] = time.time() - conn_start
1040
+
1041
+ # Update connection pool metrics
1042
+ with report_lock:
1043
+ pool_wait_time = timings["connection_acquire"]
1044
+ self.report.connection_pool_wait_time_seconds += (
1045
+ pool_wait_time
1046
+ )
1047
+ if (
1048
+ pool_wait_time
1049
+ > self.report.connection_pool_max_wait_time_seconds
1050
+ ):
1051
+ self.report.connection_pool_max_wait_time_seconds = (
1052
+ pool_wait_time
1053
+ )
1054
+
1055
+ # Measure view processing setup
1056
+ processing_start = time.time()
1057
+ thread_inspector = inspect(conn)
1058
+ # Inherit database information for Teradata two-tier architecture
1059
+ thread_inspector._datahub_database = schema # type: ignore
1060
+
1061
+ dataset_name = self.get_identifier(
1062
+ schema=schema, entity=view_name, inspector=thread_inspector
1063
+ )
1064
+
1065
+ # Thread-safe reporting
1066
+ with report_lock:
1067
+ self.report.report_entity_scanned(
1068
+ dataset_name, ent_type="view"
1069
+ )
1070
+
1071
+ if not sql_config.view_pattern.allowed(dataset_name):
1072
+ with report_lock:
1073
+ self.report.report_dropped(dataset_name)
1074
+ return results
1075
+
1076
+ timings["view_processing"] = time.time() - processing_start
1077
+
1078
+ # Measure work unit generation
1079
+ wu_start = time.time()
1080
+ for work_unit in self._process_view(
1081
+ dataset_name=dataset_name,
1082
+ inspector=thread_inspector,
1083
+ schema=schema,
1084
+ view=view_name,
1085
+ sql_config=sql_config,
1086
+ ):
1087
+ results.append(work_unit)
1088
+ timings["work_unit_generation"] = time.time() - wu_start
1089
+
1090
+ # Track individual view timing
1091
+ timings["total"] = time.time() - total_start
1092
+
1093
+ with report_lock:
1094
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1095
+ timings["total"]
1096
+ )
1097
+
1098
+ except Exception as e:
1099
+ with report_lock:
1100
+ self.report.num_view_processing_failures += 1
1101
+ # Log full exception details for debugging
1102
+ import traceback
1103
+
1104
+ full_traceback = traceback.format_exc()
1105
+ logger.error(
1106
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1107
+ )
1108
+ logger.error(f"Full traceback: {full_traceback}")
1109
+ self.report.warning(
1110
+ f"Error processing view {schema}.{view_name}",
1111
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1112
+ exc=e,
1113
+ )
1114
+
1115
+ return results
1116
+
1117
+ # Use ThreadPoolExecutor for concurrent processing
1118
+ with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
1119
+ # Submit all view processing tasks
1120
+ future_to_view = {
1121
+ executor.submit(process_single_view, view_name): view_name
1122
+ for view_name in view_names
1123
+ }
1124
+
1125
+ # Process completed tasks as they finish
1126
+ for future in as_completed(future_to_view):
1127
+ view_name = future_to_view[future]
1128
+ try:
1129
+ results = future.result()
1130
+ # Yield all results from this view
1131
+ for result in results:
1132
+ yield result
1133
+ except Exception as e:
1134
+ with report_lock:
1135
+ self.report.warning(
1136
+ "Error in thread processing view",
1137
+ context=f"{schema}.{view_name}",
1138
+ exc=e,
1139
+ )
1140
+
1141
+ finally:
1142
+ # Don't dispose the reusable engine here - it will be cleaned up in close()
1143
+ pass
1144
+
1145
+ def _process_views_single_threaded(
1146
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
1147
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
1148
+ """Process views sequentially with a single connection."""
806
1149
  engine = self.get_metadata_engine()
807
- for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
808
- table = TeradataTable(
809
- database=entry.DataBaseName.strip(),
810
- name=entry.name.strip(),
811
- description=entry.description.strip() if entry.description else None,
812
- object_type=entry.object_type,
813
- create_timestamp=entry.CreateTimeStamp,
814
- last_alter_name=entry.LastAlterName,
815
- last_alter_timestamp=entry.LastAlterTimeStamp,
816
- request_text=(
817
- entry.RequestText.strip()
818
- if entry.object_type == "View" and entry.RequestText
819
- else None
820
- ),
1150
+
1151
+ try:
1152
+ with engine.connect() as conn:
1153
+ inspector = inspect(conn)
1154
+
1155
+ for view_name in view_names:
1156
+ view_start_time = time.time()
1157
+ try:
1158
+ dataset_name = self.get_identifier(
1159
+ schema=schema, entity=view_name, inspector=inspector
1160
+ )
1161
+
1162
+ self.report.report_entity_scanned(dataset_name, ent_type="view")
1163
+
1164
+ if not sql_config.view_pattern.allowed(dataset_name):
1165
+ self.report.report_dropped(dataset_name)
1166
+ continue
1167
+
1168
+ # Process the view and yield results
1169
+ for work_unit in self._process_view(
1170
+ dataset_name=dataset_name,
1171
+ inspector=inspector,
1172
+ schema=schema,
1173
+ view=view_name,
1174
+ sql_config=sql_config,
1175
+ ):
1176
+ yield work_unit
1177
+
1178
+ # Track individual view timing
1179
+ view_end_time = time.time()
1180
+ view_processing_time = view_end_time - view_start_time
1181
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1182
+ view_processing_time
1183
+ )
1184
+
1185
+ except Exception as e:
1186
+ # Log full exception details for debugging
1187
+ import traceback
1188
+
1189
+ full_traceback = traceback.format_exc()
1190
+ logger.error(
1191
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1192
+ )
1193
+ logger.error(f"Full traceback: {full_traceback}")
1194
+ self.report.warning(
1195
+ f"Error processing view {schema}.{view_name}",
1196
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1197
+ exc=e,
1198
+ )
1199
+
1200
+ finally:
1201
+ engine.dispose()
1202
+
1203
+ def _get_or_create_pooled_engine(self) -> Engine:
1204
+ """Get or create a reusable SQLAlchemy engine with QueuePool for concurrent connections."""
1205
+ with self._pooled_engine_lock:
1206
+ if self._pooled_engine is None:
1207
+ url = self.config.get_sql_alchemy_url()
1208
+
1209
+ # Optimal connection pool sizing to match max_workers exactly
1210
+ # Teradata driver can be sensitive to high connection counts, so cap at reasonable limit
1211
+ max_safe_connections = (
1212
+ 13 # Conservative limit: 8 base + 5 overflow for Teradata stability
1213
+ )
1214
+
1215
+ # Adjust max_workers to match available connection pool capacity
1216
+ effective_max_workers = min(
1217
+ self.config.max_workers, max_safe_connections
1218
+ )
1219
+
1220
+ # Set pool size to match effective workers for optimal performance
1221
+ base_connections = min(
1222
+ effective_max_workers, 8
1223
+ ) # Reasonable base connections
1224
+ max_overflow = (
1225
+ effective_max_workers - base_connections
1226
+ ) # Remaining as overflow
1227
+
1228
+ # Log adjustment if max_workers was reduced
1229
+ if effective_max_workers < self.config.max_workers:
1230
+ logger.warning(
1231
+ f"Reduced max_workers from {self.config.max_workers} to {effective_max_workers} to match Teradata connection pool capacity"
1232
+ )
1233
+
1234
+ # Update the config to reflect the effective value used
1235
+ self.config.max_workers = effective_max_workers
1236
+
1237
+ pool_options = {
1238
+ **self.config.options,
1239
+ "poolclass": QueuePool,
1240
+ "pool_size": base_connections,
1241
+ "max_overflow": max_overflow,
1242
+ "pool_pre_ping": True, # Validate connections
1243
+ "pool_recycle": 1800, # Recycle connections after 30 mins (more frequent)
1244
+ "pool_timeout": 60, # Longer timeout for connection acquisition
1245
+ "pool_reset_on_return": "rollback", # Explicit rollback on connection return
1246
+ }
1247
+
1248
+ # Add Teradata-specific connection options for stability
1249
+ if "connect_args" not in pool_options:
1250
+ pool_options["connect_args"] = {}
1251
+
1252
+ # Teradata-specific connection arguments for better stability
1253
+ pool_options["connect_args"].update(
1254
+ {
1255
+ "connect_timeout": "30000", # Connection timeout in ms (30 seconds)
1256
+ "request_timeout": "120000", # Request timeout in ms (2 minutes)
1257
+ }
1258
+ )
1259
+
1260
+ self._pooled_engine = create_engine(url, **pool_options)
1261
+ logger.info(
1262
+ f"Created optimized Teradata connection pool: {base_connections} base + {max_overflow} overflow = {base_connections + max_overflow} max connections (matching {effective_max_workers} workers)"
1263
+ )
1264
+
1265
+ return self._pooled_engine
1266
+
1267
+ def cache_tables_and_views(self) -> None:
1268
+ with self.report.new_stage("Cache tables and views"):
1269
+ engine = self.get_metadata_engine()
1270
+ try:
1271
+ database_counts: Dict[str, Dict[str, int]] = defaultdict(
1272
+ lambda: {"tables": 0, "views": 0}
1273
+ )
1274
+
1275
+ for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
1276
+ table = TeradataTable(
1277
+ database=entry.DataBaseName.strip(),
1278
+ name=entry.name.strip(),
1279
+ description=entry.description.strip()
1280
+ if entry.description
1281
+ else None,
1282
+ object_type=entry.object_type,
1283
+ create_timestamp=entry.CreateTimeStamp,
1284
+ last_alter_name=entry.LastAlterName,
1285
+ last_alter_timestamp=entry.LastAlterTimeStamp,
1286
+ request_text=(
1287
+ entry.RequestText.strip()
1288
+ if entry.object_type == "View" and entry.RequestText
1289
+ else None
1290
+ ),
1291
+ )
1292
+
1293
+ # Count objects per database for metrics
1294
+ if table.object_type == "View":
1295
+ database_counts[table.database]["views"] += 1
1296
+ else:
1297
+ database_counts[table.database]["tables"] += 1
1298
+
1299
+ with self._tables_cache_lock:
1300
+ if table.database not in self._tables_cache:
1301
+ self._tables_cache[table.database] = []
1302
+ self._tables_cache[table.database].append(table)
1303
+
1304
+ for database, counts in database_counts.items():
1305
+ self.report.num_database_tables_to_scan[database] = counts["tables"]
1306
+ self.report.num_database_views_to_scan[database] = counts["views"]
1307
+
1308
+ finally:
1309
+ engine.dispose()
1310
+
1311
+ def _reconstruct_queries_streaming(
1312
+ self, entries: Iterable[Any]
1313
+ ) -> Iterable[ObservedQuery]:
1314
+ """Reconstruct complete queries from database entries in streaming fashion.
1315
+
1316
+ This method processes entries in order and reconstructs multi-row queries
1317
+ by concatenating rows with the same query_id.
1318
+ """
1319
+ current_query_id = None
1320
+ current_query_parts = []
1321
+ current_query_metadata = None
1322
+
1323
+ for entry in entries:
1324
+ query_id = getattr(entry, "query_id", None)
1325
+ query_text = str(getattr(entry, "query_text", ""))
1326
+
1327
+ if query_id != current_query_id:
1328
+ # New query started - yield the previous one if it exists
1329
+ if current_query_id is not None and current_query_parts:
1330
+ yield self._create_observed_query_from_parts(
1331
+ current_query_parts, current_query_metadata
1332
+ )
1333
+
1334
+ # Start new query
1335
+ current_query_id = query_id
1336
+ current_query_parts = [query_text] if query_text else []
1337
+ current_query_metadata = entry
1338
+ else:
1339
+ # Same query - append the text
1340
+ if query_text:
1341
+ current_query_parts.append(query_text)
1342
+
1343
+ # Yield the last query if it exists
1344
+ if current_query_id is not None and current_query_parts:
1345
+ yield self._create_observed_query_from_parts(
1346
+ current_query_parts, current_query_metadata
821
1347
  )
822
- if table.database not in self._tables_cache:
823
- self._tables_cache[table.database] = []
824
1348
 
825
- self._tables_cache[table.database].append(table)
1349
+ def _create_observed_query_from_parts(
1350
+ self, query_parts: List[str], metadata_entry: Any
1351
+ ) -> ObservedQuery:
1352
+ """Create ObservedQuery from reconstructed query parts and metadata."""
1353
+ # Join all parts to form the complete query
1354
+ # Teradata fragments are split at fixed lengths without artificial breaks
1355
+ full_query_text = "".join(query_parts)
1356
+
1357
+ # Extract metadata
1358
+ session_id = getattr(metadata_entry, "session_id", None)
1359
+ timestamp = getattr(metadata_entry, "timestamp", None)
1360
+ user = getattr(metadata_entry, "user", None)
1361
+ default_database = getattr(metadata_entry, "default_database", None)
1362
+
1363
+ # Apply Teradata-specific query transformations
1364
+ cleaned_query = full_query_text.replace("(NOT CASESPECIFIC)", "")
1365
+
1366
+ return ObservedQuery(
1367
+ query=cleaned_query,
1368
+ session_id=session_id,
1369
+ timestamp=timestamp,
1370
+ user=CorpUserUrn(user) if user else None,
1371
+ default_db=default_database,
1372
+ default_schema=default_database, # Teradata uses database as schema
1373
+ )
1374
+
1375
+ def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
1376
+ """Convert database query entry to ObservedQuery for SqlParsingAggregator.
1377
+
1378
+ DEPRECATED: This method is deprecated in favor of _reconstruct_queries_streaming
1379
+ which properly handles multi-row queries. This method does not handle queries
1380
+ that span multiple rows correctly and should not be used.
1381
+ """
1382
+ # Extract fields from database result
1383
+ query_text = str(entry.query_text).strip()
1384
+ session_id = getattr(entry, "session_id", None)
1385
+ timestamp = getattr(entry, "timestamp", None)
1386
+ user = getattr(entry, "user", None)
1387
+ default_database = getattr(entry, "default_database", None)
1388
+
1389
+ # Apply Teradata-specific query transformations
1390
+ cleaned_query = query_text.replace("(NOT CASESPECIFIC)", "")
1391
+
1392
+ return ObservedQuery(
1393
+ query=cleaned_query,
1394
+ session_id=session_id,
1395
+ timestamp=timestamp,
1396
+ user=CorpUserUrn(user) if user else None,
1397
+ default_db=default_database,
1398
+ default_schema=default_database, # Teradata uses database as schema
1399
+ )
1400
+
1401
+ def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
1402
+ """Fetch lineage entries using server-side cursor to handle large result sets efficiently."""
1403
+ queries = self._make_lineage_queries()
1404
+
1405
+ fetch_engine = self.get_metadata_engine()
1406
+ try:
1407
+ with fetch_engine.connect() as conn:
1408
+ cursor_type = (
1409
+ "server-side"
1410
+ if self.config.use_server_side_cursors
1411
+ else "client-side"
1412
+ )
1413
+
1414
+ total_count_all_queries = 0
1415
+
1416
+ for query_index, query in enumerate(queries, 1):
1417
+ logger.info(
1418
+ f"Executing lineage query {query_index}/{len(queries)} with {cursor_type} cursor..."
1419
+ )
1420
+
1421
+ # Use helper method to try server-side cursor with fallback
1422
+ result = self._execute_with_cursor_fallback(conn, query)
1423
+
1424
+ # Stream results in batches to avoid memory issues
1425
+ batch_size = 5000
1426
+ batch_count = 0
1427
+ query_total_count = 0
1428
+
1429
+ while True:
1430
+ # Fetch a batch of rows
1431
+ batch = result.fetchmany(batch_size)
1432
+ if not batch:
1433
+ break
1434
+
1435
+ batch_count += 1
1436
+ query_total_count += len(batch)
1437
+ total_count_all_queries += len(batch)
1438
+
1439
+ logger.info(
1440
+ f"Query {query_index} - Fetched batch {batch_count}: {len(batch)} lineage entries (query total: {query_total_count})"
1441
+ )
1442
+ yield from batch
1443
+
1444
+ logger.info(
1445
+ f"Completed query {query_index}: {query_total_count} lineage entries in {batch_count} batches"
1446
+ )
1447
+
1448
+ logger.info(
1449
+ f"Completed fetching all queries: {total_count_all_queries} total lineage entries from {len(queries)} queries"
1450
+ )
826
1451
 
827
- def get_audit_log_mcps(self, urns: Set[str]) -> Iterable[MetadataWorkUnit]:
1452
+ except Exception as e:
1453
+ logger.error(f"Error fetching lineage entries: {e}")
1454
+ raise
1455
+ finally:
1456
+ fetch_engine.dispose()
1457
+
+    def _check_historical_table_exists(self) -> bool:
+        """
+        Check if the PDCRINFO.DBQLSqlTbl_Hst table exists and is accessible.
+        DBQL rows are periodically moved to the history table, so audit queries may no longer exist in DBC.
+        There is no guarantee that the historical table exists, so we need to check for it.
+
+        Returns:
+            bool: True if the historical table exists and is accessible, False otherwise.
+        """
         engine = self.get_metadata_engine()
-        for entry in engine.execute(self._make_lineage_query()):
-            self.report.num_queries_parsed += 1
-            if self.report.num_queries_parsed % 1000 == 0:
-                logger.info(f"Parsed {self.report.num_queries_parsed} queries")
-
-            yield from self.gen_lineage_from_query(
-                query=entry.query_text,
-                default_database=entry.default_database,
-                timestamp=entry.timestamp,
-                user=entry.user,
-                urns=urns,
+        try:
+            # Use a simple query to check if the table exists and is accessible
+            check_query = """
+                SELECT TOP 1 QueryID
+                FROM PDCRINFO.DBQLSqlTbl_Hst
+                WHERE 1=0
+            """
+            with engine.connect() as conn:
+                conn.execute(text(check_query))
+            logger.info(
+                "Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is available"
+            )
+            return True
+        except Exception as e:
+            logger.info(
+                f"Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is not available: {e}"
             )
+            return False
+        finally:
+            engine.dispose()
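The `WHERE 1=0` predicate above makes the probe resolve the table name and verify access rights without ever scanning rows. A generic version of the same pattern, a minimal sketch in which the engine and table name are placeholders rather than anything defined in this diff:

```python
from sqlalchemy import text
from sqlalchemy.engine import Engine


def table_is_accessible(engine: Engine, qualified_name: str) -> bool:
    """Return True if the table can be resolved and queried, without reading any rows."""
    try:
        with engine.connect() as conn:
            # WHERE 1=0 guarantees an empty result, so only name resolution
            # and permissions are exercised.
            conn.execute(text(f"SELECT 1 FROM {qualified_name} WHERE 1=0"))
        return True
    except Exception:
        return False
```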

-    def _make_lineage_query(self) -> str:
+    def _make_lineage_queries(self) -> List[str]:
         databases_filter = (
             ""
             if not self.config.databases
-            else "and default_database in ({databases})".format(
+            else "and l.DefaultDatabase in ({databases})".format(
                 databases=",".join([f"'{db}'" for db in self.config.databases])
             )
         )

-        query = self.QUERY_TEXT_QUERY.format(
-            start_time=self.config.start_time,
-            end_time=self.config.end_time,
-            databases_filter=databases_filter,
-        )
-        return query
+        queries = []

-    def gen_lineage_from_query(
-        self,
-        query: str,
-        default_database: Optional[str] = None,
-        timestamp: Optional[datetime] = None,
-        user: Optional[str] = None,
-        view_urn: Optional[str] = None,
-        urns: Optional[Set[str]] = None,
-    ) -> Iterable[MetadataWorkUnit]:
-        result = sqlglot_lineage(
-            # With this clever hack we can make the query parser to not fail on queries with CASESPECIFIC
-            sql=query.replace("(NOT CASESPECIFIC)", ""),
-            schema_resolver=self.schema_resolver,
-            default_db=None,
-            default_schema=(
-                default_database if default_database else self.config.default_db
-            ),
-        )
-        if result.debug_info.table_error:
-            logger.debug(
-                f"Error parsing table lineage ({view_urn}):\n{result.debug_info.table_error}"
+        # Check if historical lineage is configured and available
+        if (
+            self.config.include_historical_lineage
+            and self._check_historical_table_exists()
+        ):
+            logger.info(
+                "Using UNION query to combine historical and current lineage data to avoid duplicates"
             )
-            self.report.num_table_parse_failures += 1
+            # For historical query, we need the database filter for historical part
+            databases_filter_history = (
+                databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase")
+                if databases_filter
+                else ""
+            )
+
+            union_query = self.QUERY_TEXT_HISTORICAL_UNION.format(
+                start_time=self.config.start_time,
+                end_time=self.config.end_time,
+                databases_filter=databases_filter,
+                databases_filter_history=databases_filter_history,
+            )
+            queries.append(union_query)
         else:
-            yield from self.builder.process_sql_parsing_result(
-                result,
-                query=query,
-                is_view_ddl=view_urn is not None,
-                query_timestamp=timestamp,
-                user=f"urn:li:corpuser:{user}",
-                include_urns=urns,
+            if self.config.include_historical_lineage:
+                logger.warning(
+                    "Historical lineage was requested but PDCRINFO.DBQLSqlTbl_Hst table is not available. Falling back to current data only."
+                )
+
+            # Use current-only query when historical data is not available
+            current_query = self.QUERY_TEXT_CURRENT_QUERIES.format(
+                start_time=self.config.start_time,
+                end_time=self.config.end_time,
+                databases_filter=databases_filter,
             )
+            queries.append(current_query)
+
+        return queries
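As a quick worked example of the filter string built at the top of `_make_lineage_queries` (the database names below are made up):

```python
databases = ["sales", "marketing"]

databases_filter = "and l.DefaultDatabase in ({databases})".format(
    databases=",".join([f"'{db}'" for db in databases])
)
print(databases_filter)
# and l.DefaultDatabase in ('sales','marketing')

# The historical branch swaps the table alias for the PDCRINFO side of the UNION:
print(databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase"))
# and h.DefaultDatabase in ('sales','marketing')
```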

     def get_metadata_engine(self) -> Engine:
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         return create_engine(url, **self.config.options)

-    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
+    def _execute_with_cursor_fallback(
+        self, connection: Connection, query: str, params: Optional[Dict] = None
+    ) -> Any:
+        """
+        Execute query with server-side cursor if enabled and supported, otherwise fall back to regular execution.
+
+        Args:
+            connection: Database connection
+            query: SQL query to execute
+            params: Query parameters
+
+        Returns:
+            Query result object
+        """
+        if self.config.use_server_side_cursors:
+            try:
+                # Try server-side cursor first
+                if params:
+                    result = connection.execution_options(stream_results=True).execute(
+                        text(query), params
+                    )
+                else:
+                    result = connection.execution_options(stream_results=True).execute(
+                        text(query)
+                    )
+
+                logger.debug(
+                    "Successfully using server-side cursor for query execution"
+                )
+                return result
+
+            except Exception as e:
+                logger.warning(
+                    f"Server-side cursor failed, falling back to client-side execution: {e}"
+                )
+                # Fall through to regular execution
+
+        # Regular execution (client-side)
+        if params:
+            return connection.execute(text(query), params)
+        else:
+            return connection.execute(text(query))
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Starting Teradata metadata extraction")
+
         # Add all schemas to the schema resolver
         # Sql parser operates on lowercase urns so we need to lowercase the urns
-        for wu in auto_lowercase_urns(super().get_workunits_internal()):
-            urn = wu.get_urn()
-            schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
-            if schema_metadata:
-                self.schema_resolver.add_schema_metadata(urn, schema_metadata)
-            yield wu
-
-        urns = self.schema_resolver.get_urns()
+        with self.report.new_stage("Schema metadata extraction"):
+            yield from super().get_workunits_internal()
+            logger.info("Completed schema metadata extraction")
+
+        with self.report.new_stage("Audit log extraction"):
+            yield from self._get_audit_log_mcps_with_aggregator()
+
+        # SqlParsingAggregator handles its own work unit generation internally
+        logger.info("Lineage processing completed by SqlParsingAggregator")
+
+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Override base class to skip aggregator gen_metadata() call.
+
+        Teradata handles aggregator processing after adding audit log queries,
+        so we skip the base class call to prevent duplicate processing.
+        """
+        # Return empty iterator - Teradata will handle aggregator processing
+        # after adding audit log queries in _get_audit_log_mcps_with_aggregator()
+        return iter([])
+
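The `_generate_aggregator_workunits` override is easy to misread: the base class would normally drain the aggregator on its own, so returning an empty iterator is what prevents the queries added later in `_get_audit_log_mcps_with_aggregator` from being emitted twice. A stripped-down, hypothetical illustration of that hook pattern (class and method names are invented for the example, not DataHub APIs):

```python
from typing import Iterator, List


class BaseSource:
    def __init__(self) -> None:
        self.collected: List[str] = ["early-item"]

    def _drain_hook(self) -> Iterator[str]:
        # Default: the base class drains the collection itself.
        yield from self.collected

    def get_workunits(self) -> Iterator[str]:
        yield from self._drain_hook()


class DeferredSource(BaseSource):
    def _drain_hook(self) -> Iterator[str]:
        # Suppress the base-class drain; this subclass drains later, itself.
        return iter([])

    def get_workunits(self) -> Iterator[str]:
        yield from super().get_workunits()  # yields nothing, thanks to the override
        self.collected.append("late-item")
        yield from self.collected  # single drain that includes the late additions
```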
+    def _get_audit_log_mcps_with_aggregator(self) -> Iterable[MetadataWorkUnit]:
+        """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
+        logger.info(
+            "Fetching queries from Teradata audit logs for SqlParsingAggregator"
+        )
+
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            with self.report.new_stage("Audit log extraction"):
-                yield from self.get_audit_log_mcps(urns=urns)
+            # Step 1: Stream query entries from database with memory-efficient processing
+            with self.report.new_stage("Fetching lineage entries from Audit Logs"):
+                queries_processed = 0
+                entries_processed = False
+
+                # Use streaming query reconstruction for memory efficiency
+                for observed_query in self._reconstruct_queries_streaming(
+                    self._fetch_lineage_entries_chunked()
+                ):
+                    entries_processed = True
+                    self.aggregator.add(observed_query)
+
+                    queries_processed += 1
+                    if queries_processed % 10000 == 0:
+                        logger.info(
+                            f"Processed {queries_processed} queries to aggregator"
+                        )
+
+                if not entries_processed:
+                    logger.info("No lineage entries found")
+                    return
+
+                logger.info(
+                    f"Completed adding {queries_processed} queries to SqlParsingAggregator"
+                )
+
+            # Step 2: Generate work units from aggregator
+            with self.report.new_stage("SqlParsingAggregator metadata generation"):
+                logger.info("Generating metadata work units from SqlParsingAggregator")
+                work_unit_count = 0
+                for mcp in self.aggregator.gen_metadata():
+                    work_unit_count += 1
+                    if work_unit_count % 10000 == 0:
+                        logger.info(
+                            f"Generated {work_unit_count} work units from aggregator"
+                        )
+                    yield mcp.as_workunit()
+
+                logger.info(
+                    f"Completed SqlParsingAggregator processing: {work_unit_count} work units generated"
+                )
+
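The method above follows the aggregator's two-phase contract: feed every `ObservedQuery` with `add()`, then call `gen_metadata()` once to emit the combined lineage and usage aspects. A minimal sketch of that contract, assuming an already-constructed `SqlParsingAggregator` and the import paths used elsewhere in this package:

```python
from typing import Iterable, Iterator

from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.sql_parsing.sql_parsing_aggregator import (
    ObservedQuery,
    SqlParsingAggregator,
)


def emit_lineage(
    aggregator: SqlParsingAggregator, queries: Iterable[ObservedQuery]
) -> Iterator[MetadataWorkUnit]:
    # Phase 1: register every observed query; nothing is emitted yet.
    for query in queries:
        aggregator.add(query)

    # Phase 2: a single gen_metadata() pass resolves lineage/usage and
    # yields change proposals to wrap as work units.
    for mcp in aggregator.gen_metadata():
        yield mcp.as_workunit()
```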
+    def close(self) -> None:
+        """Clean up resources when source is closed."""
+        logger.info("Closing SqlParsingAggregator")
+        self.aggregator.close()
+
+        # Clean up pooled engine
+        with self._pooled_engine_lock:
+            if self._pooled_engine is not None:
+                logger.info("Disposing pooled engine")
+                self._pooled_engine.dispose()
+                self._pooled_engine = None

-        yield from self.builder.gen_workunits()
+        # Report failed views summary
+        super().close()