acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub might be problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/teradata.py

@@ -1,8 +1,12 @@
  import logging
+ import re
+ import time
  from collections import defaultdict
- from dataclasses import dataclass
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass, field
  from datetime import datetime
  from functools import lru_cache
+ from threading import Lock
  from typing import (
  Any,
  Dict,
@@ -10,7 +14,6 @@ from typing import (
  List,
  MutableMapping,
  Optional,
- Set,
  Tuple,
  Union,
  )
@@ -29,7 +32,6 @@ from teradatasqlalchemy.options import configure

  from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
- from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.decorators import (
  SourceCapability,
@@ -39,10 +41,10 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
- from datahub.ingestion.api.source_helpers import auto_lowercase_urns
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.graph.client import DataHubGraph
- from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+ from datahub.ingestion.source.sql.sql_common import register_custom_type
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
  from datahub.ingestion.source.sql.two_tier_sql_source import (
@@ -50,19 +52,75 @@ from datahub.ingestion.source.sql.two_tier_sql_source import (
  TwoTierSQLAlchemySource,
  )
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
  from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
  from datahub.metadata.com.linkedin.pegasus2avro.schema import (
  BytesTypeClass,
  TimeTypeClass,
  )
- from datahub.metadata.schema_classes import SchemaMetadataClass
+ from datahub.metadata.urns import CorpUserUrn
  from datahub.sql_parsing.schema_resolver import SchemaResolver
- from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
+ from datahub.sql_parsing.sql_parsing_aggregator import (
+ ObservedQuery,
+ SqlParsingAggregator,
+ )
  from datahub.utilities.groupby import groupby_unsorted
+ from datahub.utilities.stats_collections import TopKDict

  logger: logging.Logger = logging.getLogger(__name__)

+ # Precompiled regex pattern for case-insensitive "(not casespecific)" removal
+ NOT_CASESPECIFIC_PATTERN = re.compile(r"\(not casespecific\)", re.IGNORECASE)
+
+ # Teradata uses a two-tier database.table naming approach without default database prefixing
+ DEFAULT_NO_DATABASE_TERADATA = None
+
+ # Common excluded databases used in multiple places
+ EXCLUDED_DATABASES = [
+ "All",
+ "Crashdumps",
+ "Default",
+ "DemoNow_Monitor",
+ "EXTUSER",
+ "External_AP",
+ "GLOBAL_FUNCTIONS",
+ "LockLogShredder",
+ "PUBLIC",
+ "SQLJ",
+ "SYSBAR",
+ "SYSJDBC",
+ "SYSLIB",
+ "SYSSPATIAL",
+ "SYSUDTLIB",
+ "SYSUIF",
+ "SysAdmin",
+ "Sys_Calendar",
+ "SystemFe",
+ "TDBCMgmt",
+ "TDMaps",
+ "TDPUSER",
+ "TDQCD",
+ "TDStats",
+ "TD_ANALYTICS_DB",
+ "TD_SERVER_DB",
+ "TD_SYSFNLIB",
+ "TD_SYSGPL",
+ "TD_SYSXML",
+ "TDaaS_BAR",
+ "TDaaS_DB",
+ "TDaaS_Maint",
+ "TDaaS_Monitor",
+ "TDaaS_Support",
+ "TDaaS_TDBCMgmt1",
+ "TDaaS_TDBCMgmt2",
+ "dbcmngr",
+ "mldb",
+ "system",
+ "tapidb",
+ "tdwm",
+ "val",
+ "dbc",
+ ]
+
  register_custom_type(custom_types.JSON, BytesTypeClass)
  register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass)
  register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass)
@@ -99,14 +157,16 @@ class TeradataTable:
  request_text: Optional[str]


- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
+ # Cache size of 1 is sufficient since schemas are processed sequentially
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
  @lru_cache(maxsize=1)
  def get_schema_columns(
  self: Any, connection: Connection, dbc_columns: str, schema: str
  ) -> Dict[str, List[Any]]:
+ start_time = time.time()
  columns: Dict[str, List[Any]] = {}
- columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) order by TableName, ColumnId"
- rows = connection.execute(text(columns_query)).fetchall()
+ columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) order by TableName, ColumnId"
+ rows = connection.execute(text(columns_query), {"schema": schema}).fetchall()
  for row in rows:
  row_mapping = row._mapping
  if row_mapping.TableName not in columns:
@@ -114,18 +174,29 @@ def get_schema_columns(

  columns[row_mapping.TableName].append(row_mapping)

+ end_time = time.time()
+ extraction_time = end_time - start_time
+ logger.info(
+ f"Column extraction for schema '{schema}' completed in {extraction_time:.2f} seconds"
+ )
+
+ # Update report if available
+ if hasattr(self, "report"):
+ self.report.column_extraction_duration_seconds += extraction_time
+
  return columns


- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
+ # Cache size of 1 is sufficient since schemas are processed sequentially
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
  @lru_cache(maxsize=1)
  def get_schema_pk_constraints(
  self: Any, connection: Connection, schema: str
  ) -> Dict[str, List[Any]]:
  dbc_indices = "IndicesV" + "X" if configure.usexviews else "IndicesV"
  primary_keys: Dict[str, List[Any]] = {}
- stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
- rows = connection.execute(text(stmt)).fetchall()
+ stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
+ rows = connection.execute(text(stmt), {"schema": schema}).fetchall()
  for row in rows:
  row_mapping = row._mapping
  if row_mapping.TableName not in primary_keys:
@@ -172,6 +243,10 @@ def optimized_get_pk_constraint(
  index_column.IndexName
  ) # There should be just one IndexName

+ # Update counter if available
+ if hasattr(self, "report"):
+ self.report.num_primary_keys_processed += 1
+
  return {"constrained_columns": index_columns, "name": index_name}

@@ -228,23 +303,55 @@ def optimized_get_columns(
  table_name, []
  )

+ start_time = time.time()
+
  final_column_info = []
  # Don't care about ART tables now
  # Ignore the non-functional column in a PTI table
  for row in res:
- col_info = self._get_column_info(row)
- if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
- if (
- col_info["ColumnName"] == "TD_TIMEBUCKET"
- and col_info["TSColumnType"].strip() == "TB"
+ try:
+ col_info = self._get_column_info(row)
+
+ # Add CommentString as comment field for column description
+ if hasattr(row, "CommentString") and row.CommentString:
+ col_info["comment"] = row.CommentString.strip()
+ elif (
+ isinstance(row, dict)
+ and "CommentString" in row
+ and row["CommentString"]
  ):
- continue
- final_column_info.append(col_info)
+ col_info["comment"] = row["CommentString"].strip()
+
+ if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
+ if (
+ col_info["ColumnName"] == "TD_TIMEBUCKET"
+ and col_info["TSColumnType"].strip() == "TB"
+ ):
+ continue
+ final_column_info.append(col_info)
+
+ # Update counter - access report through self from the connection context
+ if hasattr(self, "report"):
+ self.report.num_columns_processed += 1
+
+ except Exception as e:
+ logger.error(
+ f"Failed to process column {getattr(row, 'ColumnName', 'unknown')}: {e}"
+ )
+ if hasattr(self, "report"):
+ self.report.num_column_extraction_failures += 1
+ continue
+
+ # Update timing
+ if hasattr(self, "report"):
+ end_time = time.time()
+ self.report.column_extraction_duration_seconds += end_time - start_time

  return final_column_info


- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
+ # Cache size of 1 is sufficient since schemas are processed sequentially
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
  @lru_cache(maxsize=1)
  def get_schema_foreign_keys(
  self: Any, connection: Connection, schema: str
@@ -333,10 +440,32 @@ def optimized_get_view_definition(


  @dataclass
- class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
- num_queries_parsed: int = 0
- num_view_ddl_parsed: int = 0
- num_table_parse_failures: int = 0
+ class TeradataReport(SQLSourceReport, BaseTimeWindowReport):
+ # View processing metrics (actively used)
+ num_views_processed: int = 0
+ num_view_processing_failures: int = 0
+ view_extraction_total_time_seconds: float = 0.0
+ view_extraction_average_time_seconds: float = 0.0
+ slowest_view_processing_time_seconds: float = 0.0
+ slowest_view_name: TopKDict[str, float] = field(default_factory=TopKDict)
+
+ # Connection pool performance metrics (actively used)
+ connection_pool_wait_time_seconds: float = 0.0
+ connection_pool_max_wait_time_seconds: float = 0.0
+
+ # Database-level metrics similar to BigQuery's approach (actively used)
+ num_database_tables_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
+ num_database_views_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
+
+ # Global metadata extraction timing (single query for all databases)
+ metadata_extraction_total_sec: float = 0.0
+
+ # Lineage extraction query time range (actively used)
+ lineage_start_time: Optional[datetime] = None
+ lineage_end_time: Optional[datetime] = None
+
+ # Audit query processing statistics
+ num_audit_query_entries_processed: int = 0


  class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
@@ -352,67 +481,28 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
  ),
  )

- database_pattern = Field(
- default=AllowDenyPattern(
- deny=[
- "All",
- "Crashdumps",
- "Default",
- "DemoNow_Monitor",
- "EXTUSER",
- "External_AP",
- "GLOBAL_FUNCTIONS",
- "LockLogShredder",
- "PUBLIC",
- "SQLJ",
- "SYSBAR",
- "SYSJDBC",
- "SYSLIB",
- "SYSSPATIAL",
- "SYSUDTLIB",
- "SYSUIF",
- "SysAdmin",
- "Sys_Calendar",
- "SystemFe",
- "TDBCMgmt",
- "TDMaps",
- "TDPUSER",
- "TDQCD",
- "TDStats",
- "TD_ANALYTICS_DB",
- "TD_SERVER_DB",
- "TD_SYSFNLIB",
- "TD_SYSGPL",
- "TD_SYSXML",
- "TDaaS_BAR",
- "TDaaS_DB",
- "TDaaS_Maint",
- "TDaaS_Monitor",
- "TDaaS_Support",
- "TDaaS_TDBCMgmt1",
- "TDaaS_TDBCMgmt2",
- "dbcmngr",
- "mldb",
- "system",
- "tapidb",
- "tdwm",
- "val",
- "dbc",
- ]
- ),
+ database_pattern: AllowDenyPattern = Field(
+ default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
  description="Regex patterns for databases to filter in ingestion.",
  )
- include_table_lineage = Field(
+ include_table_lineage: bool = Field(
  default=False,
  description="Whether to include table lineage in the ingestion. "
  "This requires to have the table lineage feature enabled.",
  )

- include_view_lineage = Field(
+ include_view_lineage: bool = Field(
  default=True,
  description="Whether to include view lineage in the ingestion. "
  "This requires to have the view lineage feature enabled.",
  )
+
+ include_queries: bool = Field(
+ default=True,
+ description="Whether to generate query entities for SQL queries. "
+ "Query entities provide metadata about individual SQL queries including "
+ "execution timestamps, user information, and query text.",
+ )
  usage: BaseUsageConfig = Field(
  description="The usage config to use when generating usage statistics",
  default=BaseUsageConfig(),
@@ -438,14 +528,43 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
  description="Whether to use QVCI to get column information. This is faster but requires to have QVCI enabled.",
  )

+ include_historical_lineage: bool = Field(
+ default=False,
+ description="Whether to include historical lineage data from PDCRINFO.DBQLSqlTbl_Hst in addition to current DBC.QryLogV data. "
+ "This provides access to historical query logs that may have been archived. "
+ "The historical table existence is checked automatically and gracefully falls back to current data only if not available.",
+ )
+
+ use_server_side_cursors: bool = Field(
+ default=True,
+ description="Enable server-side cursors for large result sets using SQLAlchemy's stream_results. "
+ "This reduces memory usage by streaming results from the database server. "
+ "Automatically falls back to client-side batching if server-side cursors are not supported.",
+ )
+
+ max_workers: int = Field(
+ default=10,
+ description="Maximum number of worker threads to use for parallel processing. "
+ "Controls the level of concurrency for operations like view processing.",
+ )
+

  @platform_name("Teradata")
  @config_class(TeradataConfig)
  @support_status(SupportStatus.TESTING)
  @capability(SourceCapability.DOMAINS, "Enabled by default")
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(
+ SourceCapability.CONTAINERS,
+ "Enabled by default",
+ subtype_modifier=[
+ SourceCapabilityModifier.DATABASE,
+ ],
+ )
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
- @capability(SourceCapability.DELETION_DETECTION, "Optionally enabled via configuration")
+ @capability(
+ SourceCapability.DELETION_DETECTION,
+ "Enabled by default when stateful ingestion is turned on",
+ )
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
  @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
  @capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration")
@@ -461,13 +580,7 @@ class TeradataSource(TwoTierSQLAlchemySource):

  config: TeradataConfig

- LINEAGE_QUERY_DATABASE_FILTER: str = """and default_database IN ({databases})"""
-
- LINEAGE_TIMESTAMP_BOUND_QUERY: str = """
- SELECT MIN(CollectTimeStamp) as "min_ts", MAX(CollectTimeStamp) as "max_ts" from DBC.QryLogV
- """.strip()
-
- QUERY_TEXT_QUERY: str = """
+ QUERY_TEXT_CURRENT_QUERIES: str = """
  SELECT
  s.QueryID as "query_id",
  UserName as "user",
@@ -500,10 +613,89 @@ class TeradataSource(TwoTierSQLAlchemySource):
  and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
  and default_database not in ('DEMONOW_MONITOR')
  {databases_filter}
- ORDER BY "query_id", "row_no"
+ ORDER BY "timestamp", "query_id", "row_no"
  """.strip()

- TABLES_AND_VIEWS_QUERY: str = """
+ QUERY_TEXT_HISTORICAL_UNION: str = """
+ SELECT
+ "query_id",
+ "user",
+ "timestamp",
+ default_database,
+ "query_text",
+ "row_no"
+ FROM (
+ SELECT
+ h.QueryID as "query_id",
+ h.UserName as "user",
+ h.StartTime AT TIME ZONE 'GMT' as "timestamp",
+ h.DefaultDatabase as default_database,
+ h.SqlTextInfo as "query_text",
+ h.SqlRowNo as "row_no"
+ FROM "PDCRINFO".DBQLSqlTbl_Hst as h
+ WHERE
+ h.ErrorCode = 0
+ AND h.statementtype not in (
+ 'Unrecognized type',
+ 'Create Database/User',
+ 'Help',
+ 'Modify Database',
+ 'Drop Table',
+ 'Show',
+ 'Not Applicable',
+ 'Grant',
+ 'Abort',
+ 'Database',
+ 'Flush Query Logging',
+ 'Null',
+ 'Begin/End DBQL',
+ 'Revoke'
+ )
+ and h.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
+ and h.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
+ and h.CollectTimeStamp >= TIMESTAMP '{start_time}'
+ and h.DefaultDatabase not in ('DEMONOW_MONITOR')
+ {databases_filter_history}
+
+ UNION
+
+ SELECT
+ s.QueryID as "query_id",
+ l.UserName as "user",
+ l.StartTime AT TIME ZONE 'GMT' as "timestamp",
+ l.DefaultDatabase as default_database,
+ s.SqlTextInfo as "query_text",
+ s.SqlRowNo as "row_no"
+ FROM "DBC".QryLogV as l
+ JOIN "DBC".QryLogSqlV as s on s.QueryID = l.QueryID
+ WHERE
+ l.ErrorCode = 0
+ AND l.statementtype not in (
+ 'Unrecognized type',
+ 'Create Database/User',
+ 'Help',
+ 'Modify Database',
+ 'Drop Table',
+ 'Show',
+ 'Not Applicable',
+ 'Grant',
+ 'Abort',
+ 'Database',
+ 'Flush Query Logging',
+ 'Null',
+ 'Begin/End DBQL',
+ 'Revoke'
+ )
+ and l.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
+ and l.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
+ and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
+ and l.DefaultDatabase not in ('DEMONOW_MONITOR')
+ {databases_filter}
+ ) as combined_results
+ ORDER BY "timestamp", "query_id", "row_no"
+ """.strip()
+
+ TABLES_AND_VIEWS_QUERY: str = f"""
  SELECT
  t.DataBaseName,
  t.TableName as name,
@@ -521,77 +713,52 @@ SELECT
  t.LastAlterTimeStamp,
  t.RequestText
  FROM dbc.TablesV t
- WHERE DataBaseName NOT IN (
- 'All',
- 'Crashdumps',
- 'Default',
- 'DemoNow_Monitor',
- 'EXTUSER',
- 'External_AP',
- 'GLOBAL_FUNCTIONS',
- 'LockLogShredder',
- 'PUBLIC',
- 'SQLJ',
- 'SYSBAR',
- 'SYSJDBC',
- 'SYSLIB',
- 'SYSSPATIAL',
- 'SYSUDTLIB',
- 'SYSUIF',
- 'SysAdmin',
- 'Sys_Calendar',
- 'SystemFe',
- 'TDBCMgmt',
- 'TDMaps',
- 'TDPUSER',
- 'TDQCD',
- 'TDStats',
- 'TD_ANALYTICS_DB',
- 'TD_SERVER_DB',
- 'TD_SYSFNLIB',
- 'TD_SYSGPL',
- 'TD_SYSXML',
- 'TDaaS_BAR',
- 'TDaaS_DB',
- 'TDaaS_Maint',
- 'TDaaS_Monitor',
- 'TDaaS_Support',
- 'TDaaS_TDBCMgmt1',
- 'TDaaS_TDBCMgmt2',
- 'dbcmngr',
- 'mldb',
- 'system',
- 'tapidb',
- 'tdwm',
- 'val',
- 'dbc'
- )
+ WHERE DataBaseName NOT IN ({",".join([f"'{db}'" for db in EXCLUDED_DATABASES])})
  AND t.TableKind in ('T', 'V', 'Q', 'O')
  ORDER by DataBaseName, TableName;
  """.strip()

  _tables_cache: MutableMapping[str, List[TeradataTable]] = defaultdict(list)
+ _tables_cache_lock = Lock() # Protect shared cache from concurrent access
+ _pooled_engine: Optional[Engine] = None # Reusable pooled engine
+ _pooled_engine_lock = Lock() # Protect engine creation

  def __init__(self, config: TeradataConfig, ctx: PipelineContext):
  super().__init__(config, ctx, "teradata")

  self.report: TeradataReport = TeradataReport()
  self.graph: Optional[DataHubGraph] = ctx.graph
+ self._report_lock = Lock() # Thread safety for report counters
+
+ self.schema_resolver = self._init_schema_resolver()

- self.builder: SqlParsingBuilder = SqlParsingBuilder(
- usage_config=(
- self.config.usage if self.config.include_usage_statistics else None
- ),
- generate_lineage=True,
+ # Initialize SqlParsingAggregator for modern lineage processing
+ logger.info("Initializing SqlParsingAggregator for enhanced lineage processing")
+ self.aggregator = SqlParsingAggregator(
+ platform="teradata",
+ platform_instance=self.config.platform_instance,
+ env=self.config.env,
+ schema_resolver=self.schema_resolver,
+ graph=self.ctx.graph,
+ generate_lineage=self.config.include_view_lineage
+ or self.config.include_table_lineage,
+ generate_queries=self.config.include_queries,
  generate_usage_statistics=self.config.include_usage_statistics,
- generate_operations=self.config.usage.include_operational_stats,
+ generate_query_usage_statistics=self.config.include_usage_statistics,
+ generate_operations=self.config.usage.include_operational_stats
+ if self.config.include_usage_statistics
+ else False,
+ usage_config=self.config.usage
+ if self.config.include_usage_statistics
+ else None,
+ eager_graph_load=False,
  )
-
- self.schema_resolver = self._init_schema_resolver()
+ self.report.sql_aggregator = self.aggregator.report

  if self.config.include_tables or self.config.include_views:
- self.cache_tables_and_views()
- logger.info(f"Found {len(self._tables_cache)} tables and views")
+ with self.report.new_stage("Table and view discovery"):
+ self.cache_tables_and_views()
+ logger.info(f"Found {len(self._tables_cache)} tables and views")
  setattr(self, "loop_tables", self.cached_loop_tables) # noqa: B010
  setattr(self, "loop_views", self.cached_loop_views) # noqa: B010
  setattr( # noqa: B010
@@ -721,6 +888,8 @@ ORDER by DataBaseName, TableName;

  logger.debug(f"sql_alchemy_url={url}")
  engine = create_engine(url, **self.config.options)
+
+ # Get list of databases first
  with engine.connect() as conn:
  inspector = inspect(conn)
  if self.config.database and self.config.database != "":
@@ -729,13 +898,14 @@ ORDER by DataBaseName, TableName;
  databases = self.config.databases
  else:
  databases = inspector.get_schema_names()
- for db in databases:
- if self.config.database_pattern.allowed(db):
- # url = self.config.get_sql_alchemy_url(current_db=db)
- # with create_engine(url, **self.config.options).connect() as conn:
- # inspector = inspect(conn)
- inspector._datahub_database = db
- yield inspector
+
+ # Create separate connections for each database to avoid connection lifecycle issues
+ for db in databases:
+ if self.config.database_pattern.allowed(db):
+ with engine.connect() as conn:
+ db_inspector = inspect(conn)
+ db_inspector._datahub_database = db
+ yield db_inspector

  def get_db_name(self, inspector: Inspector) -> str:
  if hasattr(inspector, "_datahub_database"):
@@ -753,14 +923,15 @@ ORDER by DataBaseName, TableName;
  inspector: Inspector,
  schema: str,
  sql_config: SQLCommonConfig,
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
+ ) -> Iterable[MetadataWorkUnit]:
  setattr( # noqa: B010
  inspector,
  "get_table_names",
  lambda schema: [
  i.name
  for i in filter(
- lambda t: t.object_type != "View", self._tables_cache[schema]
+ lambda t: t.object_type != "View",
+ self._tables_cache.get(schema, []),
  )
  ],
  )
@@ -776,7 +947,8 @@ ORDER by DataBaseName, TableName;
  # this method and provide a location.
  location: Optional[str] = None

- for entry in self._tables_cache[schema]:
+ cache_entries = self._tables_cache.get(schema, [])
+ for entry in cache_entries:
  if entry.name == table:
  description = entry.description
  if entry.object_type == "View" and entry.request_text:
@@ -789,123 +961,734 @@ ORDER by DataBaseName, TableName;
789
961
  inspector: Inspector,
790
962
  schema: str,
791
963
  sql_config: SQLCommonConfig,
792
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
793
- setattr( # noqa: B010
794
- inspector,
795
- "get_view_names",
796
- lambda schema: [
797
- i.name
798
- for i in filter(
799
- lambda t: t.object_type == "View", self._tables_cache[schema]
964
+ ) -> Iterable[MetadataWorkUnit]:
965
+ start_time = time.time()
966
+
967
+ # Get view names from cache
968
+ view_names = [
969
+ i.name
970
+ for i in filter(
971
+ lambda t: t.object_type == "View", self._tables_cache.get(schema, [])
972
+ )
973
+ ]
974
+ actual_view_count = len(view_names)
975
+
976
+ if actual_view_count == 0:
977
+ end_time = time.time()
978
+ processing_time = end_time - start_time
979
+ logger.info(
980
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds (0 views, 0 work units)"
981
+ )
982
+ return
983
+
984
+ # Use custom threading implementation with connection pooling
985
+ work_unit_count = 0
986
+
987
+ for work_unit in self._loop_views_with_connection_pool(
988
+ view_names, schema, sql_config
989
+ ):
990
+ work_unit_count += 1
991
+ yield work_unit
992
+
993
+ end_time = time.time()
994
+ processing_time = end_time - start_time
995
+
996
+ logger.info(
997
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds ({actual_view_count} views, {work_unit_count} work units)"
998
+ )
999
+
1000
+ # Update report timing metrics
1001
+ if hasattr(self, "report"):
1002
+ self.report.view_extraction_total_time_seconds += processing_time
1003
+ self.report.num_views_processed += actual_view_count
1004
+
1005
+ # Track slowest view processing at view level (will be updated by individual view processing)
1006
+ # Note: slowest_view_name now tracks individual views, not schemas
1007
+
1008
+ # Calculate average processing time per view
1009
+ if self.report.num_views_processed > 0:
1010
+ self.report.view_extraction_average_time_seconds = (
1011
+ self.report.view_extraction_total_time_seconds
1012
+ / self.report.num_views_processed
800
1013
  )
801
- ],
1014
+
1015
+ def _loop_views_with_connection_pool(
1016
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
1017
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
1018
+ """
1019
+ Process views using individual database connections per thread for true parallelization.
1020
+
1021
+ Each thread gets its own connection from a QueuePool, enabling true concurrent processing.
1022
+ """
1023
+ if self.config.max_workers == 1:
1024
+ # Single-threaded processing - no need for complexity
1025
+ yield from self._process_views_single_threaded(
1026
+ view_names, schema, sql_config
1027
+ )
1028
+ return
1029
+
1030
+ logger.info(
1031
+ f"Processing {len(view_names)} views with {self.config.max_workers} worker threads"
802
1032
  )
803
- yield from super().loop_views(inspector, schema, sql_config)
804
1033
 
805
- def cache_tables_and_views(self) -> None:
1034
+ # Get or create reusable pooled engine
1035
+ engine = self._get_or_create_pooled_engine()
1036
+
1037
+ try:
1038
+ # Thread-safe result collection
1039
+ report_lock = Lock()
1040
+
1041
+ def process_single_view(
1042
+ view_name: str,
1043
+ ) -> List[Union[MetadataWorkUnit, Any]]:
1044
+ """Process a single view with its own database connection."""
1045
+ results: List[Union[MetadataWorkUnit, Any]] = []
1046
+
1047
+ # Detailed timing measurements for bottleneck analysis
1048
+ timings = {
1049
+ "connection_acquire": 0.0,
1050
+ "view_processing": 0.0,
1051
+ "work_unit_generation": 0.0,
1052
+ "total": 0.0,
1053
+ }
1054
+
1055
+ total_start = time.time()
1056
+ try:
1057
+ # Measure connection acquisition time
1058
+ conn_start = time.time()
1059
+ with engine.connect() as conn:
1060
+ timings["connection_acquire"] = time.time() - conn_start
1061
+
1062
+ # Update connection pool metrics
1063
+ with report_lock:
1064
+ pool_wait_time = timings["connection_acquire"]
1065
+ self.report.connection_pool_wait_time_seconds += (
1066
+ pool_wait_time
1067
+ )
1068
+ if (
1069
+ pool_wait_time
1070
+ > self.report.connection_pool_max_wait_time_seconds
1071
+ ):
1072
+ self.report.connection_pool_max_wait_time_seconds = (
1073
+ pool_wait_time
1074
+ )
1075
+
1076
+ # Measure view processing setup
1077
+ processing_start = time.time()
1078
+ thread_inspector = inspect(conn)
1079
+ # Inherit database information for Teradata two-tier architecture
1080
+ thread_inspector._datahub_database = schema # type: ignore
1081
+
1082
+ dataset_name = self.get_identifier(
1083
+ schema=schema, entity=view_name, inspector=thread_inspector
1084
+ )
1085
+
1086
+ # Thread-safe reporting
1087
+ with report_lock:
1088
+ self.report.report_entity_scanned(
1089
+ dataset_name, ent_type="view"
1090
+ )
1091
+
1092
+ if not sql_config.view_pattern.allowed(dataset_name):
1093
+ with report_lock:
1094
+ self.report.report_dropped(dataset_name)
1095
+ return results
1096
+
1097
+ timings["view_processing"] = time.time() - processing_start
1098
+
1099
+ # Measure work unit generation
1100
+ wu_start = time.time()
1101
+ for work_unit in self._process_view(
1102
+ dataset_name=dataset_name,
1103
+ inspector=thread_inspector,
1104
+ schema=schema,
1105
+ view=view_name,
1106
+ sql_config=sql_config,
1107
+ ):
1108
+ results.append(work_unit)
1109
+ timings["work_unit_generation"] = time.time() - wu_start
1110
+
1111
+ # Track individual view timing
1112
+ timings["total"] = time.time() - total_start
1113
+
1114
+ with report_lock:
1115
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1116
+ timings["total"]
1117
+ )
1118
+
1119
+ except Exception as e:
1120
+ with report_lock:
1121
+ self.report.num_view_processing_failures += 1
1122
+ # Log full exception details for debugging
1123
+ import traceback
1124
+
1125
+ full_traceback = traceback.format_exc()
1126
+ logger.error(
1127
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1128
+ )
1129
+ logger.error(f"Full traceback: {full_traceback}")
1130
+ self.report.warning(
1131
+ f"Error processing view {schema}.{view_name}",
1132
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1133
+ exc=e,
1134
+ )
1135
+
1136
+ return results
1137
+
1138
+ # Use ThreadPoolExecutor for concurrent processing
1139
+ with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
1140
+ # Submit all view processing tasks
1141
+ future_to_view = {
1142
+ executor.submit(process_single_view, view_name): view_name
1143
+ for view_name in view_names
1144
+ }
1145
+
1146
+ # Process completed tasks as they finish
1147
+ for future in as_completed(future_to_view):
1148
+ view_name = future_to_view[future]
1149
+ try:
1150
+ results = future.result()
1151
+ # Yield all results from this view
1152
+ for result in results:
1153
+ yield result
1154
+ except Exception as e:
1155
+ with report_lock:
1156
+ self.report.warning(
1157
+ "Error in thread processing view",
1158
+ context=f"{schema}.{view_name}",
1159
+ exc=e,
1160
+ )
1161
+
1162
+ finally:
1163
+ # Don't dispose the reusable engine here - it will be cleaned up in close()
1164
+ pass
1165
+
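The threaded path above follows a standard submit/as_completed fan-out: each view gets its own task and pooled connection, results are yielded as soon as any task finishes, and shared report state is guarded by a lock. A minimal sketch of that pattern in isolation; fan_out, worker, and items are illustrative names, not part of this module:

from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Callable, Iterable, List, Tuple, TypeVar

T = TypeVar("T")
R = TypeVar("R")

def fan_out(items: Iterable[T], worker: Callable[[T], List[R]], max_workers: int = 8) -> Iterable[R]:
    # Submit one task per item and yield each task's results as it completes.
    lock = Lock()
    failures: List[Tuple[T, Exception]] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(worker, item): item for item in items}
        for future in as_completed(futures):
            try:
                yield from future.result()
            except Exception as exc:
                with lock:  # same role as report_lock above: protect shared bookkeeping
                    failures.append((futures[future], exc))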
1166
+ def _process_views_single_threaded(
1167
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
1168
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
1169
+ """Process views sequentially with a single connection."""
806
1170
  engine = self.get_metadata_engine()
807
- for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
808
- table = TeradataTable(
809
- database=entry.DataBaseName.strip(),
810
- name=entry.name.strip(),
811
- description=entry.description.strip() if entry.description else None,
812
- object_type=entry.object_type,
813
- create_timestamp=entry.CreateTimeStamp,
814
- last_alter_name=entry.LastAlterName,
815
- last_alter_timestamp=entry.LastAlterTimeStamp,
816
- request_text=(
817
- entry.RequestText.strip()
818
- if entry.object_type == "View" and entry.RequestText
819
- else None
820
- ),
1171
+
1172
+ try:
1173
+ with engine.connect() as conn:
1174
+ inspector = inspect(conn)
1175
+
1176
+ for view_name in view_names:
1177
+ view_start_time = time.time()
1178
+ try:
1179
+ dataset_name = self.get_identifier(
1180
+ schema=schema, entity=view_name, inspector=inspector
1181
+ )
1182
+
1183
+ self.report.report_entity_scanned(dataset_name, ent_type="view")
1184
+
1185
+ if not sql_config.view_pattern.allowed(dataset_name):
1186
+ self.report.report_dropped(dataset_name)
1187
+ continue
1188
+
1189
+ # Process the view and yield results
1190
+ for work_unit in self._process_view(
1191
+ dataset_name=dataset_name,
1192
+ inspector=inspector,
1193
+ schema=schema,
1194
+ view=view_name,
1195
+ sql_config=sql_config,
1196
+ ):
1197
+ yield work_unit
1198
+
1199
+ # Track individual view timing
1200
+ view_end_time = time.time()
1201
+ view_processing_time = view_end_time - view_start_time
1202
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1203
+ view_processing_time
1204
+ )
1205
+
1206
+ except Exception as e:
1207
+ # Log full exception details for debugging
1208
+ import traceback
1209
+
1210
+ full_traceback = traceback.format_exc()
1211
+ logger.error(
1212
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1213
+ )
1214
+ logger.error(f"Full traceback: {full_traceback}")
1215
+ self.report.warning(
1216
+ f"Error processing view {schema}.{view_name}",
1217
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1218
+ exc=e,
1219
+ )
1220
+
1221
+ finally:
1222
+ engine.dispose()
1223
+
1224
+ def _get_or_create_pooled_engine(self) -> Engine:
1225
+ """Get or create a reusable SQLAlchemy engine with QueuePool for concurrent connections."""
1226
+ with self._pooled_engine_lock:
1227
+ if self._pooled_engine is None:
1228
+ url = self.config.get_sql_alchemy_url()
1229
+
1230
+ # Optimal connection pool sizing to match max_workers exactly
1231
+ # The Teradata driver can be sensitive to high connection counts, so cap at a reasonable limit
1232
+ max_safe_connections = (
1233
+ 13 # Conservative limit: 8 base + 5 overflow for Teradata stability
1234
+ )
1235
+
1236
+ # Adjust max_workers to match available connection pool capacity
1237
+ effective_max_workers = min(
1238
+ self.config.max_workers, max_safe_connections
1239
+ )
1240
+
1241
+ # Set pool size to match effective workers for optimal performance
1242
+ base_connections = min(
1243
+ effective_max_workers, 8
1244
+ ) # Reasonable base connections
1245
+ max_overflow = (
1246
+ effective_max_workers - base_connections
1247
+ ) # Remaining as overflow
1248
+
1249
+ # Log adjustment if max_workers was reduced
1250
+ if effective_max_workers < self.config.max_workers:
1251
+ logger.warning(
1252
+ f"Reduced max_workers from {self.config.max_workers} to {effective_max_workers} to match Teradata connection pool capacity"
1253
+ )
1254
+
1255
+ # Update the config to reflect the effective value used
1256
+ self.config.max_workers = effective_max_workers
1257
+
1258
+ pool_options = {
1259
+ **self.config.options,
1260
+ "poolclass": QueuePool,
1261
+ "pool_size": base_connections,
1262
+ "max_overflow": max_overflow,
1263
+ "pool_pre_ping": True, # Validate connections
1264
+ "pool_recycle": 1800, # Recycle connections after 30 mins (more frequent)
1265
+ "pool_timeout": 60, # Longer timeout for connection acquisition
1266
+ "pool_reset_on_return": "rollback", # Explicit rollback on connection return
1267
+ }
1268
+
1269
+ # Add Teradata-specific connection options for stability
1270
+ if "connect_args" not in pool_options:
1271
+ pool_options["connect_args"] = {}
1272
+
1273
+ # Teradata-specific connection arguments for better stability
1274
+ pool_options["connect_args"].update(
1275
+ {
1276
+ "connect_timeout": "30000", # Connection timeout in ms (30 seconds)
1277
+ "request_timeout": "120000", # Request timeout in ms (2 minutes)
1278
+ }
1279
+ )
1280
+
1281
+ self._pooled_engine = create_engine(url, **pool_options)
1282
+ logger.info(
1283
+ f"Created optimized Teradata connection pool: {base_connections} base + {max_overflow} overflow = {base_connections + max_overflow} max connections (matching {effective_max_workers} workers)"
1284
+ )
1285
+
1286
+ return self._pooled_engine
1287
+
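The sizing logic above reduces to: cap concurrency at 13 connections, keep at most 8 open in the pool, and let the remainder spill into overflow. A minimal sketch using only documented SQLAlchemy create_engine/QueuePool parameters; make_pooled_engine is an illustrative name, and the Teradata-specific connect_args from the source are omitted:

from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.pool import QueuePool

def make_pooled_engine(url: str, max_workers: int) -> Engine:
    workers = min(max_workers, 13)         # conservative cap: 8 base + 5 overflow
    pool_size = min(workers, 8)            # connections kept open in the pool
    return create_engine(
        url,
        poolclass=QueuePool,
        pool_size=pool_size,
        max_overflow=workers - pool_size,  # extra connections allowed under load
        pool_pre_ping=True,                # validate a connection before reuse
        pool_recycle=1800,                 # retire connections after 30 minutes
        pool_timeout=60,                   # seconds to wait for a free connection
    )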
1288
+ def cache_tables_and_views(self) -> None:
1289
+ with self.report.new_stage("Cache tables and views"):
1290
+ engine = self.get_metadata_engine()
1291
+ try:
1292
+ database_counts: Dict[str, Dict[str, int]] = defaultdict(
1293
+ lambda: {"tables": 0, "views": 0}
1294
+ )
1295
+
1296
+ for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
1297
+ table = TeradataTable(
1298
+ database=entry.DataBaseName.strip(),
1299
+ name=entry.name.strip(),
1300
+ description=entry.description.strip()
1301
+ if entry.description
1302
+ else None,
1303
+ object_type=entry.object_type,
1304
+ create_timestamp=entry.CreateTimeStamp,
1305
+ last_alter_name=entry.LastAlterName,
1306
+ last_alter_timestamp=entry.LastAlterTimeStamp,
1307
+ request_text=(
1308
+ entry.RequestText.strip()
1309
+ if entry.object_type == "View" and entry.RequestText
1310
+ else None
1311
+ ),
1312
+ )
1313
+
1314
+ # Count objects per database for metrics
1315
+ if table.object_type == "View":
1316
+ database_counts[table.database]["views"] += 1
1317
+ else:
1318
+ database_counts[table.database]["tables"] += 1
1319
+
1320
+ with self._tables_cache_lock:
1321
+ if table.database not in self._tables_cache:
1322
+ self._tables_cache[table.database] = []
1323
+ self._tables_cache[table.database].append(table)
1324
+
1325
+ for database, counts in database_counts.items():
1326
+ self.report.num_database_tables_to_scan[database] = counts["tables"]
1327
+ self.report.num_database_views_to_scan[database] = counts["views"]
1328
+
1329
+ finally:
1330
+ engine.dispose()
1331
+
1332
+ def _reconstruct_queries_streaming(
1333
+ self, entries: Iterable[Any]
1334
+ ) -> Iterable[ObservedQuery]:
1335
+ """Reconstruct complete queries from database entries in streaming fashion.
1336
+
1337
+ This method processes entries in order and reconstructs multi-row queries
1338
+ by concatenating rows with the same query_id.
1339
+ """
1340
+ current_query_id = None
1341
+ current_query_parts = []
1342
+ current_query_metadata = None
1343
+
1344
+ for entry in entries:
1345
+ # Count each audit query entry processed
1346
+ self.report.num_audit_query_entries_processed += 1
1347
+
1348
+ query_id = getattr(entry, "query_id", None)
1349
+ query_text = str(getattr(entry, "query_text", ""))
1350
+
1351
+ if query_id != current_query_id:
1352
+ # New query started - yield the previous one if it exists
1353
+ if current_query_id is not None and current_query_parts:
1354
+ yield self._create_observed_query_from_parts(
1355
+ current_query_parts, current_query_metadata
1356
+ )
1357
+
1358
+ # Start new query
1359
+ current_query_id = query_id
1360
+ current_query_parts = [query_text] if query_text else []
1361
+ current_query_metadata = entry
1362
+ else:
1363
+ # Same query - append the text
1364
+ if query_text:
1365
+ current_query_parts.append(query_text)
1366
+
1367
+ # Yield the last query if it exists
1368
+ if current_query_id is not None and current_query_parts:
1369
+ yield self._create_observed_query_from_parts(
1370
+ current_query_parts, current_query_metadata
821
1371
  )
822
- if table.database not in self._tables_cache:
823
- self._tables_cache[table.database] = []
824
1372
 
825
- self._tables_cache[table.database].append(table)
1373
+ def _create_observed_query_from_parts(
1374
+ self, query_parts: List[str], metadata_entry: Any
1375
+ ) -> ObservedQuery:
1376
+ """Create ObservedQuery from reconstructed query parts and metadata."""
1377
+ # Join all parts to form the complete query
1378
+ # Teradata fragments are split at fixed lengths without artificial breaks
1379
+ full_query_text = "".join(query_parts)
1380
+
1381
+ # Extract metadata
1382
+ session_id = getattr(metadata_entry, "session_id", None)
1383
+ timestamp = getattr(metadata_entry, "timestamp", None)
1384
+ user = getattr(metadata_entry, "user", None)
1385
+ default_database = getattr(metadata_entry, "default_database", None)
1386
+
1387
+ # Apply Teradata-specific query transformations
1388
+ cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", full_query_text)
1389
+
1390
+ # For Teradata's two-tier architecture (database.table), we should not set default_db
1391
+ # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
1392
+ # The SQL parser will treat database.table references correctly without default_db
1393
+ return ObservedQuery(
1394
+ query=cleaned_query,
1395
+ session_id=session_id,
1396
+ timestamp=timestamp,
1397
+ user=CorpUserUrn(user) if user else None,
1398
+ default_db=DEFAULT_NO_DATABASE_TERADATA, # Teradata uses two-tier database.table naming without default database prefixing
1399
+ default_schema=default_database,
1400
+ )
1401
+
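The clean-up step relies on a module-level NOT_CASESPECIFIC_PATTERN whose definition sits outside this hunk; the previous implementation's literal replace of "(NOT CASESPECIFIC)" (removed further down in this diff) suggests it behaves like the assumed pattern below, stripping the Teradata column attribute that the SQL parser cannot handle:

import re

# Assumed shape of NOT_CASESPECIFIC_PATTERN; the real definition lives elsewhere in the module.
NOT_CASESPECIFIC = re.compile(r"\(NOT CASESPECIFIC\)", re.IGNORECASE)

raw = "CREATE TABLE t (name VARCHAR(20) CHARACTER SET LATIN (NOT CASESPECIFIC))"
cleaned = NOT_CASESPECIFIC.sub("", raw)
# -> "CREATE TABLE t (name VARCHAR(20) CHARACTER SET LATIN )"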
1402
+ def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
1403
+ """Convert database query entry to ObservedQuery for SqlParsingAggregator.
1404
+
1405
+ DEPRECATED: This method is deprecated in favor of _reconstruct_queries_streaming
1406
+ which properly handles multi-row queries. This method does not handle queries
1407
+ that span multiple rows correctly and should not be used.
1408
+ """
1409
+ # Extract fields from database result
1410
+ query_text = str(entry.query_text).strip()
1411
+ session_id = getattr(entry, "session_id", None)
1412
+ timestamp = getattr(entry, "timestamp", None)
1413
+ user = getattr(entry, "user", None)
1414
+ default_database = getattr(entry, "default_database", None)
1415
+
1416
+ # Apply Teradata-specific query transformations
1417
+ cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", query_text)
1418
+
1419
+ # For Teradata's two-tier architecture (database.table), we should not set default_db
1420
+ # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
1421
+ # However, we should set default_schema for unqualified table references
1422
+ return ObservedQuery(
1423
+ query=cleaned_query,
1424
+ session_id=session_id,
1425
+ timestamp=timestamp,
1426
+ user=CorpUserUrn(user) if user else None,
1427
+ default_db=DEFAULT_NO_DATABASE_TERADATA, # Teradata uses two-tier database.table naming without default database prefixing
1428
+ default_schema=default_database, # Set default_schema for unqualified table references
1429
+ )
1430
+
1431
+ def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
1432
+ """Fetch lineage entries using server-side cursor to handle large result sets efficiently."""
1433
+ queries = self._make_lineage_queries()
1434
+
1435
+ fetch_engine = self.get_metadata_engine()
1436
+ try:
1437
+ with fetch_engine.connect() as conn:
1438
+ cursor_type = (
1439
+ "server-side"
1440
+ if self.config.use_server_side_cursors
1441
+ else "client-side"
1442
+ )
1443
+
1444
+ total_count_all_queries = 0
1445
+
1446
+ for query_index, query in enumerate(queries, 1):
1447
+ logger.info(
1448
+ f"Executing lineage query {query_index}/{len(queries)} for time range {self.config.start_time} to {self.config.end_time} with {cursor_type} cursor..."
1449
+ )
1450
+
1451
+ # Use helper method to try server-side cursor with fallback
1452
+ result = self._execute_with_cursor_fallback(conn, query)
1453
+
1454
+ # Stream results in batches to avoid memory issues
1455
+ batch_size = 5000
1456
+ batch_count = 0
1457
+ query_total_count = 0
1458
+
1459
+ while True:
1460
+ # Fetch a batch of rows
1461
+ batch = result.fetchmany(batch_size)
1462
+ if not batch:
1463
+ break
826
1464
 
827
- def get_audit_log_mcps(self, urns: Set[str]) -> Iterable[MetadataWorkUnit]:
1465
+ batch_count += 1
1466
+ query_total_count += len(batch)
1467
+ total_count_all_queries += len(batch)
1468
+
1469
+ logger.info(
1470
+ f"Query {query_index} - Fetched batch {batch_count}: {len(batch)} lineage entries (query total: {query_total_count})"
1471
+ )
1472
+ yield from batch
1473
+
1474
+ logger.info(
1475
+ f"Completed query {query_index}: {query_total_count} lineage entries in {batch_count} batches"
1476
+ )
1477
+
1478
+ logger.info(
1479
+ f"Completed fetching all queries: {total_count_all_queries} total lineage entries from {len(queries)} queries"
1480
+ )
1481
+
1482
+ except Exception as e:
1483
+ logger.error(f"Error fetching lineage entries: {e}")
1484
+ raise
1485
+ finally:
1486
+ fetch_engine.dispose()
1487
+
1488
+ def _check_historical_table_exists(self) -> bool:
1489
+ """
1490
+ Check if the PDCRINFO.DBQLSqlTbl_Hst table exists and is accessible.
1491
+ DBQL rows are periodically moved to the history table, so audit queries may no longer be present in DBC.
1492
+ There is no guarantee that the historical table exists, so we need to check for it.
1493
+
1494
+ Returns:
1495
+ bool: True if the historical table exists and is accessible, False otherwise.
1496
+ """
828
1497
  engine = self.get_metadata_engine()
829
- for entry in engine.execute(self._make_lineage_query()):
830
- self.report.num_queries_parsed += 1
831
- if self.report.num_queries_parsed % 1000 == 0:
832
- logger.info(f"Parsed {self.report.num_queries_parsed} queries")
833
-
834
- yield from self.gen_lineage_from_query(
835
- query=entry.query_text,
836
- default_database=entry.default_database,
837
- timestamp=entry.timestamp,
838
- user=entry.user,
839
- urns=urns,
1498
+ try:
1499
+ # Use a simple query to check if the table exists and is accessible
1500
+ check_query = """
1501
+ SELECT TOP 1 QueryID
1502
+ FROM PDCRINFO.DBQLSqlTbl_Hst
1503
+ WHERE 1=0
1504
+ """
1505
+ with engine.connect() as conn:
1506
+ conn.execute(text(check_query))
1507
+ logger.info(
1508
+ "Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is available"
1509
+ )
1510
+ return True
1511
+ except Exception as e:
1512
+ logger.info(
1513
+ f"Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is not available: {e}"
840
1514
  )
1515
+ return False
1516
+ finally:
1517
+ engine.dispose()
841
1518
 
842
- def _make_lineage_query(self) -> str:
1519
+ def _make_lineage_queries(self) -> List[str]:
843
1520
  databases_filter = (
844
1521
  ""
845
1522
  if not self.config.databases
846
- else "and default_database in ({databases})".format(
1523
+ else "and l.DefaultDatabase in ({databases})".format(
847
1524
  databases=",".join([f"'{db}'" for db in self.config.databases])
848
1525
  )
849
1526
  )
850
1527
 
851
- query = self.QUERY_TEXT_QUERY.format(
852
- start_time=self.config.start_time,
853
- end_time=self.config.end_time,
854
- databases_filter=databases_filter,
855
- )
856
- return query
1528
+ queries = []
857
1529
 
858
- def gen_lineage_from_query(
859
- self,
860
- query: str,
861
- default_database: Optional[str] = None,
862
- timestamp: Optional[datetime] = None,
863
- user: Optional[str] = None,
864
- view_urn: Optional[str] = None,
865
- urns: Optional[Set[str]] = None,
866
- ) -> Iterable[MetadataWorkUnit]:
867
- result = sqlglot_lineage(
868
- # With this clever hack we can make the query parser to not fail on queries with CASESPECIFIC
869
- sql=query.replace("(NOT CASESPECIFIC)", ""),
870
- schema_resolver=self.schema_resolver,
871
- default_db=None,
872
- default_schema=(
873
- default_database if default_database else self.config.default_db
874
- ),
875
- )
876
- if result.debug_info.table_error:
877
- logger.debug(
878
- f"Error parsing table lineage ({view_urn}):\n{result.debug_info.table_error}"
1530
+ # Check if historical lineage is configured and available
1531
+ if (
1532
+ self.config.include_historical_lineage
1533
+ and self._check_historical_table_exists()
1534
+ ):
1535
+ logger.info(
1536
+ "Using UNION query to combine historical and current lineage data to avoid duplicates"
1537
+ )
1538
+ # For the historical half of the UNION, rewrite the database filter to use the history table alias
1539
+ databases_filter_history = (
1540
+ databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase")
1541
+ if databases_filter
1542
+ else ""
1543
+ )
1544
+
1545
+ union_query = self.QUERY_TEXT_HISTORICAL_UNION.format(
1546
+ start_time=self.config.start_time,
1547
+ end_time=self.config.end_time,
1548
+ databases_filter=databases_filter,
1549
+ databases_filter_history=databases_filter_history,
879
1550
  )
880
- self.report.num_table_parse_failures += 1
1551
+ queries.append(union_query)
881
1552
  else:
882
- yield from self.builder.process_sql_parsing_result(
883
- result,
884
- query=query,
885
- is_view_ddl=view_urn is not None,
886
- query_timestamp=timestamp,
887
- user=f"urn:li:corpuser:{user}",
888
- include_urns=urns,
1553
+ if self.config.include_historical_lineage:
1554
+ logger.warning(
1555
+ "Historical lineage was requested but PDCRINFO.DBQLSqlTbl_Hst table is not available. Falling back to current data only."
1556
+ )
1557
+
1558
+ # Use current-only query when historical data is not available
1559
+ current_query = self.QUERY_TEXT_CURRENT_QUERIES.format(
1560
+ start_time=self.config.start_time,
1561
+ end_time=self.config.end_time,
1562
+ databases_filter=databases_filter,
889
1563
  )
1564
+ queries.append(current_query)
1565
+
1566
+ return queries
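To make the filter handling concrete: the current-data query filters on the alias l, and the historical half of the UNION reuses the same filter with the alias swapped to h. With hypothetical database names:

databases = ["sales_db", "marketing_db"]  # hypothetical config values
databases_filter = "and l.DefaultDatabase in ({})".format(
    ",".join(f"'{db}'" for db in databases)
)
# -> "and l.DefaultDatabase in ('sales_db','marketing_db')"
databases_filter_history = databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase")
# -> "and h.DefaultDatabase in ('sales_db','marketing_db')"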
890
1567
 
891
1568
  def get_metadata_engine(self) -> Engine:
892
1569
  url = self.config.get_sql_alchemy_url()
893
1570
  logger.debug(f"sql_alchemy_url={url}")
894
1571
  return create_engine(url, **self.config.options)
895
1572
 
896
- def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
897
- # Add all schemas to the schema resolver
898
- # Sql parser operates on lowercase urns so we need to lowercase the urns
899
- for wu in auto_lowercase_urns(super().get_workunits_internal()):
900
- urn = wu.get_urn()
901
- schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
902
- if schema_metadata:
903
- self.schema_resolver.add_schema_metadata(urn, schema_metadata)
904
- yield wu
905
-
906
- urns = self.schema_resolver.get_urns()
907
- if self.config.include_table_lineage or self.config.include_usage_statistics:
908
- with self.report.new_stage("Audit log extraction"):
909
- yield from self.get_audit_log_mcps(urns=urns)
910
-
911
- yield from self.builder.gen_workunits()
1573
+ def _execute_with_cursor_fallback(
1574
+ self, connection: Connection, query: str, params: Optional[Dict] = None
1575
+ ) -> Any:
1576
+ """
1577
+ Execute query with server-side cursor if enabled and supported, otherwise fall back to regular execution.
1578
+
1579
+ Args:
1580
+ connection: Database connection
1581
+ query: SQL query to execute
1582
+ params: Query parameters
1583
+
1584
+ Returns:
1585
+ Query result object
1586
+ """
1587
+ if self.config.use_server_side_cursors:
1588
+ try:
1589
+ # Try server-side cursor first
1590
+ if params:
1591
+ result = connection.execution_options(stream_results=True).execute(
1592
+ text(query), params
1593
+ )
1594
+ else:
1595
+ result = connection.execution_options(stream_results=True).execute(
1596
+ text(query)
1597
+ )
1598
+
1599
+ logger.debug(
1600
+ "Successfully using server-side cursor for query execution"
1601
+ )
1602
+ return result
1603
+
1604
+ except Exception as e:
1605
+ logger.warning(
1606
+ f"Server-side cursor failed, falling back to client-side execution: {e}"
1607
+ )
1608
+ # Fall through to regular execution
1609
+
1610
+ # Regular execution (client-side)
1611
+ if params:
1612
+ return connection.execute(text(query), params)
1613
+ else:
1614
+ return connection.execute(text(query))
1615
+
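Both the chunked lineage fetch and the cursor fallback above lean on the same SQLAlchemy pattern: request a server-side cursor via execution_options(stream_results=True) and drain the result with fetchmany so only one batch is held in memory at a time. A minimal standalone sketch; stream_rows and its parameters are illustrative:

from typing import Any, Iterable

from sqlalchemy import create_engine, text

def stream_rows(url: str, query: str, batch_size: int = 5000) -> Iterable[Any]:
    engine = create_engine(url)
    try:
        with engine.connect() as conn:
            # stream_results=True asks the driver for a server-side cursor where supported.
            result = conn.execution_options(stream_results=True).execute(text(query))
            while True:
                batch = result.fetchmany(batch_size)
                if not batch:
                    break
                yield from batch
    finally:
        engine.dispose()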
1616
+ def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
1617
+ """Override to prevent parent class from generating aggregator work units during schema extraction.
1618
+
1619
+ We handle aggregator generation manually after populating it with audit log data.
1620
+ """
1621
+ # Do nothing - we'll call the parent implementation manually after populating the aggregator
1622
+ return iter([])
1623
+
1624
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
1625
+ logger.info("Starting Teradata metadata extraction")
1626
+
1627
+ # Step 1: Schema extraction first (parent class will skip aggregator generation due to our override)
1628
+ with self.report.new_stage("Schema metadata extraction"):
1629
+ yield from super().get_workunits_internal()
1630
+ logger.info("Completed schema metadata extraction")
1631
+
1632
+ # Step 2: Lineage extraction after schema extraction
1633
+ # This allows lineage processing to have access to all discovered schema information
1634
+ with self.report.new_stage("Audit log extraction and lineage processing"):
1635
+ self._populate_aggregator_from_audit_logs()
1636
+ # Call parent implementation directly to generate aggregator work units
1637
+ yield from super()._generate_aggregator_workunits()
1638
+ logger.info("Completed lineage processing")
1639
+
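The override above exists purely for ordering: the parent pipeline would normally emit aggregator work units while it walks schemas, but here lineage must wait until the audit logs have been fed into the aggregator. A toy model of that control flow, with Base and TeradataLike as illustrative stand-ins for the real classes:

from typing import Iterable, Iterator

class Base:
    def _generate_aggregator_workunits(self) -> Iterable[str]:
        yield "lineage-workunit"

    def get_workunits_internal(self) -> Iterable[str]:
        yield "schema-workunit"
        yield from self._generate_aggregator_workunits()

class TeradataLike(Base):
    def _generate_aggregator_workunits(self) -> Iterator[str]:
        return iter([])  # suppress lineage while the parent walks schemas

    def get_workunits_internal(self) -> Iterable[str]:
        yield from super().get_workunits_internal()           # schemas only
        # ... populate the aggregator from audit logs here ...
        yield from super()._generate_aggregator_workunits()   # now emit lineage

print(list(TeradataLike().get_workunits_internal()))
# -> ['schema-workunit', 'lineage-workunit']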
1640
+ def _populate_aggregator_from_audit_logs(self) -> None:
1641
+ """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
1642
+ with self.report.new_stage("Lineage extraction from Teradata audit logs"):
1643
+ # Record the lineage query time range in the report
1644
+ self.report.lineage_start_time = self.config.start_time
1645
+ self.report.lineage_end_time = self.config.end_time
1646
+
1647
+ logger.info(
1648
+ f"Starting lineage extraction from Teradata audit logs (time range: {self.config.start_time} to {self.config.end_time})"
1649
+ )
1650
+
1651
+ if (
1652
+ self.config.include_table_lineage
1653
+ or self.config.include_usage_statistics
1654
+ ):
1655
+ # Step 1: Stream query entries from database with memory-efficient processing
1656
+ with self.report.new_stage("Fetching lineage entries from Audit Logs"):
1657
+ queries_processed = 0
1658
+
1659
+ # Use streaming query reconstruction for memory efficiency
1660
+ for observed_query in self._reconstruct_queries_streaming(
1661
+ self._fetch_lineage_entries_chunked()
1662
+ ):
1663
+ self.aggregator.add(observed_query)
1664
+
1665
+ queries_processed += 1
1666
+ if queries_processed % 10000 == 0:
1667
+ logger.info(
1668
+ f"Processed {queries_processed} queries to aggregator"
1669
+ )
1670
+
1671
+ if queries_processed == 0:
1672
+ logger.info("No lineage entries found")
1673
+ return
1674
+
1675
+ logger.info(
1676
+ f"Completed adding {queries_processed} queries to SqlParsingAggregator"
1677
+ )
1678
+
1679
+ logger.info("Completed lineage extraction from Teradata audit logs")
1680
+
1681
+ def close(self) -> None:
1682
+ """Clean up resources when source is closed."""
1683
+ logger.info("Closing SqlParsingAggregator")
1684
+ self.aggregator.close()
1685
+
1686
+ # Clean up pooled engine
1687
+ with self._pooled_engine_lock:
1688
+ if self._pooled_engine is not None:
1689
+ logger.info("Disposing pooled engine")
1690
+ self._pooled_engine.dispose()
1691
+ self._pooled_engine = None
1692
+
1693
+ # Report failed views summary
1694
+ super().close()