acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +12 -16
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +71 -13
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1446 -559
- datahub/metadata/_urns/urn_defs.py +1721 -1553
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +18055 -17802
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/teradata.py
@@ -1,8 +1,11 @@
 import logging
+import time
 from collections import defaultdict
-from
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
+from threading import Lock
 from typing import (
     Any,
     Dict,
@@ -10,7 +13,6 @@ from typing import (
     List,
     MutableMapping,
     Optional,
-    Set,
     Tuple,
     Union,
 )
@@ -29,7 +31,6 @@ from teradatasqlalchemy.options import configure
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -39,10 +40,9 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source_helpers import auto_lowercase_urns
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.source.sql.sql_common import
+from datahub.ingestion.source.sql.sql_common import register_custom_type
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.two_tier_sql_source import (
@@ -56,13 +56,64 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     BytesTypeClass,
     TimeTypeClass,
 )
-from datahub.metadata.
+from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    ObservedQuery,
+    SqlParsingAggregator,
+)
 from datahub.utilities.groupby import groupby_unsorted
+from datahub.utilities.stats_collections import TopKDict
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+# Common excluded databases used in multiple places
+EXCLUDED_DATABASES = [
+    "All",
+    "Crashdumps",
+    "Default",
+    "DemoNow_Monitor",
+    "EXTUSER",
+    "External_AP",
+    "GLOBAL_FUNCTIONS",
+    "LockLogShredder",
+    "PUBLIC",
+    "SQLJ",
+    "SYSBAR",
+    "SYSJDBC",
+    "SYSLIB",
+    "SYSSPATIAL",
+    "SYSUDTLIB",
+    "SYSUIF",
+    "SysAdmin",
+    "Sys_Calendar",
+    "SystemFe",
+    "TDBCMgmt",
+    "TDMaps",
+    "TDPUSER",
+    "TDQCD",
+    "TDStats",
+    "TD_ANALYTICS_DB",
+    "TD_SERVER_DB",
+    "TD_SYSFNLIB",
+    "TD_SYSGPL",
+    "TD_SYSXML",
+    "TDaaS_BAR",
+    "TDaaS_DB",
+    "TDaaS_Maint",
+    "TDaaS_Monitor",
+    "TDaaS_Support",
+    "TDaaS_TDBCMgmt1",
+    "TDaaS_TDBCMgmt2",
+    "dbcmngr",
+    "mldb",
+    "system",
+    "tapidb",
+    "tdwm",
+    "val",
+    "dbc",
+]
+
 register_custom_type(custom_types.JSON, BytesTypeClass)
 register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass)
 register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass)
@@ -99,14 +150,16 @@ class TeradataTable:
     request_text: Optional[str]
 
 
-#
+# Cache size of 1 is sufficient since schemas are processed sequentially
+# Note: This cache is per-process and helps when processing multiple tables in the same schema
 @lru_cache(maxsize=1)
 def get_schema_columns(
     self: Any, connection: Connection, dbc_columns: str, schema: str
 ) -> Dict[str, List[Any]]:
+    start_time = time.time()
     columns: Dict[str, List[Any]] = {}
-    columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) =
-    rows = connection.execute(text(columns_query)).fetchall()
+    columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) order by TableName, ColumnId"
+    rows = connection.execute(text(columns_query), {"schema": schema}).fetchall()
     for row in rows:
         row_mapping = row._mapping
         if row_mapping.TableName not in columns:
@@ -114,18 +167,29 @@ def get_schema_columns(
 
         columns[row_mapping.TableName].append(row_mapping)
 
+    end_time = time.time()
+    extraction_time = end_time - start_time
+    logger.info(
+        f"Column extraction for schema '{schema}' completed in {extraction_time:.2f} seconds"
+    )
+
+    # Update report if available
+    if hasattr(self, "report"):
+        self.report.column_extraction_duration_seconds += extraction_time
+
     return columns
 
 
-#
+# Cache size of 1 is sufficient since schemas are processed sequentially
+# Note: This cache is per-process and helps when processing multiple tables in the same schema
 @lru_cache(maxsize=1)
 def get_schema_pk_constraints(
     self: Any, connection: Connection, schema: str
 ) -> Dict[str, List[Any]]:
     dbc_indices = "IndicesV" + "X" if configure.usexviews else "IndicesV"
     primary_keys: Dict[str, List[Any]] = {}
-    stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) =
-    rows = connection.execute(text(stmt)).fetchall()
+    stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
+    rows = connection.execute(text(stmt), {"schema": schema}).fetchall()
     for row in rows:
         row_mapping = row._mapping
         if row_mapping.TableName not in primary_keys:
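Aside: the two hunks above switch the dbc.ColumnsV and dbc.IndicesV lookups from string-interpolated schema names to bound parameters. A minimal sketch of the same SQLAlchemy pattern, independent of the connector (the DSN and schema value are placeholders, not taken from this diff):

    # Sketch: pass the schema as a bind parameter instead of formatting it into the SQL string.
    from sqlalchemy import create_engine, text

    engine = create_engine("teradatasql://user:password@host")  # hypothetical DSN

    columns_query = text(
        "select * from dbc.ColumnsV "
        "where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) "
        "order by TableName, ColumnId"
    )

    with engine.connect() as conn:
        # The driver handles quoting, so the schema value never has to be escaped by hand.
        rows = conn.execute(columns_query, {"schema": "demo_db"}).fetchall()
        for row in rows:
            print(row._mapping["TableName"], row._mapping["ColumnName"])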
@@ -172,6 +236,10 @@ def optimized_get_pk_constraint(
             index_column.IndexName
         )  # There should be just one IndexName
 
+    # Update counter if available
+    if hasattr(self, "report"):
+        self.report.num_primary_keys_processed += 1
+
     return {"constrained_columns": index_columns, "name": index_name}
 
 
@@ -228,23 +296,55 @@ def optimized_get_columns(
         table_name, []
     )
 
+    start_time = time.time()
+
     final_column_info = []
     # Don't care about ART tables now
     # Ignore the non-functional column in a PTI table
     for row in res:
-
-
-
-
-
+        try:
+            col_info = self._get_column_info(row)
+
+            # Add CommentString as comment field for column description
+            if hasattr(row, "CommentString") and row.CommentString:
+                col_info["comment"] = row.CommentString.strip()
+            elif (
+                isinstance(row, dict)
+                and "CommentString" in row
+                and row["CommentString"]
             ):
-
-
+                col_info["comment"] = row["CommentString"].strip()
+
+            if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
+                if (
+                    col_info["ColumnName"] == "TD_TIMEBUCKET"
+                    and col_info["TSColumnType"].strip() == "TB"
+                ):
+                    continue
+            final_column_info.append(col_info)
+
+            # Update counter - access report through self from the connection context
+            if hasattr(self, "report"):
+                self.report.num_columns_processed += 1
+
+        except Exception as e:
+            logger.error(
+                f"Failed to process column {getattr(row, 'ColumnName', 'unknown')}: {e}"
+            )
+            if hasattr(self, "report"):
+                self.report.num_column_extraction_failures += 1
+            continue
+
+    # Update timing
+    if hasattr(self, "report"):
+        end_time = time.time()
+        self.report.column_extraction_duration_seconds += end_time - start_time
 
     return final_column_info
 
 
-#
+# Cache size of 1 is sufficient since schemas are processed sequentially
+# Note: This cache is per-process and helps when processing multiple tables in the same schema
 @lru_cache(maxsize=1)
 def get_schema_foreign_keys(
     self: Any, connection: Connection, schema: str
@@ -334,9 +434,24 @@ def optimized_get_view_definition(
 
 @dataclass
 class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
-
-
-
+    # View processing metrics (actively used)
+    num_views_processed: int = 0
+    num_view_processing_failures: int = 0
+    view_extraction_total_time_seconds: float = 0.0
+    view_extraction_average_time_seconds: float = 0.0
+    slowest_view_processing_time_seconds: float = 0.0
+    slowest_view_name: TopKDict[str, float] = field(default_factory=TopKDict)
+
+    # Connection pool performance metrics (actively used)
+    connection_pool_wait_time_seconds: float = 0.0
+    connection_pool_max_wait_time_seconds: float = 0.0
+
+    # Database-level metrics similar to BigQuery's approach (actively used)
+    num_database_tables_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
+    num_database_views_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
+
+    # Global metadata extraction timing (single query for all databases)
+    metadata_extraction_total_sec: float = 0.0
 
 
 class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
@@ -353,53 +468,7 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
     )
 
     database_pattern = Field(
-        default=AllowDenyPattern(
-            deny=[
-                "All",
-                "Crashdumps",
-                "Default",
-                "DemoNow_Monitor",
-                "EXTUSER",
-                "External_AP",
-                "GLOBAL_FUNCTIONS",
-                "LockLogShredder",
-                "PUBLIC",
-                "SQLJ",
-                "SYSBAR",
-                "SYSJDBC",
-                "SYSLIB",
-                "SYSSPATIAL",
-                "SYSUDTLIB",
-                "SYSUIF",
-                "SysAdmin",
-                "Sys_Calendar",
-                "SystemFe",
-                "TDBCMgmt",
-                "TDMaps",
-                "TDPUSER",
-                "TDQCD",
-                "TDStats",
-                "TD_ANALYTICS_DB",
-                "TD_SERVER_DB",
-                "TD_SYSFNLIB",
-                "TD_SYSGPL",
-                "TD_SYSXML",
-                "TDaaS_BAR",
-                "TDaaS_DB",
-                "TDaaS_Maint",
-                "TDaaS_Monitor",
-                "TDaaS_Support",
-                "TDaaS_TDBCMgmt1",
-                "TDaaS_TDBCMgmt2",
-                "dbcmngr",
-                "mldb",
-                "system",
-                "tapidb",
-                "tdwm",
-                "val",
-                "dbc",
-            ]
-        ),
+        default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
         description="Regex patterns for databases to filter in ingestion.",
     )
     include_table_lineage = Field(
@@ -413,6 +482,13 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
         description="Whether to include view lineage in the ingestion. "
         "This requires to have the view lineage feature enabled.",
     )
+
+    include_queries = Field(
+        default=True,
+        description="Whether to generate query entities for SQL queries. "
+        "Query entities provide metadata about individual SQL queries including "
+        "execution timestamps, user information, and query text.",
+    )
     usage: BaseUsageConfig = Field(
         description="The usage config to use when generating usage statistics",
         default=BaseUsageConfig(),
@@ -438,6 +514,26 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
         description="Whether to use QVCI to get column information. This is faster but requires to have QVCI enabled.",
     )
 
+    include_historical_lineage: bool = Field(
+        default=False,
+        description="Whether to include historical lineage data from PDCRINFO.DBQLSqlTbl_Hst in addition to current DBC.QryLogV data. "
+        "This provides access to historical query logs that may have been archived. "
+        "The historical table existence is checked automatically and gracefully falls back to current data only if not available.",
+    )
+
+    use_server_side_cursors: bool = Field(
+        default=True,
+        description="Enable server-side cursors for large result sets using SQLAlchemy's stream_results. "
+        "This reduces memory usage by streaming results from the database server. "
+        "Automatically falls back to client-side batching if server-side cursors are not supported.",
+    )
+
+    max_workers: int = Field(
+        default=10,
+        description="Maximum number of worker threads to use for parallel processing. "
+        "Controls the level of concurrency for operations like view processing.",
+    )
+
 
 @platform_name("Teradata")
 @config_class(TeradataConfig)
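For reference, the new include_queries, include_historical_lineage, use_server_side_cursors, and max_workers options slot into an ordinary Teradata ingestion recipe. The sketch below shows the dict form of such a recipe; only those four option names come from this diff, while the connection fields, their values, and the console sink are illustrative assumptions:

    # Hypothetical recipe sketch; host_port/username/password values are placeholders.
    from datahub.ingestion.run.pipeline import Pipeline

    recipe = {
        "source": {
            "type": "teradata",
            "config": {
                "host_port": "teradata.example.com:1025",
                "username": "dbc",
                "password": "dbc",
                "include_queries": True,             # emit query entities
                "include_historical_lineage": True,  # also read PDCRINFO.DBQLSqlTbl_Hst
                "use_server_side_cursors": True,     # stream large result sets
                "max_workers": 8,                    # parallel view processing
            },
        },
        "sink": {"type": "console"},
    }

    if __name__ == "__main__":
        Pipeline.create(recipe).run()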
@@ -445,7 +541,10 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
 @capability(SourceCapability.DOMAINS, "Enabled by default")
 @capability(SourceCapability.CONTAINERS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION,
+    "Enabled by default when stateful ingestion is turned on",
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
 @capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration")
@@ -461,13 +560,7 @@ class TeradataSource(TwoTierSQLAlchemySource):
 
     config: TeradataConfig
 
-
-
-    LINEAGE_TIMESTAMP_BOUND_QUERY: str = """
-    SELECT MIN(CollectTimeStamp) as "min_ts", MAX(CollectTimeStamp) as "max_ts" from DBC.QryLogV
-    """.strip()
-
-    QUERY_TEXT_QUERY: str = """
+    QUERY_TEXT_CURRENT_QUERIES: str = """
     SELECT
         s.QueryID as "query_id",
         UserName as "user",
@@ -500,10 +593,89 @@ class TeradataSource(TwoTierSQLAlchemySource):
         and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
         and default_database not in ('DEMONOW_MONITOR')
         {databases_filter}
-    ORDER BY "query_id", "row_no"
+    ORDER BY "timestamp", "query_id", "row_no"
+    """.strip()
+
+    QUERY_TEXT_HISTORICAL_UNION: str = """
+    SELECT
+        "query_id",
+        "user",
+        "timestamp",
+        default_database,
+        "query_text",
+        "row_no"
+    FROM (
+        SELECT
+            h.QueryID as "query_id",
+            h.UserName as "user",
+            h.StartTime AT TIME ZONE 'GMT' as "timestamp",
+            h.DefaultDatabase as default_database,
+            h.SqlTextInfo as "query_text",
+            h.SqlRowNo as "row_no"
+        FROM "PDCRINFO".DBQLSqlTbl_Hst as h
+        WHERE
+            h.ErrorCode = 0
+            AND h.statementtype not in (
+                'Unrecognized type',
+                'Create Database/User',
+                'Help',
+                'Modify Database',
+                'Drop Table',
+                'Show',
+                'Not Applicable',
+                'Grant',
+                'Abort',
+                'Database',
+                'Flush Query Logging',
+                'Null',
+                'Begin/End DBQL',
+                'Revoke'
+            )
+            and h.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
+            and h.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
+            and h.CollectTimeStamp >= TIMESTAMP '{start_time}'
+            and h.DefaultDatabase not in ('DEMONOW_MONITOR')
+            {databases_filter_history}
+
+        UNION
+
+        SELECT
+            s.QueryID as "query_id",
+            l.UserName as "user",
+            l.StartTime AT TIME ZONE 'GMT' as "timestamp",
+            l.DefaultDatabase as default_database,
+            s.SqlTextInfo as "query_text",
+            s.SqlRowNo as "row_no"
+        FROM "DBC".QryLogV as l
+        JOIN "DBC".QryLogSqlV as s on s.QueryID = l.QueryID
+        WHERE
+            l.ErrorCode = 0
+            AND l.statementtype not in (
+                'Unrecognized type',
+                'Create Database/User',
+                'Help',
+                'Modify Database',
+                'Drop Table',
+                'Show',
+                'Not Applicable',
+                'Grant',
+                'Abort',
+                'Database',
+                'Flush Query Logging',
+                'Null',
+                'Begin/End DBQL',
+                'Revoke'
+            )
+            and l.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
+            and l.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
+            and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
+            and l.DefaultDatabase not in ('DEMONOW_MONITOR')
+            {databases_filter}
+    ) as combined_results
+    ORDER BY "timestamp", "query_id", "row_no"
     """.strip()
 
-    TABLES_AND_VIEWS_QUERY: str = """
+    TABLES_AND_VIEWS_QUERY: str = f"""
     SELECT
         t.DataBaseName,
         t.TableName as name,
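The query templates above leave {start_time}, {end_time}, {databases_filter}, and {databases_filter_history} as str.format placeholders. A small sketch of how such a template is rendered (the filter fragment is invented for illustration; the real connector derives it from its database patterns):

    # Sketch: rendering a query template with str.format; the filter string is made up.
    template = (
        "and l.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'\n"
        "and l.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'\n"
        "{databases_filter}"
    )
    rendered = template.format(
        start_time="2024-01-01 00:00:00",
        end_time="2024-01-02 00:00:00",
        databases_filter="and l.DefaultDatabase in ('sales', 'marketing')",
    )
    print(rendered)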
@@ -521,77 +693,51 @@ SELECT
         t.LastAlterTimeStamp,
         t.RequestText
     FROM dbc.TablesV t
-    WHERE DataBaseName NOT IN (
-        'All',
-        'Crashdumps',
-        'Default',
-        'DemoNow_Monitor',
-        'EXTUSER',
-        'External_AP',
-        'GLOBAL_FUNCTIONS',
-        'LockLogShredder',
-        'PUBLIC',
-        'SQLJ',
-        'SYSBAR',
-        'SYSJDBC',
-        'SYSLIB',
-        'SYSSPATIAL',
-        'SYSUDTLIB',
-        'SYSUIF',
-        'SysAdmin',
-        'Sys_Calendar',
-        'SystemFe',
-        'TDBCMgmt',
-        'TDMaps',
-        'TDPUSER',
-        'TDQCD',
-        'TDStats',
-        'TD_ANALYTICS_DB',
-        'TD_SERVER_DB',
-        'TD_SYSFNLIB',
-        'TD_SYSGPL',
-        'TD_SYSXML',
-        'TDaaS_BAR',
-        'TDaaS_DB',
-        'TDaaS_Maint',
-        'TDaaS_Monitor',
-        'TDaaS_Support',
-        'TDaaS_TDBCMgmt1',
-        'TDaaS_TDBCMgmt2',
-        'dbcmngr',
-        'mldb',
-        'system',
-        'tapidb',
-        'tdwm',
-        'val',
-        'dbc'
-    )
+    WHERE DataBaseName NOT IN ({",".join([f"'{db}'" for db in EXCLUDED_DATABASES])})
     AND t.TableKind in ('T', 'V', 'Q', 'O')
     ORDER by DataBaseName, TableName;
     """.strip()
 
     _tables_cache: MutableMapping[str, List[TeradataTable]] = defaultdict(list)
+    _tables_cache_lock = Lock()  # Protect shared cache from concurrent access
+    _pooled_engine: Optional[Engine] = None  # Reusable pooled engine
+    _pooled_engine_lock = Lock()  # Protect engine creation
 
     def __init__(self, config: TeradataConfig, ctx: PipelineContext):
         super().__init__(config, ctx, "teradata")
 
         self.report: TeradataReport = TeradataReport()
         self.graph: Optional[DataHubGraph] = ctx.graph
+        self._report_lock = Lock()  # Thread safety for report counters
 
-        self.
-
-
-
-
+        self.schema_resolver = self._init_schema_resolver()
+
+        # Initialize SqlParsingAggregator for modern lineage processing
+        logger.info("Initializing SqlParsingAggregator for enhanced lineage processing")
+        self.aggregator = SqlParsingAggregator(
+            platform="teradata",
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+            schema_resolver=self.schema_resolver,
+            graph=self.ctx.graph,
+            generate_lineage=self.include_lineage,
+            generate_queries=self.config.include_queries,
             generate_usage_statistics=self.config.include_usage_statistics,
-
+            generate_query_usage_statistics=self.config.include_usage_statistics,
+            generate_operations=self.config.usage.include_operational_stats
+            if self.config.include_usage_statistics
+            else False,
+            usage_config=self.config.usage
+            if self.config.include_usage_statistics
+            else None,
+            eager_graph_load=False,
         )
-
-        self.schema_resolver = self._init_schema_resolver()
+        self.report.sql_aggregator = self.aggregator.report
 
         if self.config.include_tables or self.config.include_views:
-            self.
-
+            with self.report.new_stage("Table and view discovery"):
+                self.cache_tables_and_views()
+            logger.info(f"Found {len(self._tables_cache)} tables and views")
             setattr(self, "loop_tables", self.cached_loop_tables)  # noqa: B010
             setattr(self, "loop_views", self.cached_loop_views)  # noqa: B010
             setattr(  # noqa: B010
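The rewritten constructor above hands lineage, usage, and query generation to SqlParsingAggregator. As a rough sketch of how reconstructed queries typically flow into such an aggregator, assuming its add_observed_query/gen_metadata interface (those calls are not shown in this diff):

    # Hedged sketch: the aggregator arguments and the add_observed_query/gen_metadata
    # calls are assumptions about the aggregator's interface, not taken from this diff.
    from datetime import datetime, timezone

    from datahub.metadata.urns import CorpUserUrn
    from datahub.sql_parsing.sql_parsing_aggregator import (
        ObservedQuery,
        SqlParsingAggregator,
    )

    aggregator = SqlParsingAggregator(
        platform="teradata",
        env="PROD",
        generate_lineage=True,
        generate_queries=True,
    )

    aggregator.add_observed_query(
        ObservedQuery(
            query="insert into sales.daily select * from sales.staging",
            timestamp=datetime.now(timezone.utc),
            user=CorpUserUrn("etl_user"),
            default_db="sales",
            default_schema="sales",
        )
    )

    # Emit whatever lineage/query aspects the aggregator derived from the observed queries.
    for mcp in aggregator.gen_metadata():
        print(mcp.entityUrn)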
@@ -721,6 +867,8 @@ ORDER by DataBaseName, TableName;
 
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
+
+        # Get list of databases first
         with engine.connect() as conn:
             inspector = inspect(conn)
             if self.config.database and self.config.database != "":
@@ -729,13 +877,14 @@ ORDER by DataBaseName, TableName;
                 databases = self.config.databases
             else:
                 databases = inspector.get_schema_names()
-
-
-
-
-
-
-
+
+            # Create separate connections for each database to avoid connection lifecycle issues
+            for db in databases:
+                if self.config.database_pattern.allowed(db):
+                    with engine.connect() as conn:
+                        db_inspector = inspect(conn)
+                        db_inspector._datahub_database = db
+                        yield db_inspector
 
     def get_db_name(self, inspector: Inspector) -> str:
         if hasattr(inspector, "_datahub_database"):
@@ -753,14 +902,15 @@ ORDER by DataBaseName, TableName;
         inspector: Inspector,
         schema: str,
         sql_config: SQLCommonConfig,
-    ) -> Iterable[
+    ) -> Iterable[MetadataWorkUnit]:
         setattr(  # noqa: B010
             inspector,
             "get_table_names",
             lambda schema: [
                 i.name
                 for i in filter(
-                    lambda t: t.object_type != "View",
+                    lambda t: t.object_type != "View",
+                    self._tables_cache.get(schema, []),
                 )
             ],
         )
@@ -776,7 +926,8 @@ ORDER by DataBaseName, TableName;
         # this method and provide a location.
         location: Optional[str] = None
 
-
+        cache_entries = self._tables_cache.get(schema, [])
+        for entry in cache_entries:
             if entry.name == table:
                 description = entry.description
                 if entry.object_type == "View" and entry.request_text:
@@ -789,123 +940,734 @@ ORDER by DataBaseName, TableName;
|
|
|
789
940
|
inspector: Inspector,
|
|
790
941
|
schema: str,
|
|
791
942
|
sql_config: SQLCommonConfig,
|
|
792
|
-
) -> Iterable[
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
943
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
944
|
+
start_time = time.time()
|
|
945
|
+
|
|
946
|
+
# Get view names from cache
|
|
947
|
+
view_names = [
|
|
948
|
+
i.name
|
|
949
|
+
for i in filter(
|
|
950
|
+
lambda t: t.object_type == "View", self._tables_cache.get(schema, [])
|
|
951
|
+
)
|
|
952
|
+
]
|
|
953
|
+
actual_view_count = len(view_names)
|
|
954
|
+
|
|
955
|
+
if actual_view_count == 0:
|
|
956
|
+
end_time = time.time()
|
|
957
|
+
processing_time = end_time - start_time
|
|
958
|
+
logger.info(
|
|
959
|
+
f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds (0 views, 0 work units)"
|
|
960
|
+
)
|
|
961
|
+
return
|
|
962
|
+
|
|
963
|
+
# Use custom threading implementation with connection pooling
|
|
964
|
+
work_unit_count = 0
|
|
965
|
+
|
|
966
|
+
for work_unit in self._loop_views_with_connection_pool(
|
|
967
|
+
view_names, schema, sql_config
|
|
968
|
+
):
|
|
969
|
+
work_unit_count += 1
|
|
970
|
+
yield work_unit
|
|
971
|
+
|
|
972
|
+
end_time = time.time()
|
|
973
|
+
processing_time = end_time - start_time
|
|
974
|
+
|
|
975
|
+
logger.info(
|
|
976
|
+
f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds ({actual_view_count} views, {work_unit_count} work units)"
|
|
977
|
+
)
|
|
978
|
+
|
|
979
|
+
# Update report timing metrics
|
|
980
|
+
if hasattr(self, "report"):
|
|
981
|
+
self.report.view_extraction_total_time_seconds += processing_time
|
|
982
|
+
self.report.num_views_processed += actual_view_count
|
|
983
|
+
|
|
984
|
+
# Track slowest view processing at view level (will be updated by individual view processing)
|
|
985
|
+
# Note: slowest_view_name now tracks individual views, not schemas
|
|
986
|
+
|
|
987
|
+
# Calculate average processing time per view
|
|
988
|
+
if self.report.num_views_processed > 0:
|
|
989
|
+
self.report.view_extraction_average_time_seconds = (
|
|
990
|
+
self.report.view_extraction_total_time_seconds
|
|
991
|
+
/ self.report.num_views_processed
|
|
800
992
|
)
|
|
801
|
-
|
|
993
|
+
|
|
994
|
+
def _loop_views_with_connection_pool(
|
|
995
|
+
self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
|
|
996
|
+
) -> Iterable[Union[MetadataWorkUnit, Any]]:
|
|
997
|
+
"""
|
|
998
|
+
Process views using individual database connections per thread for true parallelization.
|
|
999
|
+
|
|
1000
|
+
Each thread gets its own connection from a QueuePool, enabling true concurrent processing.
|
|
1001
|
+
"""
|
|
1002
|
+
if self.config.max_workers == 1:
|
|
1003
|
+
# Single-threaded processing - no need for complexity
|
|
1004
|
+
yield from self._process_views_single_threaded(
|
|
1005
|
+
view_names, schema, sql_config
|
|
1006
|
+
)
|
|
1007
|
+
return
|
|
1008
|
+
|
|
1009
|
+
logger.info(
|
|
1010
|
+
f"Processing {len(view_names)} views with {self.config.max_workers} worker threads"
|
|
802
1011
|
)
|
|
803
|
-
yield from super().loop_views(inspector, schema, sql_config)
|
|
804
1012
|
|
|
805
|
-
|
|
1013
|
+
# Get or create reusable pooled engine
|
|
1014
|
+
engine = self._get_or_create_pooled_engine()
|
|
1015
|
+
|
|
1016
|
+
try:
|
|
1017
|
+
# Thread-safe result collection
|
|
1018
|
+
report_lock = Lock()
|
|
1019
|
+
|
|
1020
|
+
def process_single_view(
|
|
1021
|
+
view_name: str,
|
|
1022
|
+
) -> List[Union[MetadataWorkUnit, Any]]:
|
|
1023
|
+
"""Process a single view with its own database connection."""
|
|
1024
|
+
results: List[Union[MetadataWorkUnit, Any]] = []
|
|
1025
|
+
|
|
1026
|
+
# Detailed timing measurements for bottleneck analysis
|
|
1027
|
+
timings = {
|
|
1028
|
+
"connection_acquire": 0.0,
|
|
1029
|
+
"view_processing": 0.0,
|
|
1030
|
+
"work_unit_generation": 0.0,
|
|
1031
|
+
"total": 0.0,
|
|
1032
|
+
}
|
|
1033
|
+
|
|
1034
|
+
total_start = time.time()
|
|
1035
|
+
try:
|
|
1036
|
+
# Measure connection acquisition time
|
|
1037
|
+
conn_start = time.time()
|
|
1038
|
+
with engine.connect() as conn:
|
|
1039
|
+
timings["connection_acquire"] = time.time() - conn_start
|
|
1040
|
+
|
|
1041
|
+
# Update connection pool metrics
|
|
1042
|
+
with report_lock:
|
|
1043
|
+
pool_wait_time = timings["connection_acquire"]
|
|
1044
|
+
self.report.connection_pool_wait_time_seconds += (
|
|
1045
|
+
pool_wait_time
|
|
1046
|
+
)
|
|
1047
|
+
if (
|
|
1048
|
+
pool_wait_time
|
|
1049
|
+
> self.report.connection_pool_max_wait_time_seconds
|
|
1050
|
+
):
|
|
1051
|
+
self.report.connection_pool_max_wait_time_seconds = (
|
|
1052
|
+
pool_wait_time
|
|
1053
|
+
)
|
|
1054
|
+
|
|
1055
|
+
# Measure view processing setup
|
|
1056
|
+
processing_start = time.time()
|
|
1057
|
+
thread_inspector = inspect(conn)
|
|
1058
|
+
# Inherit database information for Teradata two-tier architecture
|
|
1059
|
+
thread_inspector._datahub_database = schema # type: ignore
|
|
1060
|
+
|
|
1061
|
+
dataset_name = self.get_identifier(
|
|
1062
|
+
schema=schema, entity=view_name, inspector=thread_inspector
|
|
1063
|
+
)
|
|
1064
|
+
|
|
1065
|
+
# Thread-safe reporting
|
|
1066
|
+
with report_lock:
|
|
1067
|
+
self.report.report_entity_scanned(
|
|
1068
|
+
dataset_name, ent_type="view"
|
|
1069
|
+
)
|
|
1070
|
+
|
|
1071
|
+
if not sql_config.view_pattern.allowed(dataset_name):
|
|
1072
|
+
with report_lock:
|
|
1073
|
+
self.report.report_dropped(dataset_name)
|
|
1074
|
+
return results
|
|
1075
|
+
|
|
1076
|
+
timings["view_processing"] = time.time() - processing_start
|
|
1077
|
+
|
|
1078
|
+
# Measure work unit generation
|
|
1079
|
+
wu_start = time.time()
|
|
1080
|
+
for work_unit in self._process_view(
|
|
1081
|
+
dataset_name=dataset_name,
|
|
1082
|
+
inspector=thread_inspector,
|
|
1083
|
+
schema=schema,
|
|
1084
|
+
view=view_name,
|
|
1085
|
+
sql_config=sql_config,
|
|
1086
|
+
):
|
|
1087
|
+
results.append(work_unit)
|
|
1088
|
+
timings["work_unit_generation"] = time.time() - wu_start
|
|
1089
|
+
|
|
1090
|
+
# Track individual view timing
|
|
1091
|
+
timings["total"] = time.time() - total_start
|
|
1092
|
+
|
|
1093
|
+
with report_lock:
|
|
1094
|
+
self.report.slowest_view_name[f"{schema}.{view_name}"] = (
|
|
1095
|
+
timings["total"]
|
|
1096
|
+
)
|
|
1097
|
+
|
|
1098
|
+
except Exception as e:
|
|
1099
|
+
with report_lock:
|
|
1100
|
+
self.report.num_view_processing_failures += 1
|
|
1101
|
+
# Log full exception details for debugging
|
|
1102
|
+
import traceback
|
|
1103
|
+
|
|
1104
|
+
full_traceback = traceback.format_exc()
|
|
1105
|
+
logger.error(
|
|
1106
|
+
f"Failed to process view {schema}.{view_name}: {str(e)}"
|
|
1107
|
+
)
|
|
1108
|
+
logger.error(f"Full traceback: {full_traceback}")
|
|
1109
|
+
self.report.warning(
|
|
1110
|
+
f"Error processing view {schema}.{view_name}",
|
|
1111
|
+
context=f"View: {schema}.{view_name}, Error: {str(e)}",
|
|
1112
|
+
exc=e,
|
|
1113
|
+
)
|
|
1114
|
+
|
|
1115
|
+
return results
|
|
1116
|
+
|
|
1117
|
+
# Use ThreadPoolExecutor for concurrent processing
|
|
1118
|
+
with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
|
|
1119
|
+
# Submit all view processing tasks
|
|
1120
|
+
future_to_view = {
|
|
1121
|
+
executor.submit(process_single_view, view_name): view_name
|
|
1122
|
+
for view_name in view_names
|
|
1123
|
+
}
|
|
1124
|
+
|
|
1125
|
+
# Process completed tasks as they finish
|
|
1126
|
+
for future in as_completed(future_to_view):
|
|
1127
|
+
view_name = future_to_view[future]
|
|
1128
|
+
try:
|
|
1129
|
+
results = future.result()
|
|
1130
|
+
# Yield all results from this view
|
|
1131
|
+
for result in results:
|
|
1132
|
+
yield result
|
|
1133
|
+
except Exception as e:
|
|
1134
|
+
with report_lock:
|
|
1135
|
+
self.report.warning(
|
|
1136
|
+
"Error in thread processing view",
|
|
1137
|
+
context=f"{schema}.{view_name}",
|
|
1138
|
+
exc=e,
|
|
1139
|
+
)
|
|
1140
|
+
|
|
1141
|
+
finally:
|
|
1142
|
+
# Don't dispose the reusable engine here - it will be cleaned up in close()
|
|
1143
|
+
pass
|
|
1144
|
+
|
|
1145
|
+
def _process_views_single_threaded(
|
|
1146
|
+
self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
|
|
1147
|
+
) -> Iterable[Union[MetadataWorkUnit, Any]]:
|
|
1148
|
+
"""Process views sequentially with a single connection."""
|
|
806
1149
|
engine = self.get_metadata_engine()
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
1150
|
+
|
|
1151
|
+
try:
|
|
1152
|
+
with engine.connect() as conn:
|
|
1153
|
+
inspector = inspect(conn)
|
|
1154
|
+
|
|
1155
|
+
for view_name in view_names:
|
|
1156
|
+
view_start_time = time.time()
|
|
1157
|
+
try:
|
|
1158
|
+
dataset_name = self.get_identifier(
|
|
1159
|
+
schema=schema, entity=view_name, inspector=inspector
|
|
1160
|
+
)
|
|
1161
|
+
|
|
1162
|
+
self.report.report_entity_scanned(dataset_name, ent_type="view")
|
|
1163
|
+
|
|
1164
|
+
if not sql_config.view_pattern.allowed(dataset_name):
|
|
1165
|
+
self.report.report_dropped(dataset_name)
|
|
1166
|
+
continue
|
|
1167
|
+
|
|
1168
|
+
# Process the view and yield results
|
|
1169
|
+
for work_unit in self._process_view(
|
|
1170
|
+
dataset_name=dataset_name,
|
|
1171
|
+
inspector=inspector,
|
|
1172
|
+
schema=schema,
|
|
1173
|
+
view=view_name,
|
|
1174
|
+
sql_config=sql_config,
|
|
1175
|
+
):
|
|
1176
|
+
yield work_unit
|
|
1177
|
+
|
|
1178
|
+
# Track individual view timing
|
|
1179
|
+
view_end_time = time.time()
|
|
1180
|
+
view_processing_time = view_end_time - view_start_time
|
|
1181
|
+
self.report.slowest_view_name[f"{schema}.{view_name}"] = (
|
|
1182
|
+
view_processing_time
|
|
1183
|
+
)
|
|
1184
|
+
|
|
1185
|
+
except Exception as e:
|
|
1186
|
+
# Log full exception details for debugging
|
|
1187
|
+
import traceback
|
|
1188
|
+
|
|
1189
|
+
full_traceback = traceback.format_exc()
|
|
1190
|
+
logger.error(
|
|
1191
|
+
f"Failed to process view {schema}.{view_name}: {str(e)}"
|
|
1192
|
+
)
|
|
1193
|
+
logger.error(f"Full traceback: {full_traceback}")
|
|
1194
|
+
self.report.warning(
|
|
1195
|
+
f"Error processing view {schema}.{view_name}",
|
|
1196
|
+
context=f"View: {schema}.{view_name}, Error: {str(e)}",
|
|
1197
|
+
exc=e,
|
|
1198
|
+
)
|
|
1199
|
+
|
|
1200
|
+
finally:
|
|
1201
|
+
engine.dispose()
|
|
1202
|
+
|
|
1203
|
+
def _get_or_create_pooled_engine(self) -> Engine:
|
|
1204
|
+
"""Get or create a reusable SQLAlchemy engine with QueuePool for concurrent connections."""
|
|
1205
|
+
with self._pooled_engine_lock:
|
|
1206
|
+
if self._pooled_engine is None:
|
|
1207
|
+
url = self.config.get_sql_alchemy_url()
|
|
1208
|
+
|
|
1209
|
+
# Optimal connection pool sizing to match max_workers exactly
|
|
1210
|
+
# Teradata driver can be sensitive to high connection counts, so cap at reasonable limit
|
|
1211
|
+
max_safe_connections = (
|
|
1212
|
+
13 # Conservative limit: 8 base + 5 overflow for Teradata stability
|
|
1213
|
+
)
|
|
1214
|
+
|
|
1215
|
+
# Adjust max_workers to match available connection pool capacity
|
|
1216
|
+
effective_max_workers = min(
|
|
1217
|
+
self.config.max_workers, max_safe_connections
|
|
1218
|
+
)
|
|
1219
|
+
|
|
1220
|
+
# Set pool size to match effective workers for optimal performance
|
|
1221
|
+
base_connections = min(
|
|
1222
|
+
effective_max_workers, 8
|
|
1223
|
+
) # Reasonable base connections
|
|
1224
|
+
max_overflow = (
|
|
1225
|
+
effective_max_workers - base_connections
|
|
1226
|
+
) # Remaining as overflow
|
|
1227
|
+
|
|
1228
|
+
# Log adjustment if max_workers was reduced
|
|
1229
|
+
if effective_max_workers < self.config.max_workers:
|
|
1230
|
+
logger.warning(
|
|
1231
|
+
f"Reduced max_workers from {self.config.max_workers} to {effective_max_workers} to match Teradata connection pool capacity"
|
|
1232
|
+
)
|
|
1233
|
+
|
|
1234
|
+
# Update the config to reflect the effective value used
|
|
1235
|
+
self.config.max_workers = effective_max_workers
|
|
1236
|
+
|
|
1237
|
+
pool_options = {
|
|
1238
|
+
**self.config.options,
|
|
1239
|
+
"poolclass": QueuePool,
|
|
1240
|
+
"pool_size": base_connections,
|
|
1241
|
+
"max_overflow": max_overflow,
|
|
1242
|
+
"pool_pre_ping": True, # Validate connections
|
|
1243
|
+
"pool_recycle": 1800, # Recycle connections after 30 mins (more frequent)
|
|
1244
|
+
"pool_timeout": 60, # Longer timeout for connection acquisition
|
|
1245
|
+
"pool_reset_on_return": "rollback", # Explicit rollback on connection return
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
# Add Teradata-specific connection options for stability
|
|
1249
|
+
if "connect_args" not in pool_options:
|
|
1250
|
+
pool_options["connect_args"] = {}
|
|
1251
|
+
|
|
1252
|
+
# Teradata-specific connection arguments for better stability
|
|
1253
|
+
pool_options["connect_args"].update(
|
|
1254
|
+
{
|
|
1255
|
+
"connect_timeout": "30000", # Connection timeout in ms (30 seconds)
|
|
1256
|
+
"request_timeout": "120000", # Request timeout in ms (2 minutes)
|
|
1257
|
+
}
|
|
1258
|
+
)
|
|
1259
|
+
|
|
1260
|
+
self._pooled_engine = create_engine(url, **pool_options)
|
|
1261
|
+
logger.info(
|
|
1262
|
+
f"Created optimized Teradata connection pool: {base_connections} base + {max_overflow} overflow = {base_connections + max_overflow} max connections (matching {effective_max_workers} workers)"
|
|
1263
|
+
)
|
|
1264
|
+
|
|
1265
|
+
return self._pooled_engine
|
|
1266
|
+
|
|
1267
|
+
def cache_tables_and_views(self) -> None:
|
|
1268
|
+
with self.report.new_stage("Cache tables and views"):
|
|
1269
|
+
engine = self.get_metadata_engine()
|
|
1270
|
+
try:
|
|
1271
|
+
database_counts: Dict[str, Dict[str, int]] = defaultdict(
|
|
1272
|
+
lambda: {"tables": 0, "views": 0}
|
|
1273
|
+
)
|
|
1274
|
+
|
|
1275
|
+
for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
|
|
1276
|
+
table = TeradataTable(
|
|
1277
|
+
database=entry.DataBaseName.strip(),
|
|
1278
|
+
name=entry.name.strip(),
|
|
1279
|
+
description=entry.description.strip()
|
|
1280
|
+
if entry.description
|
|
1281
|
+
else None,
|
|
1282
|
+
object_type=entry.object_type,
|
|
1283
|
+
create_timestamp=entry.CreateTimeStamp,
|
|
1284
|
+
last_alter_name=entry.LastAlterName,
|
|
1285
|
+
last_alter_timestamp=entry.LastAlterTimeStamp,
|
|
1286
|
+
request_text=(
|
|
1287
|
+
entry.RequestText.strip()
|
|
1288
|
+
if entry.object_type == "View" and entry.RequestText
|
|
1289
|
+
else None
|
|
1290
|
+
),
|
|
1291
|
+
)
|
|
1292
|
+
|
|
1293
|
+
# Count objects per database for metrics
|
|
1294
|
+
if table.object_type == "View":
|
|
1295
|
+
database_counts[table.database]["views"] += 1
|
|
1296
|
+
else:
|
|
1297
|
+
database_counts[table.database]["tables"] += 1
|
|
1298
|
+
|
|
1299
|
+
with self._tables_cache_lock:
|
|
1300
|
+
if table.database not in self._tables_cache:
|
|
1301
|
+
self._tables_cache[table.database] = []
|
|
1302
|
+
self._tables_cache[table.database].append(table)
|
|
1303
|
+
|
|
1304
|
+
for database, counts in database_counts.items():
|
|
1305
|
+
self.report.num_database_tables_to_scan[database] = counts["tables"]
|
|
1306
|
+
self.report.num_database_views_to_scan[database] = counts["views"]
|
|
1307
|
+
|
|
1308
|
+
finally:
|
|
1309
|
+
engine.dispose()
|
|
1310
|
+
|
|
1311
|
+
def _reconstruct_queries_streaming(
|
|
1312
|
+
self, entries: Iterable[Any]
|
|
1313
|
+
) -> Iterable[ObservedQuery]:
|
|
1314
|
+
"""Reconstruct complete queries from database entries in streaming fashion.
|
|
1315
|
+
|
|
1316
|
+
This method processes entries in order and reconstructs multi-row queries
|
|
1317
|
+
by concatenating rows with the same query_id.
|
|
1318
|
+
"""
|
|
1319
|
+
current_query_id = None
|
|
1320
|
+
current_query_parts = []
|
|
1321
|
+
current_query_metadata = None
|
|
1322
|
+
|
|
1323
|
+
for entry in entries:
|
|
1324
|
+
query_id = getattr(entry, "query_id", None)
|
|
1325
|
+
query_text = str(getattr(entry, "query_text", ""))
|
|
1326
|
+
|
|
1327
|
+
if query_id != current_query_id:
|
|
1328
|
+
# New query started - yield the previous one if it exists
|
|
1329
|
+
if current_query_id is not None and current_query_parts:
|
|
1330
|
+
yield self._create_observed_query_from_parts(
|
|
1331
|
+
current_query_parts, current_query_metadata
|
|
1332
|
+
)
|
|
1333
|
+
|
|
1334
|
+
# Start new query
|
|
1335
|
+
current_query_id = query_id
|
|
1336
|
+
current_query_parts = [query_text] if query_text else []
|
|
1337
|
+
current_query_metadata = entry
|
|
1338
|
+
else:
|
|
1339
|
+
# Same query - append the text
|
|
1340
|
+
if query_text:
|
|
1341
|
+
current_query_parts.append(query_text)
|
|
1342
|
+
|
|
1343
|
+
# Yield the last query if it exists
|
|
1344
|
+
if current_query_id is not None and current_query_parts:
|
|
1345
|
+
yield self._create_observed_query_from_parts(
|
|
1346
|
+
current_query_parts, current_query_metadata
|
|
821
1347
|
)
-        if table.database not in self._tables_cache:
-            self._tables_cache[table.database] = []
 
-
+    def _create_observed_query_from_parts(
+        self, query_parts: List[str], metadata_entry: Any
+    ) -> ObservedQuery:
+        """Create ObservedQuery from reconstructed query parts and metadata."""
+        # Join all parts to form the complete query
+        # Teradata fragments are split at fixed lengths without artificial breaks
+        full_query_text = "".join(query_parts)
+
+        # Extract metadata
+        session_id = getattr(metadata_entry, "session_id", None)
+        timestamp = getattr(metadata_entry, "timestamp", None)
+        user = getattr(metadata_entry, "user", None)
+        default_database = getattr(metadata_entry, "default_database", None)
+
+        # Apply Teradata-specific query transformations
+        cleaned_query = full_query_text.replace("(NOT CASESPECIFIC)", "")
+
+        return ObservedQuery(
+            query=cleaned_query,
+            session_id=session_id,
+            timestamp=timestamp,
+            user=CorpUserUrn(user) if user else None,
+            default_db=default_database,
+            default_schema=default_database,  # Teradata uses database as schema
+        )
+
+    def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
+        """Convert database query entry to ObservedQuery for SqlParsingAggregator.
+
+        DEPRECATED: This method is deprecated in favor of _reconstruct_queries_streaming
+        which properly handles multi-row queries. This method does not handle queries
+        that span multiple rows correctly and should not be used.
+        """
+        # Extract fields from database result
+        query_text = str(entry.query_text).strip()
+        session_id = getattr(entry, "session_id", None)
+        timestamp = getattr(entry, "timestamp", None)
+        user = getattr(entry, "user", None)
+        default_database = getattr(entry, "default_database", None)
+
+        # Apply Teradata-specific query transformations
+        cleaned_query = query_text.replace("(NOT CASESPECIFIC)", "")
+
+        return ObservedQuery(
+            query=cleaned_query,
+            session_id=session_id,
+            timestamp=timestamp,
+            user=CorpUserUrn(user) if user else None,
+            default_db=default_database,
+            default_schema=default_database,  # Teradata uses database as schema
+        )
+
+    def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
+        """Fetch lineage entries using server-side cursor to handle large result sets efficiently."""
+        queries = self._make_lineage_queries()
+
+        fetch_engine = self.get_metadata_engine()
+        try:
+            with fetch_engine.connect() as conn:
+                cursor_type = (
+                    "server-side"
+                    if self.config.use_server_side_cursors
+                    else "client-side"
+                )
+
+                total_count_all_queries = 0
+
+                for query_index, query in enumerate(queries, 1):
+                    logger.info(
+                        f"Executing lineage query {query_index}/{len(queries)} with {cursor_type} cursor..."
+                    )
+
+                    # Use helper method to try server-side cursor with fallback
+                    result = self._execute_with_cursor_fallback(conn, query)
+
+                    # Stream results in batches to avoid memory issues
+                    batch_size = 5000
+                    batch_count = 0
+                    query_total_count = 0
+
+                    while True:
+                        # Fetch a batch of rows
+                        batch = result.fetchmany(batch_size)
+                        if not batch:
+                            break
+
+                        batch_count += 1
+                        query_total_count += len(batch)
+                        total_count_all_queries += len(batch)
+
+                        logger.info(
+                            f"Query {query_index} - Fetched batch {batch_count}: {len(batch)} lineage entries (query total: {query_total_count})"
+                        )
+                        yield from batch
+
+                    logger.info(
+                        f"Completed query {query_index}: {query_total_count} lineage entries in {batch_count} batches"
+                    )
+
+                logger.info(
+                    f"Completed fetching all queries: {total_count_all_queries} total lineage entries from {len(queries)} queries"
+                )
 
-
+        except Exception as e:
+            logger.error(f"Error fetching lineage entries: {e}")
+            raise
+        finally:
+            fetch_engine.dispose()
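The chunked fetch above pairs a (possibly server-side) cursor with fetchmany batching so that only one batch of audit-log rows is resident at a time. A self-contained sketch of that consumption pattern, using SQLite as a stand-in for Teradata and a hypothetical fetch_in_batches helper:

```python
from typing import Any, Iterator

from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine


def fetch_in_batches(engine: Engine, query: str, batch_size: int = 5000) -> Iterator[Any]:
    """Yield rows from `query` in fixed-size chunks instead of materializing the full result set."""
    with engine.connect() as conn:
        # stream_results requests a server-side cursor where the driver supports it;
        # drivers that do not typically ignore it and buffer client-side.
        result = conn.execution_options(stream_results=True).execute(text(query))
        while True:
            batch = result.fetchmany(batch_size)
            if not batch:
                break
            yield from batch


if __name__ == "__main__":
    engine = create_engine("sqlite://")
    with engine.begin() as conn:
        conn.execute(text("CREATE TABLE t (x INTEGER)"))
        conn.execute(text("INSERT INTO t VALUES (1), (2), (3)"))
    print(sum(row.x for row in fetch_in_batches(engine, "SELECT x FROM t", batch_size=2)))  # prints 6
```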
+
+    def _check_historical_table_exists(self) -> bool:
+        """
+        Check if the PDCRINFO.DBQLSqlTbl_Hst table exists and is accessible.
+        DBQL rows are periodically moved to the history table, so audited queries may no longer exist in DBC.
+        There is no guarantee that the historical table exists, so we need to check for it.
+
+        Returns:
+            bool: True if the historical table exists and is accessible, False otherwise.
+        """
         engine = self.get_metadata_engine()
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            # Use a simple query to check if the table exists and is accessible
+            check_query = """
+                SELECT TOP 1 QueryID
+                FROM PDCRINFO.DBQLSqlTbl_Hst
+                WHERE 1=0
+            """
+            with engine.connect() as conn:
+                conn.execute(text(check_query))
+                logger.info(
+                    "Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is available"
+                )
+                return True
+        except Exception as e:
+            logger.info(
+                f"Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is not available: {e}"
             )
+            return False
+        finally:
+            engine.dispose()
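The availability check issues a probe that can never return rows (WHERE 1=0), so only the table's existence and the caller's permissions are exercised. A generic sketch of the same probe, assuming a trusted table name and using plain SELECT 1 in place of Teradata's SELECT TOP 1:

```python
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine


def table_is_accessible(engine: Engine, qualified_name: str) -> bool:
    """Probe a table with a query that can never return rows; only existence and permissions matter."""
    probe = f"SELECT 1 FROM {qualified_name} WHERE 1=0"  # assumes a trusted, non-user-supplied name
    try:
        with engine.connect() as conn:
            conn.execute(text(probe))
        return True
    except Exception:
        return False


if __name__ == "__main__":
    engine = create_engine("sqlite://")
    with engine.begin() as conn:
        conn.execute(text("CREATE TABLE present (id INTEGER)"))
    print(table_is_accessible(engine, "present"))  # True
    print(table_is_accessible(engine, "missing"))  # False
```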
 
-    def
+    def _make_lineage_queries(self) -> List[str]:
         databases_filter = (
             ""
             if not self.config.databases
-            else "and
+            else "and l.DefaultDatabase in ({databases})".format(
                 databases=",".join([f"'{db}'" for db in self.config.databases])
             )
         )
 
-
-            start_time=self.config.start_time,
-            end_time=self.config.end_time,
-            databases_filter=databases_filter,
-        )
-        return query
+        queries = []
 
-
-
-
-
-
-
-
-        urns: Optional[Set[str]] = None,
-    ) -> Iterable[MetadataWorkUnit]:
-        result = sqlglot_lineage(
-            # With this clever hack we can make the query parser to not fail on queries with CASESPECIFIC
-            sql=query.replace("(NOT CASESPECIFIC)", ""),
-            schema_resolver=self.schema_resolver,
-            default_db=None,
-            default_schema=(
-                default_database if default_database else self.config.default_db
-            ),
-        )
-        if result.debug_info.table_error:
-            logger.debug(
-                f"Error parsing table lineage ({view_urn}):\n{result.debug_info.table_error}"
+        # Check if historical lineage is configured and available
+        if (
+            self.config.include_historical_lineage
+            and self._check_historical_table_exists()
+        ):
+            logger.info(
+                "Using UNION query to combine historical and current lineage data to avoid duplicates"
             )
-
+            # For historical query, we need the database filter for historical part
+            databases_filter_history = (
+                databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase")
+                if databases_filter
+                else ""
+            )
+
+            union_query = self.QUERY_TEXT_HISTORICAL_UNION.format(
+                start_time=self.config.start_time,
+                end_time=self.config.end_time,
+                databases_filter=databases_filter,
+                databases_filter_history=databases_filter_history,
+            )
+            queries.append(union_query)
         else:
-
-
-
-
-
-
-
+            if self.config.include_historical_lineage:
+                logger.warning(
+                    "Historical lineage was requested but PDCRINFO.DBQLSqlTbl_Hst table is not available. Falling back to current data only."
+                )
+
+            # Use current-only query when historical data is not available
+            current_query = self.QUERY_TEXT_CURRENT_QUERIES.format(
+                start_time=self.config.start_time,
+                end_time=self.config.end_time,
+                databases_filter=databases_filter,
             )
+            queries.append(current_query)
+
+        return queries
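The databases_filter expression in this hunk renders an optional IN clause that the lineage SQL templates splice in, and the historical branch re-targets it from the l alias to the h alias via .replace(...). The same formatting as a standalone, hypothetical helper:

```python
from typing import List


def make_databases_filter(databases: List[str], alias: str = "l") -> str:
    """Render the optional `and <alias>.DefaultDatabase in (...)` clause used by the lineage templates."""
    if not databases:
        return ""
    quoted = ",".join(f"'{db}'" for db in databases)
    return f"and {alias}.DefaultDatabase in ({quoted})"


if __name__ == "__main__":
    print(repr(make_databases_filter([])))                # ''
    print(make_databases_filter(["sales", "marketing"]))  # and l.DefaultDatabase in ('sales','marketing')
    # The historical branch swaps the alias, as the diff's .replace("l.DefaultDatabase", "h.DefaultDatabase") does.
    print(make_databases_filter(["sales"], alias="h"))    # and h.DefaultDatabase in ('sales')
```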
 
     def get_metadata_engine(self) -> Engine:
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         return create_engine(url, **self.config.options)
 
-    def
+    def _execute_with_cursor_fallback(
+        self, connection: Connection, query: str, params: Optional[Dict] = None
+    ) -> Any:
+        """
+        Execute query with server-side cursor if enabled and supported, otherwise fall back to regular execution.
+
+        Args:
+            connection: Database connection
+            query: SQL query to execute
+            params: Query parameters
+
+        Returns:
+            Query result object
+        """
+        if self.config.use_server_side_cursors:
+            try:
+                # Try server-side cursor first
+                if params:
+                    result = connection.execution_options(stream_results=True).execute(
+                        text(query), params
+                    )
+                else:
+                    result = connection.execution_options(stream_results=True).execute(
+                        text(query)
+                    )
+
+                logger.debug(
+                    "Successfully using server-side cursor for query execution"
+                )
+                return result
+
+            except Exception as e:
+                logger.warning(
+                    f"Server-side cursor failed, falling back to client-side execution: {e}"
+                )
+                # Fall through to regular execution
+
+        # Regular execution (client-side)
+        if params:
+            return connection.execute(text(query), params)
+        else:
+            return connection.execute(text(query))
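The fallback helper above asks for execution_options(stream_results=True) first and reverts to a buffered, client-side execute if the driver or dialect refuses. A compact sketch of that try-then-fall-back pattern (hypothetical function name, not the source's signature):

```python
from typing import Any, Dict, Optional

from sqlalchemy import text
from sqlalchemy.engine import Connection


def execute_preferring_streaming(
    conn: Connection, query: str, params: Optional[Dict[str, Any]] = None
) -> Any:
    """Try a streaming (server-side) cursor first; fall back to a buffered execute if that fails."""
    try:
        return conn.execution_options(stream_results=True).execute(text(query), params or {})
    except Exception:
        # Any driver/dialect error triggers the buffered, client-side path, which always works.
        return conn.execute(text(query), params or {})


if __name__ == "__main__":
    from sqlalchemy import create_engine

    with create_engine("sqlite://").connect() as conn:
        print(execute_preferring_streaming(conn, "SELECT 42 AS answer").scalar())  # 42
```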
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Starting Teradata metadata extraction")
+
         # Add all schemas to the schema resolver
         # Sql parser operates on lowercase urns so we need to lowercase the urns
-
-
-
-
-
-        yield
-
-
+        with self.report.new_stage("Schema metadata extraction"):
+            yield from super().get_workunits_internal()
+            logger.info("Completed schema metadata extraction")
+
+        with self.report.new_stage("Audit log extraction"):
+            yield from self._get_audit_log_mcps_with_aggregator()
+
+        # SqlParsingAggregator handles its own work unit generation internally
+        logger.info("Lineage processing completed by SqlParsingAggregator")
+
+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Override base class to skip aggregator gen_metadata() call.
+
+        Teradata handles aggregator processing after adding audit log queries,
+        so we skip the base class call to prevent duplicate processing.
+        """
+        # Return empty iterator - Teradata will handle aggregator processing
+        # after adding audit log queries in _get_audit_log_mcps_with_aggregator()
+        return iter([])
+
+    def _get_audit_log_mcps_with_aggregator(self) -> Iterable[MetadataWorkUnit]:
+        """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
+        logger.info(
+            "Fetching queries from Teradata audit logs for SqlParsingAggregator"
+        )
+
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            with
-
+            # Step 1: Stream query entries from database with memory-efficient processing
+            with self.report.new_stage("Fetching lineage entries from Audit Logs"):
+                queries_processed = 0
+                entries_processed = False
+
+                # Use streaming query reconstruction for memory efficiency
+                for observed_query in self._reconstruct_queries_streaming(
+                    self._fetch_lineage_entries_chunked()
+                ):
+                    entries_processed = True
+                    self.aggregator.add(observed_query)
+
+                    queries_processed += 1
+                    if queries_processed % 10000 == 0:
+                        logger.info(
+                            f"Processed {queries_processed} queries to aggregator"
+                        )
+
+                if not entries_processed:
+                    logger.info("No lineage entries found")
+                    return
+
+                logger.info(
+                    f"Completed adding {queries_processed} queries to SqlParsingAggregator"
+                )
+
+            # Step 2: Generate work units from aggregator
+            with self.report.new_stage("SqlParsingAggregator metadata generation"):
+                logger.info("Generating metadata work units from SqlParsingAggregator")
+                work_unit_count = 0
+                for mcp in self.aggregator.gen_metadata():
+                    work_unit_count += 1
+                    if work_unit_count % 10000 == 0:
+                        logger.info(
+                            f"Generated {work_unit_count} work units from aggregator"
+                        )
+                    yield mcp.as_workunit()
+
+                logger.info(
+                    f"Completed SqlParsingAggregator processing: {work_unit_count} work units generated"
+                )
+
+    def close(self) -> None:
+        """Clean up resources when source is closed."""
+        logger.info("Closing SqlParsingAggregator")
+        self.aggregator.close()
+
+        # Clean up pooled engine
+        with self._pooled_engine_lock:
+            if self._pooled_engine is not None:
+                logger.info("Disposing pooled engine")
+                self._pooled_engine.dispose()
+                self._pooled_engine = None
 
-
+        # Report failed views summary
+        super().close()