acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (226) hide show
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,11 @@
1
1
  import logging
2
+ import time
2
3
  from collections import defaultdict
3
- from dataclasses import dataclass
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from dataclasses import dataclass, field
4
6
  from datetime import datetime
5
7
  from functools import lru_cache
8
+ from threading import Lock
6
9
  from typing import (
7
10
  Any,
8
11
  Dict,
@@ -10,7 +13,6 @@ from typing import (
10
13
  List,
11
14
  MutableMapping,
12
15
  Optional,
13
- Set,
14
16
  Tuple,
15
17
  Union,
16
18
  )
@@ -29,7 +31,6 @@ from teradatasqlalchemy.options import configure
29
31
 
30
32
  from datahub.configuration.common import AllowDenyPattern
31
33
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
32
- from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
33
34
  from datahub.ingestion.api.common import PipelineContext
34
35
  from datahub.ingestion.api.decorators import (
35
36
  SourceCapability,
@@ -39,10 +40,9 @@ from datahub.ingestion.api.decorators import (
39
40
  platform_name,
40
41
  support_status,
41
42
  )
42
- from datahub.ingestion.api.source_helpers import auto_lowercase_urns
43
43
  from datahub.ingestion.api.workunit import MetadataWorkUnit
44
44
  from datahub.ingestion.graph.client import DataHubGraph
45
- from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
45
+ from datahub.ingestion.source.sql.sql_common import register_custom_type
46
46
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
47
47
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
48
48
  from datahub.ingestion.source.sql.two_tier_sql_source import (
@@ -56,13 +56,64 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
56
56
  BytesTypeClass,
57
57
  TimeTypeClass,
58
58
  )
59
- from datahub.metadata.schema_classes import SchemaMetadataClass
59
+ from datahub.metadata.urns import CorpUserUrn
60
60
  from datahub.sql_parsing.schema_resolver import SchemaResolver
61
- from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
61
+ from datahub.sql_parsing.sql_parsing_aggregator import (
62
+ ObservedQuery,
63
+ SqlParsingAggregator,
64
+ )
62
65
  from datahub.utilities.groupby import groupby_unsorted
66
+ from datahub.utilities.stats_collections import TopKDict
63
67
 
64
68
  logger: logging.Logger = logging.getLogger(__name__)
65
69
 
70
+ # Common excluded databases used in multiple places
71
+ EXCLUDED_DATABASES = [
72
+ "All",
73
+ "Crashdumps",
74
+ "Default",
75
+ "DemoNow_Monitor",
76
+ "EXTUSER",
77
+ "External_AP",
78
+ "GLOBAL_FUNCTIONS",
79
+ "LockLogShredder",
80
+ "PUBLIC",
81
+ "SQLJ",
82
+ "SYSBAR",
83
+ "SYSJDBC",
84
+ "SYSLIB",
85
+ "SYSSPATIAL",
86
+ "SYSUDTLIB",
87
+ "SYSUIF",
88
+ "SysAdmin",
89
+ "Sys_Calendar",
90
+ "SystemFe",
91
+ "TDBCMgmt",
92
+ "TDMaps",
93
+ "TDPUSER",
94
+ "TDQCD",
95
+ "TDStats",
96
+ "TD_ANALYTICS_DB",
97
+ "TD_SERVER_DB",
98
+ "TD_SYSFNLIB",
99
+ "TD_SYSGPL",
100
+ "TD_SYSXML",
101
+ "TDaaS_BAR",
102
+ "TDaaS_DB",
103
+ "TDaaS_Maint",
104
+ "TDaaS_Monitor",
105
+ "TDaaS_Support",
106
+ "TDaaS_TDBCMgmt1",
107
+ "TDaaS_TDBCMgmt2",
108
+ "dbcmngr",
109
+ "mldb",
110
+ "system",
111
+ "tapidb",
112
+ "tdwm",
113
+ "val",
114
+ "dbc",
115
+ ]
116
+
66
117
  register_custom_type(custom_types.JSON, BytesTypeClass)
67
118
  register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass)
68
119
  register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass)
@@ -99,14 +150,16 @@ class TeradataTable:
99
150
  request_text: Optional[str]
100
151
 
101
152
 
102
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
153
+ # Cache size of 1 is sufficient since schemas are processed sequentially
154
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
103
155
  @lru_cache(maxsize=1)
104
156
  def get_schema_columns(
105
157
  self: Any, connection: Connection, dbc_columns: str, schema: str
106
158
  ) -> Dict[str, List[Any]]:
159
+ start_time = time.time()
107
160
  columns: Dict[str, List[Any]] = {}
108
- columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) order by TableName, ColumnId"
109
- rows = connection.execute(text(columns_query)).fetchall()
161
+ columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) order by TableName, ColumnId"
162
+ rows = connection.execute(text(columns_query), {"schema": schema}).fetchall()
110
163
  for row in rows:
111
164
  row_mapping = row._mapping
112
165
  if row_mapping.TableName not in columns:
@@ -114,18 +167,29 @@ def get_schema_columns(
114
167
 
115
168
  columns[row_mapping.TableName].append(row_mapping)
116
169
 
170
+ end_time = time.time()
171
+ extraction_time = end_time - start_time
172
+ logger.info(
173
+ f"Column extraction for schema '{schema}' completed in {extraction_time:.2f} seconds"
174
+ )
175
+
176
+ # Update report if available
177
+ if hasattr(self, "report"):
178
+ self.report.column_extraction_duration_seconds += extraction_time
179
+
117
180
  return columns
118
181
 
119
182
 
120
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
183
+ # Cache size of 1 is sufficient since schemas are processed sequentially
184
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
121
185
  @lru_cache(maxsize=1)
122
186
  def get_schema_pk_constraints(
123
187
  self: Any, connection: Connection, schema: str
124
188
  ) -> Dict[str, List[Any]]:
125
189
  dbc_indices = "IndicesV" + "X" if configure.usexviews else "IndicesV"
126
190
  primary_keys: Dict[str, List[Any]] = {}
127
- stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = '{schema}' (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
128
- rows = connection.execute(text(stmt)).fetchall()
191
+ stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
192
+ rows = connection.execute(text(stmt), {"schema": schema}).fetchall()
129
193
  for row in rows:
130
194
  row_mapping = row._mapping
131
195
  if row_mapping.TableName not in primary_keys:
@@ -172,6 +236,10 @@ def optimized_get_pk_constraint(
172
236
  index_column.IndexName
173
237
  ) # There should be just one IndexName
174
238
 
239
+ # Update counter if available
240
+ if hasattr(self, "report"):
241
+ self.report.num_primary_keys_processed += 1
242
+
175
243
  return {"constrained_columns": index_columns, "name": index_name}
176
244
 
177
245
 
@@ -228,23 +296,55 @@ def optimized_get_columns(
228
296
  table_name, []
229
297
  )
230
298
 
299
+ start_time = time.time()
300
+
231
301
  final_column_info = []
232
302
  # Don't care about ART tables now
233
303
  # Ignore the non-functional column in a PTI table
234
304
  for row in res:
235
- col_info = self._get_column_info(row)
236
- if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
237
- if (
238
- col_info["ColumnName"] == "TD_TIMEBUCKET"
239
- and col_info["TSColumnType"].strip() == "TB"
305
+ try:
306
+ col_info = self._get_column_info(row)
307
+
308
+ # Add CommentString as comment field for column description
309
+ if hasattr(row, "CommentString") and row.CommentString:
310
+ col_info["comment"] = row.CommentString.strip()
311
+ elif (
312
+ isinstance(row, dict)
313
+ and "CommentString" in row
314
+ and row["CommentString"]
240
315
  ):
241
- continue
242
- final_column_info.append(col_info)
316
+ col_info["comment"] = row["CommentString"].strip()
317
+
318
+ if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
319
+ if (
320
+ col_info["ColumnName"] == "TD_TIMEBUCKET"
321
+ and col_info["TSColumnType"].strip() == "TB"
322
+ ):
323
+ continue
324
+ final_column_info.append(col_info)
325
+
326
+ # Update counter - access report through self from the connection context
327
+ if hasattr(self, "report"):
328
+ self.report.num_columns_processed += 1
329
+
330
+ except Exception as e:
331
+ logger.error(
332
+ f"Failed to process column {getattr(row, 'ColumnName', 'unknown')}: {e}"
333
+ )
334
+ if hasattr(self, "report"):
335
+ self.report.num_column_extraction_failures += 1
336
+ continue
337
+
338
+ # Update timing
339
+ if hasattr(self, "report"):
340
+ end_time = time.time()
341
+ self.report.column_extraction_duration_seconds += end_time - start_time
243
342
 
244
343
  return final_column_info
245
344
 
246
345
 
247
- # lru cache is set to 1 which work only in single threaded environment but it keeps the memory footprint lower
346
+ # Cache size of 1 is sufficient since schemas are processed sequentially
347
+ # Note: This cache is per-process and helps when processing multiple tables in the same schema
248
348
  @lru_cache(maxsize=1)
249
349
  def get_schema_foreign_keys(
250
350
  self: Any, connection: Connection, schema: str
@@ -334,9 +434,24 @@ def optimized_get_view_definition(
334
434
 
335
435
  @dataclass
336
436
  class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
337
- num_queries_parsed: int = 0
338
- num_view_ddl_parsed: int = 0
339
- num_table_parse_failures: int = 0
437
+ # View processing metrics (actively used)
438
+ num_views_processed: int = 0
439
+ num_view_processing_failures: int = 0
440
+ view_extraction_total_time_seconds: float = 0.0
441
+ view_extraction_average_time_seconds: float = 0.0
442
+ slowest_view_processing_time_seconds: float = 0.0
443
+ slowest_view_name: TopKDict[str, float] = field(default_factory=TopKDict)
444
+
445
+ # Connection pool performance metrics (actively used)
446
+ connection_pool_wait_time_seconds: float = 0.0
447
+ connection_pool_max_wait_time_seconds: float = 0.0
448
+
449
+ # Database-level metrics similar to BigQuery's approach (actively used)
450
+ num_database_tables_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
451
+ num_database_views_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
452
+
453
+ # Global metadata extraction timing (single query for all databases)
454
+ metadata_extraction_total_sec: float = 0.0
340
455
 
341
456
 
342
457
  class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
@@ -353,53 +468,7 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
353
468
  )
354
469
 
355
470
  database_pattern = Field(
356
- default=AllowDenyPattern(
357
- deny=[
358
- "All",
359
- "Crashdumps",
360
- "Default",
361
- "DemoNow_Monitor",
362
- "EXTUSER",
363
- "External_AP",
364
- "GLOBAL_FUNCTIONS",
365
- "LockLogShredder",
366
- "PUBLIC",
367
- "SQLJ",
368
- "SYSBAR",
369
- "SYSJDBC",
370
- "SYSLIB",
371
- "SYSSPATIAL",
372
- "SYSUDTLIB",
373
- "SYSUIF",
374
- "SysAdmin",
375
- "Sys_Calendar",
376
- "SystemFe",
377
- "TDBCMgmt",
378
- "TDMaps",
379
- "TDPUSER",
380
- "TDQCD",
381
- "TDStats",
382
- "TD_ANALYTICS_DB",
383
- "TD_SERVER_DB",
384
- "TD_SYSFNLIB",
385
- "TD_SYSGPL",
386
- "TD_SYSXML",
387
- "TDaaS_BAR",
388
- "TDaaS_DB",
389
- "TDaaS_Maint",
390
- "TDaaS_Monitor",
391
- "TDaaS_Support",
392
- "TDaaS_TDBCMgmt1",
393
- "TDaaS_TDBCMgmt2",
394
- "dbcmngr",
395
- "mldb",
396
- "system",
397
- "tapidb",
398
- "tdwm",
399
- "val",
400
- "dbc",
401
- ]
402
- ),
471
+ default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
403
472
  description="Regex patterns for databases to filter in ingestion.",
404
473
  )
405
474
  include_table_lineage = Field(
@@ -413,6 +482,13 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
413
482
  description="Whether to include view lineage in the ingestion. "
414
483
  "This requires to have the view lineage feature enabled.",
415
484
  )
485
+
486
+ include_queries = Field(
487
+ default=True,
488
+ description="Whether to generate query entities for SQL queries. "
489
+ "Query entities provide metadata about individual SQL queries including "
490
+ "execution timestamps, user information, and query text.",
491
+ )
416
492
  usage: BaseUsageConfig = Field(
417
493
  description="The usage config to use when generating usage statistics",
418
494
  default=BaseUsageConfig(),
@@ -438,6 +514,26 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
438
514
  description="Whether to use QVCI to get column information. This is faster but requires to have QVCI enabled.",
439
515
  )
440
516
 
517
+ include_historical_lineage: bool = Field(
518
+ default=False,
519
+ description="Whether to include historical lineage data from PDCRINFO.DBQLSqlTbl_Hst in addition to current DBC.QryLogV data. "
520
+ "This provides access to historical query logs that may have been archived. "
521
+ "The historical table existence is checked automatically and gracefully falls back to current data only if not available.",
522
+ )
523
+
524
+ use_server_side_cursors: bool = Field(
525
+ default=True,
526
+ description="Enable server-side cursors for large result sets using SQLAlchemy's stream_results. "
527
+ "This reduces memory usage by streaming results from the database server. "
528
+ "Automatically falls back to client-side batching if server-side cursors are not supported.",
529
+ )
530
+
531
+ max_workers: int = Field(
532
+ default=10,
533
+ description="Maximum number of worker threads to use for parallel processing. "
534
+ "Controls the level of concurrency for operations like view processing.",
535
+ )
536
+
441
537
 
442
538
  @platform_name("Teradata")
443
539
  @config_class(TeradataConfig)
@@ -445,7 +541,10 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
445
541
  @capability(SourceCapability.DOMAINS, "Enabled by default")
446
542
  @capability(SourceCapability.CONTAINERS, "Enabled by default")
447
543
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
448
- @capability(SourceCapability.DELETION_DETECTION, "Optionally enabled via configuration")
544
+ @capability(
545
+ SourceCapability.DELETION_DETECTION,
546
+ "Enabled by default when stateful ingestion is turned on",
547
+ )
449
548
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
450
549
  @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
451
550
  @capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration")
@@ -461,13 +560,7 @@ class TeradataSource(TwoTierSQLAlchemySource):
461
560
 
462
561
  config: TeradataConfig
463
562
 
464
- LINEAGE_QUERY_DATABASE_FILTER: str = """and default_database IN ({databases})"""
465
-
466
- LINEAGE_TIMESTAMP_BOUND_QUERY: str = """
467
- SELECT MIN(CollectTimeStamp) as "min_ts", MAX(CollectTimeStamp) as "max_ts" from DBC.QryLogV
468
- """.strip()
469
-
470
- QUERY_TEXT_QUERY: str = """
563
+ QUERY_TEXT_CURRENT_QUERIES: str = """
471
564
  SELECT
472
565
  s.QueryID as "query_id",
473
566
  UserName as "user",
@@ -500,10 +593,89 @@ class TeradataSource(TwoTierSQLAlchemySource):
500
593
  and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
501
594
  and default_database not in ('DEMONOW_MONITOR')
502
595
  {databases_filter}
503
- ORDER BY "query_id", "row_no"
596
+ ORDER BY "timestamp", "query_id", "row_no"
597
+ """.strip()
598
+
599
+ QUERY_TEXT_HISTORICAL_UNION: str = """
600
+ SELECT
601
+ "query_id",
602
+ "user",
603
+ "timestamp",
604
+ default_database,
605
+ "query_text",
606
+ "row_no"
607
+ FROM (
608
+ SELECT
609
+ h.QueryID as "query_id",
610
+ h.UserName as "user",
611
+ h.StartTime AT TIME ZONE 'GMT' as "timestamp",
612
+ h.DefaultDatabase as default_database,
613
+ h.SqlTextInfo as "query_text",
614
+ h.SqlRowNo as "row_no"
615
+ FROM "PDCRINFO".DBQLSqlTbl_Hst as h
616
+ WHERE
617
+ h.ErrorCode = 0
618
+ AND h.statementtype not in (
619
+ 'Unrecognized type',
620
+ 'Create Database/User',
621
+ 'Help',
622
+ 'Modify Database',
623
+ 'Drop Table',
624
+ 'Show',
625
+ 'Not Applicable',
626
+ 'Grant',
627
+ 'Abort',
628
+ 'Database',
629
+ 'Flush Query Logging',
630
+ 'Null',
631
+ 'Begin/End DBQL',
632
+ 'Revoke'
633
+ )
634
+ and h.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
635
+ and h.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
636
+ and h.CollectTimeStamp >= TIMESTAMP '{start_time}'
637
+ and h.DefaultDatabase not in ('DEMONOW_MONITOR')
638
+ {databases_filter_history}
639
+
640
+ UNION
641
+
642
+ SELECT
643
+ s.QueryID as "query_id",
644
+ l.UserName as "user",
645
+ l.StartTime AT TIME ZONE 'GMT' as "timestamp",
646
+ l.DefaultDatabase as default_database,
647
+ s.SqlTextInfo as "query_text",
648
+ s.SqlRowNo as "row_no"
649
+ FROM "DBC".QryLogV as l
650
+ JOIN "DBC".QryLogSqlV as s on s.QueryID = l.QueryID
651
+ WHERE
652
+ l.ErrorCode = 0
653
+ AND l.statementtype not in (
654
+ 'Unrecognized type',
655
+ 'Create Database/User',
656
+ 'Help',
657
+ 'Modify Database',
658
+ 'Drop Table',
659
+ 'Show',
660
+ 'Not Applicable',
661
+ 'Grant',
662
+ 'Abort',
663
+ 'Database',
664
+ 'Flush Query Logging',
665
+ 'Null',
666
+ 'Begin/End DBQL',
667
+ 'Revoke'
668
+ )
669
+ and l.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
670
+ and l.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
671
+ and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
672
+ and l.DefaultDatabase not in ('DEMONOW_MONITOR')
673
+ {databases_filter}
674
+ ) as combined_results
675
+ ORDER BY "timestamp", "query_id", "row_no"
504
676
  """.strip()
505
677
 
506
- TABLES_AND_VIEWS_QUERY: str = """
678
+ TABLES_AND_VIEWS_QUERY: str = f"""
507
679
  SELECT
508
680
  t.DataBaseName,
509
681
  t.TableName as name,
@@ -521,77 +693,51 @@ SELECT
521
693
  t.LastAlterTimeStamp,
522
694
  t.RequestText
523
695
  FROM dbc.TablesV t
524
- WHERE DataBaseName NOT IN (
525
- 'All',
526
- 'Crashdumps',
527
- 'Default',
528
- 'DemoNow_Monitor',
529
- 'EXTUSER',
530
- 'External_AP',
531
- 'GLOBAL_FUNCTIONS',
532
- 'LockLogShredder',
533
- 'PUBLIC',
534
- 'SQLJ',
535
- 'SYSBAR',
536
- 'SYSJDBC',
537
- 'SYSLIB',
538
- 'SYSSPATIAL',
539
- 'SYSUDTLIB',
540
- 'SYSUIF',
541
- 'SysAdmin',
542
- 'Sys_Calendar',
543
- 'SystemFe',
544
- 'TDBCMgmt',
545
- 'TDMaps',
546
- 'TDPUSER',
547
- 'TDQCD',
548
- 'TDStats',
549
- 'TD_ANALYTICS_DB',
550
- 'TD_SERVER_DB',
551
- 'TD_SYSFNLIB',
552
- 'TD_SYSGPL',
553
- 'TD_SYSXML',
554
- 'TDaaS_BAR',
555
- 'TDaaS_DB',
556
- 'TDaaS_Maint',
557
- 'TDaaS_Monitor',
558
- 'TDaaS_Support',
559
- 'TDaaS_TDBCMgmt1',
560
- 'TDaaS_TDBCMgmt2',
561
- 'dbcmngr',
562
- 'mldb',
563
- 'system',
564
- 'tapidb',
565
- 'tdwm',
566
- 'val',
567
- 'dbc'
568
- )
696
+ WHERE DataBaseName NOT IN ({",".join([f"'{db}'" for db in EXCLUDED_DATABASES])})
569
697
  AND t.TableKind in ('T', 'V', 'Q', 'O')
570
698
  ORDER by DataBaseName, TableName;
571
699
  """.strip()
572
700
 
573
701
  _tables_cache: MutableMapping[str, List[TeradataTable]] = defaultdict(list)
702
+ _tables_cache_lock = Lock() # Protect shared cache from concurrent access
703
+ _pooled_engine: Optional[Engine] = None # Reusable pooled engine
704
+ _pooled_engine_lock = Lock() # Protect engine creation
574
705
 
575
706
  def __init__(self, config: TeradataConfig, ctx: PipelineContext):
576
707
  super().__init__(config, ctx, "teradata")
577
708
 
578
709
  self.report: TeradataReport = TeradataReport()
579
710
  self.graph: Optional[DataHubGraph] = ctx.graph
711
+ self._report_lock = Lock() # Thread safety for report counters
580
712
 
581
- self.builder: SqlParsingBuilder = SqlParsingBuilder(
582
- usage_config=(
583
- self.config.usage if self.config.include_usage_statistics else None
584
- ),
585
- generate_lineage=True,
713
+ self.schema_resolver = self._init_schema_resolver()
714
+
715
+ # Initialize SqlParsingAggregator for modern lineage processing
716
+ logger.info("Initializing SqlParsingAggregator for enhanced lineage processing")
717
+ self.aggregator = SqlParsingAggregator(
718
+ platform="teradata",
719
+ platform_instance=self.config.platform_instance,
720
+ env=self.config.env,
721
+ schema_resolver=self.schema_resolver,
722
+ graph=self.ctx.graph,
723
+ generate_lineage=self.include_lineage,
724
+ generate_queries=self.config.include_queries,
586
725
  generate_usage_statistics=self.config.include_usage_statistics,
587
- generate_operations=self.config.usage.include_operational_stats,
726
+ generate_query_usage_statistics=self.config.include_usage_statistics,
727
+ generate_operations=self.config.usage.include_operational_stats
728
+ if self.config.include_usage_statistics
729
+ else False,
730
+ usage_config=self.config.usage
731
+ if self.config.include_usage_statistics
732
+ else None,
733
+ eager_graph_load=False,
588
734
  )
589
-
590
- self.schema_resolver = self._init_schema_resolver()
735
+ self.report.sql_aggregator = self.aggregator.report
591
736
 
592
737
  if self.config.include_tables or self.config.include_views:
593
- self.cache_tables_and_views()
594
- logger.info(f"Found {len(self._tables_cache)} tables and views")
738
+ with self.report.new_stage("Table and view discovery"):
739
+ self.cache_tables_and_views()
740
+ logger.info(f"Found {len(self._tables_cache)} tables and views")
595
741
  setattr(self, "loop_tables", self.cached_loop_tables) # noqa: B010
596
742
  setattr(self, "loop_views", self.cached_loop_views) # noqa: B010
597
743
  setattr( # noqa: B010
@@ -721,6 +867,8 @@ ORDER by DataBaseName, TableName;
721
867
 
722
868
  logger.debug(f"sql_alchemy_url={url}")
723
869
  engine = create_engine(url, **self.config.options)
870
+
871
+ # Get list of databases first
724
872
  with engine.connect() as conn:
725
873
  inspector = inspect(conn)
726
874
  if self.config.database and self.config.database != "":
@@ -729,13 +877,14 @@ ORDER by DataBaseName, TableName;
729
877
  databases = self.config.databases
730
878
  else:
731
879
  databases = inspector.get_schema_names()
732
- for db in databases:
733
- if self.config.database_pattern.allowed(db):
734
- # url = self.config.get_sql_alchemy_url(current_db=db)
735
- # with create_engine(url, **self.config.options).connect() as conn:
736
- # inspector = inspect(conn)
737
- inspector._datahub_database = db
738
- yield inspector
880
+
881
+ # Create separate connections for each database to avoid connection lifecycle issues
882
+ for db in databases:
883
+ if self.config.database_pattern.allowed(db):
884
+ with engine.connect() as conn:
885
+ db_inspector = inspect(conn)
886
+ db_inspector._datahub_database = db
887
+ yield db_inspector
739
888
 
740
889
  def get_db_name(self, inspector: Inspector) -> str:
741
890
  if hasattr(inspector, "_datahub_database"):
@@ -753,14 +902,15 @@ ORDER by DataBaseName, TableName;
753
902
  inspector: Inspector,
754
903
  schema: str,
755
904
  sql_config: SQLCommonConfig,
756
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
905
+ ) -> Iterable[MetadataWorkUnit]:
757
906
  setattr( # noqa: B010
758
907
  inspector,
759
908
  "get_table_names",
760
909
  lambda schema: [
761
910
  i.name
762
911
  for i in filter(
763
- lambda t: t.object_type != "View", self._tables_cache[schema]
912
+ lambda t: t.object_type != "View",
913
+ self._tables_cache.get(schema, []),
764
914
  )
765
915
  ],
766
916
  )
@@ -776,7 +926,8 @@ ORDER by DataBaseName, TableName;
776
926
  # this method and provide a location.
777
927
  location: Optional[str] = None
778
928
 
779
- for entry in self._tables_cache[schema]:
929
+ cache_entries = self._tables_cache.get(schema, [])
930
+ for entry in cache_entries:
780
931
  if entry.name == table:
781
932
  description = entry.description
782
933
  if entry.object_type == "View" and entry.request_text:
@@ -789,123 +940,734 @@ ORDER by DataBaseName, TableName;
789
940
  inspector: Inspector,
790
941
  schema: str,
791
942
  sql_config: SQLCommonConfig,
792
- ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
793
- setattr( # noqa: B010
794
- inspector,
795
- "get_view_names",
796
- lambda schema: [
797
- i.name
798
- for i in filter(
799
- lambda t: t.object_type == "View", self._tables_cache[schema]
943
+ ) -> Iterable[MetadataWorkUnit]:
944
+ start_time = time.time()
945
+
946
+ # Get view names from cache
947
+ view_names = [
948
+ i.name
949
+ for i in filter(
950
+ lambda t: t.object_type == "View", self._tables_cache.get(schema, [])
951
+ )
952
+ ]
953
+ actual_view_count = len(view_names)
954
+
955
+ if actual_view_count == 0:
956
+ end_time = time.time()
957
+ processing_time = end_time - start_time
958
+ logger.info(
959
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds (0 views, 0 work units)"
960
+ )
961
+ return
962
+
963
+ # Use custom threading implementation with connection pooling
964
+ work_unit_count = 0
965
+
966
+ for work_unit in self._loop_views_with_connection_pool(
967
+ view_names, schema, sql_config
968
+ ):
969
+ work_unit_count += 1
970
+ yield work_unit
971
+
972
+ end_time = time.time()
973
+ processing_time = end_time - start_time
974
+
975
+ logger.info(
976
+ f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds ({actual_view_count} views, {work_unit_count} work units)"
977
+ )
978
+
979
+ # Update report timing metrics
980
+ if hasattr(self, "report"):
981
+ self.report.view_extraction_total_time_seconds += processing_time
982
+ self.report.num_views_processed += actual_view_count
983
+
984
+ # Track slowest view processing at view level (will be updated by individual view processing)
985
+ # Note: slowest_view_name now tracks individual views, not schemas
986
+
987
+ # Calculate average processing time per view
988
+ if self.report.num_views_processed > 0:
989
+ self.report.view_extraction_average_time_seconds = (
990
+ self.report.view_extraction_total_time_seconds
991
+ / self.report.num_views_processed
800
992
  )
801
- ],
993
+
994
+ def _loop_views_with_connection_pool(
995
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
996
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
997
+ """
998
+ Process views using individual database connections per thread for true parallelization.
999
+
1000
+ Each thread gets its own connection from a QueuePool, enabling true concurrent processing.
1001
+ """
1002
+ if self.config.max_workers == 1:
1003
+ # Single-threaded processing - no need for complexity
1004
+ yield from self._process_views_single_threaded(
1005
+ view_names, schema, sql_config
1006
+ )
1007
+ return
1008
+
1009
+ logger.info(
1010
+ f"Processing {len(view_names)} views with {self.config.max_workers} worker threads"
802
1011
  )
803
- yield from super().loop_views(inspector, schema, sql_config)
804
1012
 
805
- def cache_tables_and_views(self) -> None:
1013
+ # Get or create reusable pooled engine
1014
+ engine = self._get_or_create_pooled_engine()
1015
+
1016
+ try:
1017
+ # Thread-safe result collection
1018
+ report_lock = Lock()
1019
+
1020
+ def process_single_view(
1021
+ view_name: str,
1022
+ ) -> List[Union[MetadataWorkUnit, Any]]:
1023
+ """Process a single view with its own database connection."""
1024
+ results: List[Union[MetadataWorkUnit, Any]] = []
1025
+
1026
+ # Detailed timing measurements for bottleneck analysis
1027
+ timings = {
1028
+ "connection_acquire": 0.0,
1029
+ "view_processing": 0.0,
1030
+ "work_unit_generation": 0.0,
1031
+ "total": 0.0,
1032
+ }
1033
+
1034
+ total_start = time.time()
1035
+ try:
1036
+ # Measure connection acquisition time
1037
+ conn_start = time.time()
1038
+ with engine.connect() as conn:
1039
+ timings["connection_acquire"] = time.time() - conn_start
1040
+
1041
+ # Update connection pool metrics
1042
+ with report_lock:
1043
+ pool_wait_time = timings["connection_acquire"]
1044
+ self.report.connection_pool_wait_time_seconds += (
1045
+ pool_wait_time
1046
+ )
1047
+ if (
1048
+ pool_wait_time
1049
+ > self.report.connection_pool_max_wait_time_seconds
1050
+ ):
1051
+ self.report.connection_pool_max_wait_time_seconds = (
1052
+ pool_wait_time
1053
+ )
1054
+
1055
+ # Measure view processing setup
1056
+ processing_start = time.time()
1057
+ thread_inspector = inspect(conn)
1058
+ # Inherit database information for Teradata two-tier architecture
1059
+ thread_inspector._datahub_database = schema # type: ignore
1060
+
1061
+ dataset_name = self.get_identifier(
1062
+ schema=schema, entity=view_name, inspector=thread_inspector
1063
+ )
1064
+
1065
+ # Thread-safe reporting
1066
+ with report_lock:
1067
+ self.report.report_entity_scanned(
1068
+ dataset_name, ent_type="view"
1069
+ )
1070
+
1071
+ if not sql_config.view_pattern.allowed(dataset_name):
1072
+ with report_lock:
1073
+ self.report.report_dropped(dataset_name)
1074
+ return results
1075
+
1076
+ timings["view_processing"] = time.time() - processing_start
1077
+
1078
+ # Measure work unit generation
1079
+ wu_start = time.time()
1080
+ for work_unit in self._process_view(
1081
+ dataset_name=dataset_name,
1082
+ inspector=thread_inspector,
1083
+ schema=schema,
1084
+ view=view_name,
1085
+ sql_config=sql_config,
1086
+ ):
1087
+ results.append(work_unit)
1088
+ timings["work_unit_generation"] = time.time() - wu_start
1089
+
1090
+ # Track individual view timing
1091
+ timings["total"] = time.time() - total_start
1092
+
1093
+ with report_lock:
1094
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1095
+ timings["total"]
1096
+ )
1097
+
1098
+ except Exception as e:
1099
+ with report_lock:
1100
+ self.report.num_view_processing_failures += 1
1101
+ # Log full exception details for debugging
1102
+ import traceback
1103
+
1104
+ full_traceback = traceback.format_exc()
1105
+ logger.error(
1106
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1107
+ )
1108
+ logger.error(f"Full traceback: {full_traceback}")
1109
+ self.report.warning(
1110
+ f"Error processing view {schema}.{view_name}",
1111
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1112
+ exc=e,
1113
+ )
1114
+
1115
+ return results
1116
+
1117
+ # Use ThreadPoolExecutor for concurrent processing
1118
+ with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
1119
+ # Submit all view processing tasks
1120
+ future_to_view = {
1121
+ executor.submit(process_single_view, view_name): view_name
1122
+ for view_name in view_names
1123
+ }
1124
+
1125
+ # Process completed tasks as they finish
1126
+ for future in as_completed(future_to_view):
1127
+ view_name = future_to_view[future]
1128
+ try:
1129
+ results = future.result()
1130
+ # Yield all results from this view
1131
+ for result in results:
1132
+ yield result
1133
+ except Exception as e:
1134
+ with report_lock:
1135
+ self.report.warning(
1136
+ "Error in thread processing view",
1137
+ context=f"{schema}.{view_name}",
1138
+ exc=e,
1139
+ )
1140
+
1141
+ finally:
1142
+ # Don't dispose the reusable engine here - it will be cleaned up in close()
1143
+ pass
1144
+
1145
+ def _process_views_single_threaded(
1146
+ self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
1147
+ ) -> Iterable[Union[MetadataWorkUnit, Any]]:
1148
+ """Process views sequentially with a single connection."""
806
1149
  engine = self.get_metadata_engine()
807
- for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
808
- table = TeradataTable(
809
- database=entry.DataBaseName.strip(),
810
- name=entry.name.strip(),
811
- description=entry.description.strip() if entry.description else None,
812
- object_type=entry.object_type,
813
- create_timestamp=entry.CreateTimeStamp,
814
- last_alter_name=entry.LastAlterName,
815
- last_alter_timestamp=entry.LastAlterTimeStamp,
816
- request_text=(
817
- entry.RequestText.strip()
818
- if entry.object_type == "View" and entry.RequestText
819
- else None
820
- ),
1150
+
1151
+ try:
1152
+ with engine.connect() as conn:
1153
+ inspector = inspect(conn)
1154
+
1155
+ for view_name in view_names:
1156
+ view_start_time = time.time()
1157
+ try:
1158
+ dataset_name = self.get_identifier(
1159
+ schema=schema, entity=view_name, inspector=inspector
1160
+ )
1161
+
1162
+ self.report.report_entity_scanned(dataset_name, ent_type="view")
1163
+
1164
+ if not sql_config.view_pattern.allowed(dataset_name):
1165
+ self.report.report_dropped(dataset_name)
1166
+ continue
1167
+
1168
+ # Process the view and yield results
1169
+ for work_unit in self._process_view(
1170
+ dataset_name=dataset_name,
1171
+ inspector=inspector,
1172
+ schema=schema,
1173
+ view=view_name,
1174
+ sql_config=sql_config,
1175
+ ):
1176
+ yield work_unit
1177
+
1178
+ # Track individual view timing
1179
+ view_end_time = time.time()
1180
+ view_processing_time = view_end_time - view_start_time
1181
+ self.report.slowest_view_name[f"{schema}.{view_name}"] = (
1182
+ view_processing_time
1183
+ )
1184
+
1185
+ except Exception as e:
1186
+ # Log full exception details for debugging
1187
+ import traceback
1188
+
1189
+ full_traceback = traceback.format_exc()
1190
+ logger.error(
1191
+ f"Failed to process view {schema}.{view_name}: {str(e)}"
1192
+ )
1193
+ logger.error(f"Full traceback: {full_traceback}")
1194
+ self.report.warning(
1195
+ f"Error processing view {schema}.{view_name}",
1196
+ context=f"View: {schema}.{view_name}, Error: {str(e)}",
1197
+ exc=e,
1198
+ )
1199
+
1200
+ finally:
1201
+ engine.dispose()
1202
+
1203
+ def _get_or_create_pooled_engine(self) -> Engine:
1204
+ """Get or create a reusable SQLAlchemy engine with QueuePool for concurrent connections."""
1205
+ with self._pooled_engine_lock:
1206
+ if self._pooled_engine is None:
1207
+ url = self.config.get_sql_alchemy_url()
1208
+
1209
+ # Optimal connection pool sizing to match max_workers exactly
1210
+ # Teradata driver can be sensitive to high connection counts, so cap at reasonable limit
1211
+ max_safe_connections = (
1212
+ 13 # Conservative limit: 8 base + 5 overflow for Teradata stability
1213
+ )
1214
+
1215
+ # Adjust max_workers to match available connection pool capacity
1216
+ effective_max_workers = min(
1217
+ self.config.max_workers, max_safe_connections
1218
+ )
1219
+
1220
+ # Set pool size to match effective workers for optimal performance
1221
+ base_connections = min(
1222
+ effective_max_workers, 8
1223
+ ) # Reasonable base connections
1224
+ max_overflow = (
1225
+ effective_max_workers - base_connections
1226
+ ) # Remaining as overflow
1227
+
1228
+ # Log adjustment if max_workers was reduced
1229
+ if effective_max_workers < self.config.max_workers:
1230
+ logger.warning(
1231
+ f"Reduced max_workers from {self.config.max_workers} to {effective_max_workers} to match Teradata connection pool capacity"
1232
+ )
1233
+
1234
+ # Update the config to reflect the effective value used
1235
+ self.config.max_workers = effective_max_workers
1236
+
1237
+ pool_options = {
1238
+ **self.config.options,
1239
+ "poolclass": QueuePool,
1240
+ "pool_size": base_connections,
1241
+ "max_overflow": max_overflow,
1242
+ "pool_pre_ping": True, # Validate connections
1243
+ "pool_recycle": 1800, # Recycle connections after 30 mins (more frequent)
1244
+ "pool_timeout": 60, # Longer timeout for connection acquisition
1245
+ "pool_reset_on_return": "rollback", # Explicit rollback on connection return
1246
+ }
1247
+
1248
+ # Add Teradata-specific connection options for stability
1249
+ if "connect_args" not in pool_options:
1250
+ pool_options["connect_args"] = {}
1251
+
1252
+ # Teradata-specific connection arguments for better stability
1253
+ pool_options["connect_args"].update(
1254
+ {
1255
+ "connect_timeout": "30000", # Connection timeout in ms (30 seconds)
1256
+ "request_timeout": "120000", # Request timeout in ms (2 minutes)
1257
+ }
1258
+ )
1259
+
1260
+ self._pooled_engine = create_engine(url, **pool_options)
1261
+ logger.info(
1262
+ f"Created optimized Teradata connection pool: {base_connections} base + {max_overflow} overflow = {base_connections + max_overflow} max connections (matching {effective_max_workers} workers)"
1263
+ )
1264
+
1265
+ return self._pooled_engine
1266
+
1267
+ def cache_tables_and_views(self) -> None:
1268
+ with self.report.new_stage("Cache tables and views"):
1269
+ engine = self.get_metadata_engine()
1270
+ try:
1271
+ database_counts: Dict[str, Dict[str, int]] = defaultdict(
1272
+ lambda: {"tables": 0, "views": 0}
1273
+ )
1274
+
1275
+ for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
1276
+ table = TeradataTable(
1277
+ database=entry.DataBaseName.strip(),
1278
+ name=entry.name.strip(),
1279
+ description=entry.description.strip()
1280
+ if entry.description
1281
+ else None,
1282
+ object_type=entry.object_type,
1283
+ create_timestamp=entry.CreateTimeStamp,
1284
+ last_alter_name=entry.LastAlterName,
1285
+ last_alter_timestamp=entry.LastAlterTimeStamp,
1286
+ request_text=(
1287
+ entry.RequestText.strip()
1288
+ if entry.object_type == "View" and entry.RequestText
1289
+ else None
1290
+ ),
1291
+ )
1292
+
1293
+ # Count objects per database for metrics
1294
+ if table.object_type == "View":
1295
+ database_counts[table.database]["views"] += 1
1296
+ else:
1297
+ database_counts[table.database]["tables"] += 1
1298
+
1299
+ with self._tables_cache_lock:
1300
+ if table.database not in self._tables_cache:
1301
+ self._tables_cache[table.database] = []
1302
+ self._tables_cache[table.database].append(table)
1303
+
1304
+ for database, counts in database_counts.items():
1305
+ self.report.num_database_tables_to_scan[database] = counts["tables"]
1306
+ self.report.num_database_views_to_scan[database] = counts["views"]
1307
+
1308
+ finally:
1309
+ engine.dispose()
1310
+
1311
+ def _reconstruct_queries_streaming(
1312
+ self, entries: Iterable[Any]
1313
+ ) -> Iterable[ObservedQuery]:
1314
+ """Reconstruct complete queries from database entries in streaming fashion.
1315
+
1316
+ This method processes entries in order and reconstructs multi-row queries
1317
+ by concatenating rows with the same query_id.
1318
+ """
1319
+ current_query_id = None
1320
+ current_query_parts = []
1321
+ current_query_metadata = None
1322
+
1323
+ for entry in entries:
1324
+ query_id = getattr(entry, "query_id", None)
1325
+ query_text = str(getattr(entry, "query_text", ""))
1326
+
1327
+ if query_id != current_query_id:
1328
+ # New query started - yield the previous one if it exists
1329
+ if current_query_id is not None and current_query_parts:
1330
+ yield self._create_observed_query_from_parts(
1331
+ current_query_parts, current_query_metadata
1332
+ )
1333
+
1334
+ # Start new query
1335
+ current_query_id = query_id
1336
+ current_query_parts = [query_text] if query_text else []
1337
+ current_query_metadata = entry
1338
+ else:
1339
+ # Same query - append the text
1340
+ if query_text:
1341
+ current_query_parts.append(query_text)
1342
+
1343
+ # Yield the last query if it exists
1344
+ if current_query_id is not None and current_query_parts:
1345
+ yield self._create_observed_query_from_parts(
1346
+ current_query_parts, current_query_metadata
821
1347
  )
822
- if table.database not in self._tables_cache:
823
- self._tables_cache[table.database] = []
824
1348
 
825
- self._tables_cache[table.database].append(table)
1349
+ def _create_observed_query_from_parts(
1350
+ self, query_parts: List[str], metadata_entry: Any
1351
+ ) -> ObservedQuery:
1352
+ """Create ObservedQuery from reconstructed query parts and metadata."""
1353
+ # Join all parts to form the complete query
1354
+ # Teradata fragments are split at fixed lengths without artificial breaks
1355
+ full_query_text = "".join(query_parts)
1356
+
1357
+ # Extract metadata
1358
+ session_id = getattr(metadata_entry, "session_id", None)
1359
+ timestamp = getattr(metadata_entry, "timestamp", None)
1360
+ user = getattr(metadata_entry, "user", None)
1361
+ default_database = getattr(metadata_entry, "default_database", None)
1362
+
1363
+ # Apply Teradata-specific query transformations
1364
+ cleaned_query = full_query_text.replace("(NOT CASESPECIFIC)", "")
1365
+
1366
+ return ObservedQuery(
1367
+ query=cleaned_query,
1368
+ session_id=session_id,
1369
+ timestamp=timestamp,
1370
+ user=CorpUserUrn(user) if user else None,
1371
+ default_db=default_database,
1372
+ default_schema=default_database, # Teradata uses database as schema
1373
+ )
1374
+
1375
+ def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
1376
+ """Convert database query entry to ObservedQuery for SqlParsingAggregator.
1377
+
1378
+ DEPRECATED: This method is deprecated in favor of _reconstruct_queries_streaming
1379
+ which properly handles multi-row queries. This method does not handle queries
1380
+ that span multiple rows correctly and should not be used.
1381
+ """
1382
+ # Extract fields from database result
1383
+ query_text = str(entry.query_text).strip()
1384
+ session_id = getattr(entry, "session_id", None)
1385
+ timestamp = getattr(entry, "timestamp", None)
1386
+ user = getattr(entry, "user", None)
1387
+ default_database = getattr(entry, "default_database", None)
1388
+
1389
+ # Apply Teradata-specific query transformations
1390
+ cleaned_query = query_text.replace("(NOT CASESPECIFIC)", "")
1391
+
1392
+ return ObservedQuery(
1393
+ query=cleaned_query,
1394
+ session_id=session_id,
1395
+ timestamp=timestamp,
1396
+ user=CorpUserUrn(user) if user else None,
1397
+ default_db=default_database,
1398
+ default_schema=default_database, # Teradata uses database as schema
1399
+ )
1400
+
1401
+ def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
1402
+ """Fetch lineage entries using server-side cursor to handle large result sets efficiently."""
1403
+ queries = self._make_lineage_queries()
1404
+
1405
+ fetch_engine = self.get_metadata_engine()
1406
+ try:
1407
+ with fetch_engine.connect() as conn:
1408
+ cursor_type = (
1409
+ "server-side"
1410
+ if self.config.use_server_side_cursors
1411
+ else "client-side"
1412
+ )
1413
+
1414
+ total_count_all_queries = 0
1415
+
1416
+ for query_index, query in enumerate(queries, 1):
1417
+ logger.info(
1418
+ f"Executing lineage query {query_index}/{len(queries)} with {cursor_type} cursor..."
1419
+ )
1420
+
1421
+ # Use helper method to try server-side cursor with fallback
1422
+ result = self._execute_with_cursor_fallback(conn, query)
1423
+
1424
+ # Stream results in batches to avoid memory issues
1425
+ batch_size = 5000
1426
+ batch_count = 0
1427
+ query_total_count = 0
1428
+
1429
+ while True:
1430
+ # Fetch a batch of rows
1431
+ batch = result.fetchmany(batch_size)
1432
+ if not batch:
1433
+ break
1434
+
1435
+ batch_count += 1
1436
+ query_total_count += len(batch)
1437
+ total_count_all_queries += len(batch)
1438
+
1439
+ logger.info(
1440
+ f"Query {query_index} - Fetched batch {batch_count}: {len(batch)} lineage entries (query total: {query_total_count})"
1441
+ )
1442
+ yield from batch
1443
+
1444
+ logger.info(
1445
+ f"Completed query {query_index}: {query_total_count} lineage entries in {batch_count} batches"
1446
+ )
1447
+
1448
+ logger.info(
1449
+ f"Completed fetching all queries: {total_count_all_queries} total lineage entries from {len(queries)} queries"
1450
+ )
826
1451
 
827
- def get_audit_log_mcps(self, urns: Set[str]) -> Iterable[MetadataWorkUnit]:
1452
+ except Exception as e:
1453
+ logger.error(f"Error fetching lineage entries: {e}")
1454
+ raise
1455
+ finally:
1456
+ fetch_engine.dispose()
1457
+
+    def _check_historical_table_exists(self) -> bool:
+        """
+        Check if the PDCRINFO.DBQLSqlTbl_Hst table exists and is accessible.
+        DBQL rows are periodically moved to the history table, so audit queries may no longer exist in DBC.
+        There is no guarantee that the historical table exists, so we need to check for it.
+
+        Returns:
+            bool: True if the historical table exists and is accessible, False otherwise.
+        """
         engine = self.get_metadata_engine()
-        for entry in engine.execute(self._make_lineage_query()):
-            self.report.num_queries_parsed += 1
-            if self.report.num_queries_parsed % 1000 == 0:
-                logger.info(f"Parsed {self.report.num_queries_parsed} queries")
-
-            yield from self.gen_lineage_from_query(
-                query=entry.query_text,
-                default_database=entry.default_database,
-                timestamp=entry.timestamp,
-                user=entry.user,
-                urns=urns,
+        try:
+            # Use a simple query to check if the table exists and is accessible
+            check_query = """
+                SELECT TOP 1 QueryID
+                FROM PDCRINFO.DBQLSqlTbl_Hst
+                WHERE 1=0
+            """
+            with engine.connect() as conn:
+                conn.execute(text(check_query))
+            logger.info(
+                "Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is available"
+            )
+            return True
+        except Exception as e:
+            logger.info(
+                f"Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is not available: {e}"
             )
+            return False
+        finally:
+            engine.dispose()
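The `WHERE 1=0` predicate above makes the probe resolve the table name and verify access rights without ever scanning rows. A generic version of the same pattern, a minimal sketch in which the engine and table name are placeholders rather than anything defined in this diff:

```python
from sqlalchemy import text
from sqlalchemy.engine import Engine


def table_is_accessible(engine: Engine, qualified_name: str) -> bool:
    """Return True if the table can be resolved and queried, without reading any rows."""
    try:
        with engine.connect() as conn:
            # WHERE 1=0 guarantees an empty result, so only name resolution
            # and permissions are exercised.
            conn.execute(text(f"SELECT 1 FROM {qualified_name} WHERE 1=0"))
        return True
    except Exception:
        return False
```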

-    def _make_lineage_query(self) -> str:
+    def _make_lineage_queries(self) -> List[str]:
         databases_filter = (
             ""
             if not self.config.databases
-            else "and default_database in ({databases})".format(
+            else "and l.DefaultDatabase in ({databases})".format(
                 databases=",".join([f"'{db}'" for db in self.config.databases])
             )
         )

-        query = self.QUERY_TEXT_QUERY.format(
-            start_time=self.config.start_time,
-            end_time=self.config.end_time,
-            databases_filter=databases_filter,
-        )
-        return query
+        queries = []

-    def gen_lineage_from_query(
-        self,
-        query: str,
-        default_database: Optional[str] = None,
-        timestamp: Optional[datetime] = None,
-        user: Optional[str] = None,
-        view_urn: Optional[str] = None,
-        urns: Optional[Set[str]] = None,
-    ) -> Iterable[MetadataWorkUnit]:
-        result = sqlglot_lineage(
-            # With this clever hack we can make the query parser to not fail on queries with CASESPECIFIC
-            sql=query.replace("(NOT CASESPECIFIC)", ""),
-            schema_resolver=self.schema_resolver,
-            default_db=None,
-            default_schema=(
-                default_database if default_database else self.config.default_db
-            ),
-        )
-        if result.debug_info.table_error:
-            logger.debug(
-                f"Error parsing table lineage ({view_urn}):\n{result.debug_info.table_error}"
+        # Check if historical lineage is configured and available
+        if (
+            self.config.include_historical_lineage
+            and self._check_historical_table_exists()
+        ):
+            logger.info(
+                "Using UNION query to combine historical and current lineage data to avoid duplicates"
             )
-            self.report.num_table_parse_failures += 1
+            # For historical query, we need the database filter for historical part
+            databases_filter_history = (
+                databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase")
+                if databases_filter
+                else ""
+            )
+
+            union_query = self.QUERY_TEXT_HISTORICAL_UNION.format(
+                start_time=self.config.start_time,
+                end_time=self.config.end_time,
+                databases_filter=databases_filter,
+                databases_filter_history=databases_filter_history,
+            )
+            queries.append(union_query)
         else:
-            yield from self.builder.process_sql_parsing_result(
-                result,
-                query=query,
-                is_view_ddl=view_urn is not None,
-                query_timestamp=timestamp,
-                user=f"urn:li:corpuser:{user}",
-                include_urns=urns,
+            if self.config.include_historical_lineage:
+                logger.warning(
+                    "Historical lineage was requested but PDCRINFO.DBQLSqlTbl_Hst table is not available. Falling back to current data only."
+                )
+
+            # Use current-only query when historical data is not available
+            current_query = self.QUERY_TEXT_CURRENT_QUERIES.format(
+                start_time=self.config.start_time,
+                end_time=self.config.end_time,
+                databases_filter=databases_filter,
             )
+            queries.append(current_query)
+
+        return queries
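As a quick worked example of the filter string built at the top of `_make_lineage_queries` (the database names below are made up):

```python
databases = ["sales", "marketing"]

databases_filter = "and l.DefaultDatabase in ({databases})".format(
    databases=",".join([f"'{db}'" for db in databases])
)
print(databases_filter)
# and l.DefaultDatabase in ('sales','marketing')

# The historical branch swaps the table alias for the PDCRINFO side of the UNION:
print(databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase"))
# and h.DefaultDatabase in ('sales','marketing')
```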

     def get_metadata_engine(self) -> Engine:
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         return create_engine(url, **self.config.options)

-    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
+    def _execute_with_cursor_fallback(
+        self, connection: Connection, query: str, params: Optional[Dict] = None
+    ) -> Any:
+        """
+        Execute query with server-side cursor if enabled and supported, otherwise fall back to regular execution.
+
+        Args:
+            connection: Database connection
+            query: SQL query to execute
+            params: Query parameters
+
+        Returns:
+            Query result object
+        """
+        if self.config.use_server_side_cursors:
+            try:
+                # Try server-side cursor first
+                if params:
+                    result = connection.execution_options(stream_results=True).execute(
+                        text(query), params
+                    )
+                else:
+                    result = connection.execution_options(stream_results=True).execute(
+                        text(query)
+                    )
+
+                logger.debug(
+                    "Successfully using server-side cursor for query execution"
+                )
+                return result
+
+            except Exception as e:
+                logger.warning(
+                    f"Server-side cursor failed, falling back to client-side execution: {e}"
+                )
+                # Fall through to regular execution
+
+        # Regular execution (client-side)
+        if params:
+            return connection.execute(text(query), params)
+        else:
+            return connection.execute(text(query))
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Starting Teradata metadata extraction")
+
         # Add all schemas to the schema resolver
         # Sql parser operates on lowercase urns so we need to lowercase the urns
-        for wu in auto_lowercase_urns(super().get_workunits_internal()):
-            urn = wu.get_urn()
-            schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
-            if schema_metadata:
-                self.schema_resolver.add_schema_metadata(urn, schema_metadata)
-            yield wu
-
-        urns = self.schema_resolver.get_urns()
+        with self.report.new_stage("Schema metadata extraction"):
+            yield from super().get_workunits_internal()
+            logger.info("Completed schema metadata extraction")
+
+        with self.report.new_stage("Audit log extraction"):
+            yield from self._get_audit_log_mcps_with_aggregator()
+
+        # SqlParsingAggregator handles its own work unit generation internally
+        logger.info("Lineage processing completed by SqlParsingAggregator")
+
+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Override base class to skip aggregator gen_metadata() call.
+
+        Teradata handles aggregator processing after adding audit log queries,
+        so we skip the base class call to prevent duplicate processing.
+        """
+        # Return empty iterator - Teradata will handle aggregator processing
+        # after adding audit log queries in _get_audit_log_mcps_with_aggregator()
+        return iter([])
+
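The `_generate_aggregator_workunits` override is easy to misread: the base class would normally drain the aggregator on its own, so returning an empty iterator is what prevents the queries added later in `_get_audit_log_mcps_with_aggregator` from being emitted twice. A stripped-down, hypothetical illustration of that hook pattern (class and method names are invented for the example, not DataHub APIs):

```python
from typing import Iterator, List


class BaseSource:
    def __init__(self) -> None:
        self.collected: List[str] = ["early-item"]

    def _drain_hook(self) -> Iterator[str]:
        # Default: the base class drains the collection itself.
        yield from self.collected

    def get_workunits(self) -> Iterator[str]:
        yield from self._drain_hook()


class DeferredSource(BaseSource):
    def _drain_hook(self) -> Iterator[str]:
        # Suppress the base-class drain; this subclass drains later, itself.
        return iter([])

    def get_workunits(self) -> Iterator[str]:
        yield from super().get_workunits()  # yields nothing, thanks to the override
        self.collected.append("late-item")
        yield from self.collected  # single drain that includes the late additions
```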
+    def _get_audit_log_mcps_with_aggregator(self) -> Iterable[MetadataWorkUnit]:
+        """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
+        logger.info(
+            "Fetching queries from Teradata audit logs for SqlParsingAggregator"
+        )
+
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            with self.report.new_stage("Audit log extraction"):
-                yield from self.get_audit_log_mcps(urns=urns)
+            # Step 1: Stream query entries from database with memory-efficient processing
+            with self.report.new_stage("Fetching lineage entries from Audit Logs"):
+                queries_processed = 0
+                entries_processed = False
+
+                # Use streaming query reconstruction for memory efficiency
+                for observed_query in self._reconstruct_queries_streaming(
+                    self._fetch_lineage_entries_chunked()
+                ):
+                    entries_processed = True
+                    self.aggregator.add(observed_query)
+
+                    queries_processed += 1
+                    if queries_processed % 10000 == 0:
+                        logger.info(
+                            f"Processed {queries_processed} queries to aggregator"
+                        )
+
+                if not entries_processed:
+                    logger.info("No lineage entries found")
+                    return
+
+                logger.info(
+                    f"Completed adding {queries_processed} queries to SqlParsingAggregator"
+                )
+
+            # Step 2: Generate work units from aggregator
+            with self.report.new_stage("SqlParsingAggregator metadata generation"):
+                logger.info("Generating metadata work units from SqlParsingAggregator")
+                work_unit_count = 0
+                for mcp in self.aggregator.gen_metadata():
+                    work_unit_count += 1
+                    if work_unit_count % 10000 == 0:
+                        logger.info(
+                            f"Generated {work_unit_count} work units from aggregator"
+                        )
+                    yield mcp.as_workunit()
+
+                logger.info(
+                    f"Completed SqlParsingAggregator processing: {work_unit_count} work units generated"
+                )
+
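The method above follows the aggregator's two-phase contract: feed every `ObservedQuery` with `add()`, then call `gen_metadata()` once to emit the combined lineage and usage aspects. A minimal sketch of that contract, assuming an already-constructed `SqlParsingAggregator` and the import paths used elsewhere in this package:

```python
from typing import Iterable, Iterator

from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.sql_parsing.sql_parsing_aggregator import (
    ObservedQuery,
    SqlParsingAggregator,
)


def emit_lineage(
    aggregator: SqlParsingAggregator, queries: Iterable[ObservedQuery]
) -> Iterator[MetadataWorkUnit]:
    # Phase 1: register every observed query; nothing is emitted yet.
    for query in queries:
        aggregator.add(query)

    # Phase 2: a single gen_metadata() pass resolves lineage/usage and
    # yields change proposals to wrap as work units.
    for mcp in aggregator.gen_metadata():
        yield mcp.as_workunit()
```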
+    def close(self) -> None:
+        """Clean up resources when source is closed."""
+        logger.info("Closing SqlParsingAggregator")
+        self.aggregator.close()
+
+        # Clean up pooled engine
+        with self._pooled_engine_lock:
+            if self._pooled_engine is not None:
+                logger.info("Disposing pooled engine")
+                self._pooled_engine.dispose()
+                self._pooled_engine = None

-        yield from self.builder.gen_workunits()
+        # Report failed views summary
+        super().close()