acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the content differences between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import re
|
|
3
|
+
import time
|
|
2
4
|
from collections import defaultdict
|
|
3
|
-
from
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from dataclasses import dataclass, field
|
|
4
7
|
from datetime import datetime
|
|
5
8
|
from functools import lru_cache
|
|
9
|
+
from threading import Lock
|
|
6
10
|
from typing import (
|
|
7
11
|
Any,
|
|
8
12
|
Dict,
|
|
@@ -10,7 +14,6 @@ from typing import (
|
|
|
10
14
|
List,
|
|
11
15
|
MutableMapping,
|
|
12
16
|
Optional,
|
|
13
|
-
Set,
|
|
14
17
|
Tuple,
|
|
15
18
|
Union,
|
|
16
19
|
)
|
|
@@ -29,7 +32,6 @@ from teradatasqlalchemy.options import configure
|
|
|
29
32
|
|
|
30
33
|
from datahub.configuration.common import AllowDenyPattern
|
|
31
34
|
from datahub.configuration.time_window_config import BaseTimeWindowConfig
|
|
32
|
-
from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
|
|
33
35
|
from datahub.ingestion.api.common import PipelineContext
|
|
34
36
|
from datahub.ingestion.api.decorators import (
|
|
35
37
|
SourceCapability,
|
|
@@ -39,10 +41,10 @@ from datahub.ingestion.api.decorators import (
|
|
|
39
41
|
platform_name,
|
|
40
42
|
support_status,
|
|
41
43
|
)
|
|
42
|
-
from datahub.ingestion.api.source_helpers import auto_lowercase_urns
|
|
43
44
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
44
45
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
45
|
-
from datahub.ingestion.source.
|
|
46
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
47
|
+
from datahub.ingestion.source.sql.sql_common import register_custom_type
|
|
46
48
|
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
|
|
47
49
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
48
50
|
from datahub.ingestion.source.sql.two_tier_sql_source import (
|
|
@@ -50,19 +52,75 @@ from datahub.ingestion.source.sql.two_tier_sql_source import (
|
|
|
50
52
|
TwoTierSQLAlchemySource,
|
|
51
53
|
)
|
|
52
54
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
53
|
-
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
54
55
|
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
55
56
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
56
57
|
BytesTypeClass,
|
|
57
58
|
TimeTypeClass,
|
|
58
59
|
)
|
|
59
|
-
from datahub.metadata.
|
|
60
|
+
from datahub.metadata.urns import CorpUserUrn
|
|
60
61
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
61
|
-
from datahub.sql_parsing.
|
|
62
|
+
from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
63
|
+
ObservedQuery,
|
|
64
|
+
SqlParsingAggregator,
|
|
65
|
+
)
|
|
62
66
|
from datahub.utilities.groupby import groupby_unsorted
|
|
67
|
+
from datahub.utilities.stats_collections import TopKDict
|
|
63
68
|
|
|
64
69
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
65
70
|
|
|
71
|
+
# Precompiled regex pattern for case-insensitive "(not casespecific)" removal
|
|
72
|
+
NOT_CASESPECIFIC_PATTERN = re.compile(r"\(not casespecific\)", re.IGNORECASE)
|
|
73
|
+
|
|
74
|
+
# Teradata uses a two-tier database.table naming approach without default database prefixing
|
|
75
|
+
DEFAULT_NO_DATABASE_TERADATA = None
|
|
76
|
+
|
|
77
|
+
# Common excluded databases used in multiple places
|
|
78
|
+
EXCLUDED_DATABASES = [
|
|
79
|
+
"All",
|
|
80
|
+
"Crashdumps",
|
|
81
|
+
"Default",
|
|
82
|
+
"DemoNow_Monitor",
|
|
83
|
+
"EXTUSER",
|
|
84
|
+
"External_AP",
|
|
85
|
+
"GLOBAL_FUNCTIONS",
|
|
86
|
+
"LockLogShredder",
|
|
87
|
+
"PUBLIC",
|
|
88
|
+
"SQLJ",
|
|
89
|
+
"SYSBAR",
|
|
90
|
+
"SYSJDBC",
|
|
91
|
+
"SYSLIB",
|
|
92
|
+
"SYSSPATIAL",
|
|
93
|
+
"SYSUDTLIB",
|
|
94
|
+
"SYSUIF",
|
|
95
|
+
"SysAdmin",
|
|
96
|
+
"Sys_Calendar",
|
|
97
|
+
"SystemFe",
|
|
98
|
+
"TDBCMgmt",
|
|
99
|
+
"TDMaps",
|
|
100
|
+
"TDPUSER",
|
|
101
|
+
"TDQCD",
|
|
102
|
+
"TDStats",
|
|
103
|
+
"TD_ANALYTICS_DB",
|
|
104
|
+
"TD_SERVER_DB",
|
|
105
|
+
"TD_SYSFNLIB",
|
|
106
|
+
"TD_SYSGPL",
|
|
107
|
+
"TD_SYSXML",
|
|
108
|
+
"TDaaS_BAR",
|
|
109
|
+
"TDaaS_DB",
|
|
110
|
+
"TDaaS_Maint",
|
|
111
|
+
"TDaaS_Monitor",
|
|
112
|
+
"TDaaS_Support",
|
|
113
|
+
"TDaaS_TDBCMgmt1",
|
|
114
|
+
"TDaaS_TDBCMgmt2",
|
|
115
|
+
"dbcmngr",
|
|
116
|
+
"mldb",
|
|
117
|
+
"system",
|
|
118
|
+
"tapidb",
|
|
119
|
+
"tdwm",
|
|
120
|
+
"val",
|
|
121
|
+
"dbc",
|
|
122
|
+
]
|
|
123
|
+
|
|
66
124
|
register_custom_type(custom_types.JSON, BytesTypeClass)
|
|
67
125
|
register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass)
|
|
68
126
|
register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass)
|
|
@@ -99,14 +157,16 @@ class TeradataTable:
|
|
|
99
157
|
request_text: Optional[str]
|
|
100
158
|
|
|
101
159
|
|
|
102
|
-
#
|
|
160
|
+
# Cache size of 1 is sufficient since schemas are processed sequentially
|
|
161
|
+
# Note: This cache is per-process and helps when processing multiple tables in the same schema
|
|
103
162
|
@lru_cache(maxsize=1)
|
|
104
163
|
def get_schema_columns(
|
|
105
164
|
self: Any, connection: Connection, dbc_columns: str, schema: str
|
|
106
165
|
) -> Dict[str, List[Any]]:
|
|
166
|
+
start_time = time.time()
|
|
107
167
|
columns: Dict[str, List[Any]] = {}
|
|
108
|
-
columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) =
|
|
109
|
-
rows = connection.execute(text(columns_query)).fetchall()
|
|
168
|
+
columns_query = f"select * from dbc.{dbc_columns} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) order by TableName, ColumnId"
|
|
169
|
+
rows = connection.execute(text(columns_query), {"schema": schema}).fetchall()
|
|
110
170
|
for row in rows:
|
|
111
171
|
row_mapping = row._mapping
|
|
112
172
|
if row_mapping.TableName not in columns:
|
|
@@ -114,18 +174,29 @@ def get_schema_columns(
|
|
|
114
174
|
|
|
115
175
|
columns[row_mapping.TableName].append(row_mapping)
|
|
116
176
|
|
|
177
|
+
end_time = time.time()
|
|
178
|
+
extraction_time = end_time - start_time
|
|
179
|
+
logger.info(
|
|
180
|
+
f"Column extraction for schema '{schema}' completed in {extraction_time:.2f} seconds"
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Update report if available
|
|
184
|
+
if hasattr(self, "report"):
|
|
185
|
+
self.report.column_extraction_duration_seconds += extraction_time
|
|
186
|
+
|
|
117
187
|
return columns
|
|
118
188
|
|
|
119
189
|
|
|
120
|
-
#
|
|
190
|
+
# Cache size of 1 is sufficient since schemas are processed sequentially
|
|
191
|
+
# Note: This cache is per-process and helps when processing multiple tables in the same schema
|
|
121
192
|
@lru_cache(maxsize=1)
|
|
122
193
|
def get_schema_pk_constraints(
|
|
123
194
|
self: Any, connection: Connection, schema: str
|
|
124
195
|
) -> Dict[str, List[Any]]:
|
|
125
196
|
dbc_indices = "IndicesV" + "X" if configure.usexviews else "IndicesV"
|
|
126
197
|
primary_keys: Dict[str, List[Any]] = {}
|
|
127
|
-
stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) =
|
|
128
|
-
rows = connection.execute(text(stmt)).fetchall()
|
|
198
|
+
stmt = f"select * from dbc.{dbc_indices} where DatabaseName (NOT CASESPECIFIC) = :schema (NOT CASESPECIFIC) and IndexType = 'K' order by IndexNumber"
|
|
199
|
+
rows = connection.execute(text(stmt), {"schema": schema}).fetchall()
|
|
129
200
|
for row in rows:
|
|
130
201
|
row_mapping = row._mapping
|
|
131
202
|
if row_mapping.TableName not in primary_keys:
|
|
@@ -172,6 +243,10 @@ def optimized_get_pk_constraint(
|
|
|
172
243
|
index_column.IndexName
|
|
173
244
|
) # There should be just one IndexName
|
|
174
245
|
|
|
246
|
+
# Update counter if available
|
|
247
|
+
if hasattr(self, "report"):
|
|
248
|
+
self.report.num_primary_keys_processed += 1
|
|
249
|
+
|
|
175
250
|
return {"constrained_columns": index_columns, "name": index_name}
|
|
176
251
|
|
|
177
252
|
|
|
@@ -228,23 +303,55 @@ def optimized_get_columns(
|
|
|
228
303
|
table_name, []
|
|
229
304
|
)
|
|
230
305
|
|
|
306
|
+
start_time = time.time()
|
|
307
|
+
|
|
231
308
|
final_column_info = []
|
|
232
309
|
# Don't care about ART tables now
|
|
233
310
|
# Ignore the non-functional column in a PTI table
|
|
234
311
|
for row in res:
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
312
|
+
try:
|
|
313
|
+
col_info = self._get_column_info(row)
|
|
314
|
+
|
|
315
|
+
# Add CommentString as comment field for column description
|
|
316
|
+
if hasattr(row, "CommentString") and row.CommentString:
|
|
317
|
+
col_info["comment"] = row.CommentString.strip()
|
|
318
|
+
elif (
|
|
319
|
+
isinstance(row, dict)
|
|
320
|
+
and "CommentString" in row
|
|
321
|
+
and row["CommentString"]
|
|
240
322
|
):
|
|
241
|
-
|
|
242
|
-
|
|
323
|
+
col_info["comment"] = row["CommentString"].strip()
|
|
324
|
+
|
|
325
|
+
if "TSColumnType" in col_info and col_info["TSColumnType"] is not None:
|
|
326
|
+
if (
|
|
327
|
+
col_info["ColumnName"] == "TD_TIMEBUCKET"
|
|
328
|
+
and col_info["TSColumnType"].strip() == "TB"
|
|
329
|
+
):
|
|
330
|
+
continue
|
|
331
|
+
final_column_info.append(col_info)
|
|
332
|
+
|
|
333
|
+
# Update counter - access report through self from the connection context
|
|
334
|
+
if hasattr(self, "report"):
|
|
335
|
+
self.report.num_columns_processed += 1
|
|
336
|
+
|
|
337
|
+
except Exception as e:
|
|
338
|
+
logger.error(
|
|
339
|
+
f"Failed to process column {getattr(row, 'ColumnName', 'unknown')}: {e}"
|
|
340
|
+
)
|
|
341
|
+
if hasattr(self, "report"):
|
|
342
|
+
self.report.num_column_extraction_failures += 1
|
|
343
|
+
continue
|
|
344
|
+
|
|
345
|
+
# Update timing
|
|
346
|
+
if hasattr(self, "report"):
|
|
347
|
+
end_time = time.time()
|
|
348
|
+
self.report.column_extraction_duration_seconds += end_time - start_time
|
|
243
349
|
|
|
244
350
|
return final_column_info
|
|
245
351
|
|
|
246
352
|
|
|
247
|
-
#
|
|
353
|
+
# Cache size of 1 is sufficient since schemas are processed sequentially
|
|
354
|
+
# Note: This cache is per-process and helps when processing multiple tables in the same schema
|
|
248
355
|
@lru_cache(maxsize=1)
|
|
249
356
|
def get_schema_foreign_keys(
|
|
250
357
|
self: Any, connection: Connection, schema: str
|
|
@@ -333,10 +440,32 @@ def optimized_get_view_definition(
|
|
|
333
440
|
|
|
334
441
|
|
|
335
442
|
@dataclass
|
|
336
|
-
class TeradataReport(SQLSourceReport,
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
443
|
+
class TeradataReport(SQLSourceReport, BaseTimeWindowReport):
|
|
444
|
+
# View processing metrics (actively used)
|
|
445
|
+
num_views_processed: int = 0
|
|
446
|
+
num_view_processing_failures: int = 0
|
|
447
|
+
view_extraction_total_time_seconds: float = 0.0
|
|
448
|
+
view_extraction_average_time_seconds: float = 0.0
|
|
449
|
+
slowest_view_processing_time_seconds: float = 0.0
|
|
450
|
+
slowest_view_name: TopKDict[str, float] = field(default_factory=TopKDict)
|
|
451
|
+
|
|
452
|
+
# Connection pool performance metrics (actively used)
|
|
453
|
+
connection_pool_wait_time_seconds: float = 0.0
|
|
454
|
+
connection_pool_max_wait_time_seconds: float = 0.0
|
|
455
|
+
|
|
456
|
+
# Database-level metrics similar to BigQuery's approach (actively used)
|
|
457
|
+
num_database_tables_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
|
|
458
|
+
num_database_views_to_scan: TopKDict[str, int] = field(default_factory=TopKDict)
|
|
459
|
+
|
|
460
|
+
# Global metadata extraction timing (single query for all databases)
|
|
461
|
+
metadata_extraction_total_sec: float = 0.0
|
|
462
|
+
|
|
463
|
+
# Lineage extraction query time range (actively used)
|
|
464
|
+
lineage_start_time: Optional[datetime] = None
|
|
465
|
+
lineage_end_time: Optional[datetime] = None
|
|
466
|
+
|
|
467
|
+
# Audit query processing statistics
|
|
468
|
+
num_audit_query_entries_processed: int = 0
|
|
340
469
|
|
|
341
470
|
|
|
342
471
|
class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
|
|
@@ -352,67 +481,28 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
|
|
|
352
481
|
),
|
|
353
482
|
)
|
|
354
483
|
|
|
355
|
-
database_pattern = Field(
|
|
356
|
-
default=AllowDenyPattern(
|
|
357
|
-
deny=[
|
|
358
|
-
"All",
|
|
359
|
-
"Crashdumps",
|
|
360
|
-
"Default",
|
|
361
|
-
"DemoNow_Monitor",
|
|
362
|
-
"EXTUSER",
|
|
363
|
-
"External_AP",
|
|
364
|
-
"GLOBAL_FUNCTIONS",
|
|
365
|
-
"LockLogShredder",
|
|
366
|
-
"PUBLIC",
|
|
367
|
-
"SQLJ",
|
|
368
|
-
"SYSBAR",
|
|
369
|
-
"SYSJDBC",
|
|
370
|
-
"SYSLIB",
|
|
371
|
-
"SYSSPATIAL",
|
|
372
|
-
"SYSUDTLIB",
|
|
373
|
-
"SYSUIF",
|
|
374
|
-
"SysAdmin",
|
|
375
|
-
"Sys_Calendar",
|
|
376
|
-
"SystemFe",
|
|
377
|
-
"TDBCMgmt",
|
|
378
|
-
"TDMaps",
|
|
379
|
-
"TDPUSER",
|
|
380
|
-
"TDQCD",
|
|
381
|
-
"TDStats",
|
|
382
|
-
"TD_ANALYTICS_DB",
|
|
383
|
-
"TD_SERVER_DB",
|
|
384
|
-
"TD_SYSFNLIB",
|
|
385
|
-
"TD_SYSGPL",
|
|
386
|
-
"TD_SYSXML",
|
|
387
|
-
"TDaaS_BAR",
|
|
388
|
-
"TDaaS_DB",
|
|
389
|
-
"TDaaS_Maint",
|
|
390
|
-
"TDaaS_Monitor",
|
|
391
|
-
"TDaaS_Support",
|
|
392
|
-
"TDaaS_TDBCMgmt1",
|
|
393
|
-
"TDaaS_TDBCMgmt2",
|
|
394
|
-
"dbcmngr",
|
|
395
|
-
"mldb",
|
|
396
|
-
"system",
|
|
397
|
-
"tapidb",
|
|
398
|
-
"tdwm",
|
|
399
|
-
"val",
|
|
400
|
-
"dbc",
|
|
401
|
-
]
|
|
402
|
-
),
|
|
484
|
+
database_pattern: AllowDenyPattern = Field(
|
|
485
|
+
default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
|
|
403
486
|
description="Regex patterns for databases to filter in ingestion.",
|
|
404
487
|
)
|
|
405
|
-
include_table_lineage = Field(
|
|
488
|
+
include_table_lineage: bool = Field(
|
|
406
489
|
default=False,
|
|
407
490
|
description="Whether to include table lineage in the ingestion. "
|
|
408
491
|
"This requires to have the table lineage feature enabled.",
|
|
409
492
|
)
|
|
410
493
|
|
|
411
|
-
include_view_lineage = Field(
|
|
494
|
+
include_view_lineage: bool = Field(
|
|
412
495
|
default=True,
|
|
413
496
|
description="Whether to include view lineage in the ingestion. "
|
|
414
497
|
"This requires to have the view lineage feature enabled.",
|
|
415
498
|
)
|
|
499
|
+
|
|
500
|
+
include_queries: bool = Field(
|
|
501
|
+
default=True,
|
|
502
|
+
description="Whether to generate query entities for SQL queries. "
|
|
503
|
+
"Query entities provide metadata about individual SQL queries including "
|
|
504
|
+
"execution timestamps, user information, and query text.",
|
|
505
|
+
)
|
|
416
506
|
usage: BaseUsageConfig = Field(
|
|
417
507
|
description="The usage config to use when generating usage statistics",
|
|
418
508
|
default=BaseUsageConfig(),
|
|
@@ -438,14 +528,43 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
|
|
|
438
528
|
description="Whether to use QVCI to get column information. This is faster but requires to have QVCI enabled.",
|
|
439
529
|
)
|
|
440
530
|
|
|
531
|
+
include_historical_lineage: bool = Field(
|
|
532
|
+
default=False,
|
|
533
|
+
description="Whether to include historical lineage data from PDCRINFO.DBQLSqlTbl_Hst in addition to current DBC.QryLogV data. "
|
|
534
|
+
"This provides access to historical query logs that may have been archived. "
|
|
535
|
+
"The historical table existence is checked automatically and gracefully falls back to current data only if not available.",
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
use_server_side_cursors: bool = Field(
|
|
539
|
+
default=True,
|
|
540
|
+
description="Enable server-side cursors for large result sets using SQLAlchemy's stream_results. "
|
|
541
|
+
"This reduces memory usage by streaming results from the database server. "
|
|
542
|
+
"Automatically falls back to client-side batching if server-side cursors are not supported.",
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
max_workers: int = Field(
|
|
546
|
+
default=10,
|
|
547
|
+
description="Maximum number of worker threads to use for parallel processing. "
|
|
548
|
+
"Controls the level of concurrency for operations like view processing.",
|
|
549
|
+
)
|
|
550
|
+
|
|
441
551
|
|
|
442
552
|
@platform_name("Teradata")
|
|
443
553
|
@config_class(TeradataConfig)
|
|
444
554
|
@support_status(SupportStatus.TESTING)
|
|
445
555
|
@capability(SourceCapability.DOMAINS, "Enabled by default")
|
|
446
|
-
@capability(
|
|
556
|
+
@capability(
|
|
557
|
+
SourceCapability.CONTAINERS,
|
|
558
|
+
"Enabled by default",
|
|
559
|
+
subtype_modifier=[
|
|
560
|
+
SourceCapabilityModifier.DATABASE,
|
|
561
|
+
],
|
|
562
|
+
)
|
|
447
563
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
448
|
-
@capability(
|
|
564
|
+
@capability(
|
|
565
|
+
SourceCapability.DELETION_DETECTION,
|
|
566
|
+
"Enabled by default when stateful ingestion is turned on",
|
|
567
|
+
)
|
|
449
568
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
450
569
|
@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
|
|
451
570
|
@capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration")
|
|
@@ -461,13 +580,7 @@ class TeradataSource(TwoTierSQLAlchemySource):
|
|
|
461
580
|
|
|
462
581
|
config: TeradataConfig
|
|
463
582
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
LINEAGE_TIMESTAMP_BOUND_QUERY: str = """
|
|
467
|
-
SELECT MIN(CollectTimeStamp) as "min_ts", MAX(CollectTimeStamp) as "max_ts" from DBC.QryLogV
|
|
468
|
-
""".strip()
|
|
469
|
-
|
|
470
|
-
QUERY_TEXT_QUERY: str = """
|
|
583
|
+
QUERY_TEXT_CURRENT_QUERIES: str = """
|
|
471
584
|
SELECT
|
|
472
585
|
s.QueryID as "query_id",
|
|
473
586
|
UserName as "user",
|
|
@@ -500,10 +613,89 @@ class TeradataSource(TwoTierSQLAlchemySource):
|
|
|
500
613
|
and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
|
|
501
614
|
and default_database not in ('DEMONOW_MONITOR')
|
|
502
615
|
{databases_filter}
|
|
503
|
-
ORDER BY "query_id", "row_no"
|
|
616
|
+
ORDER BY "timestamp", "query_id", "row_no"
|
|
504
617
|
""".strip()
|
|
505
618
|
|
|
506
|
-
|
|
619
|
+
QUERY_TEXT_HISTORICAL_UNION: str = """
|
|
620
|
+
SELECT
|
|
621
|
+
"query_id",
|
|
622
|
+
"user",
|
|
623
|
+
"timestamp",
|
|
624
|
+
default_database,
|
|
625
|
+
"query_text",
|
|
626
|
+
"row_no"
|
|
627
|
+
FROM (
|
|
628
|
+
SELECT
|
|
629
|
+
h.QueryID as "query_id",
|
|
630
|
+
h.UserName as "user",
|
|
631
|
+
h.StartTime AT TIME ZONE 'GMT' as "timestamp",
|
|
632
|
+
h.DefaultDatabase as default_database,
|
|
633
|
+
h.SqlTextInfo as "query_text",
|
|
634
|
+
h.SqlRowNo as "row_no"
|
|
635
|
+
FROM "PDCRINFO".DBQLSqlTbl_Hst as h
|
|
636
|
+
WHERE
|
|
637
|
+
h.ErrorCode = 0
|
|
638
|
+
AND h.statementtype not in (
|
|
639
|
+
'Unrecognized type',
|
|
640
|
+
'Create Database/User',
|
|
641
|
+
'Help',
|
|
642
|
+
'Modify Database',
|
|
643
|
+
'Drop Table',
|
|
644
|
+
'Show',
|
|
645
|
+
'Not Applicable',
|
|
646
|
+
'Grant',
|
|
647
|
+
'Abort',
|
|
648
|
+
'Database',
|
|
649
|
+
'Flush Query Logging',
|
|
650
|
+
'Null',
|
|
651
|
+
'Begin/End DBQL',
|
|
652
|
+
'Revoke'
|
|
653
|
+
)
|
|
654
|
+
and h.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
|
|
655
|
+
and h.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
|
|
656
|
+
and h.CollectTimeStamp >= TIMESTAMP '{start_time}'
|
|
657
|
+
and h.DefaultDatabase not in ('DEMONOW_MONITOR')
|
|
658
|
+
{databases_filter_history}
|
|
659
|
+
|
|
660
|
+
UNION
|
|
661
|
+
|
|
662
|
+
SELECT
|
|
663
|
+
s.QueryID as "query_id",
|
|
664
|
+
l.UserName as "user",
|
|
665
|
+
l.StartTime AT TIME ZONE 'GMT' as "timestamp",
|
|
666
|
+
l.DefaultDatabase as default_database,
|
|
667
|
+
s.SqlTextInfo as "query_text",
|
|
668
|
+
s.SqlRowNo as "row_no"
|
|
669
|
+
FROM "DBC".QryLogV as l
|
|
670
|
+
JOIN "DBC".QryLogSqlV as s on s.QueryID = l.QueryID
|
|
671
|
+
WHERE
|
|
672
|
+
l.ErrorCode = 0
|
|
673
|
+
AND l.statementtype not in (
|
|
674
|
+
'Unrecognized type',
|
|
675
|
+
'Create Database/User',
|
|
676
|
+
'Help',
|
|
677
|
+
'Modify Database',
|
|
678
|
+
'Drop Table',
|
|
679
|
+
'Show',
|
|
680
|
+
'Not Applicable',
|
|
681
|
+
'Grant',
|
|
682
|
+
'Abort',
|
|
683
|
+
'Database',
|
|
684
|
+
'Flush Query Logging',
|
|
685
|
+
'Null',
|
|
686
|
+
'Begin/End DBQL',
|
|
687
|
+
'Revoke'
|
|
688
|
+
)
|
|
689
|
+
and l.StartTime AT TIME ZONE 'GMT' >= TIMESTAMP '{start_time}'
|
|
690
|
+
and l.StartTime AT TIME ZONE 'GMT' < TIMESTAMP '{end_time}'
|
|
691
|
+
and s.CollectTimeStamp >= TIMESTAMP '{start_time}'
|
|
692
|
+
and l.DefaultDatabase not in ('DEMONOW_MONITOR')
|
|
693
|
+
{databases_filter}
|
|
694
|
+
) as combined_results
|
|
695
|
+
ORDER BY "timestamp", "query_id", "row_no"
|
|
696
|
+
""".strip()
|
|
697
|
+
|
|
698
|
+
TABLES_AND_VIEWS_QUERY: str = f"""
|
|
507
699
|
SELECT
|
|
508
700
|
t.DataBaseName,
|
|
509
701
|
t.TableName as name,
|
|
@@ -521,77 +713,52 @@ SELECT
|
|
|
521
713
|
t.LastAlterTimeStamp,
|
|
522
714
|
t.RequestText
|
|
523
715
|
FROM dbc.TablesV t
|
|
524
|
-
WHERE DataBaseName NOT IN (
|
|
525
|
-
'All',
|
|
526
|
-
'Crashdumps',
|
|
527
|
-
'Default',
|
|
528
|
-
'DemoNow_Monitor',
|
|
529
|
-
'EXTUSER',
|
|
530
|
-
'External_AP',
|
|
531
|
-
'GLOBAL_FUNCTIONS',
|
|
532
|
-
'LockLogShredder',
|
|
533
|
-
'PUBLIC',
|
|
534
|
-
'SQLJ',
|
|
535
|
-
'SYSBAR',
|
|
536
|
-
'SYSJDBC',
|
|
537
|
-
'SYSLIB',
|
|
538
|
-
'SYSSPATIAL',
|
|
539
|
-
'SYSUDTLIB',
|
|
540
|
-
'SYSUIF',
|
|
541
|
-
'SysAdmin',
|
|
542
|
-
'Sys_Calendar',
|
|
543
|
-
'SystemFe',
|
|
544
|
-
'TDBCMgmt',
|
|
545
|
-
'TDMaps',
|
|
546
|
-
'TDPUSER',
|
|
547
|
-
'TDQCD',
|
|
548
|
-
'TDStats',
|
|
549
|
-
'TD_ANALYTICS_DB',
|
|
550
|
-
'TD_SERVER_DB',
|
|
551
|
-
'TD_SYSFNLIB',
|
|
552
|
-
'TD_SYSGPL',
|
|
553
|
-
'TD_SYSXML',
|
|
554
|
-
'TDaaS_BAR',
|
|
555
|
-
'TDaaS_DB',
|
|
556
|
-
'TDaaS_Maint',
|
|
557
|
-
'TDaaS_Monitor',
|
|
558
|
-
'TDaaS_Support',
|
|
559
|
-
'TDaaS_TDBCMgmt1',
|
|
560
|
-
'TDaaS_TDBCMgmt2',
|
|
561
|
-
'dbcmngr',
|
|
562
|
-
'mldb',
|
|
563
|
-
'system',
|
|
564
|
-
'tapidb',
|
|
565
|
-
'tdwm',
|
|
566
|
-
'val',
|
|
567
|
-
'dbc'
|
|
568
|
-
)
|
|
716
|
+
WHERE DataBaseName NOT IN ({",".join([f"'{db}'" for db in EXCLUDED_DATABASES])})
|
|
569
717
|
AND t.TableKind in ('T', 'V', 'Q', 'O')
|
|
570
718
|
ORDER by DataBaseName, TableName;
|
|
571
719
|
""".strip()
|
|
572
720
|
|
|
573
721
|
_tables_cache: MutableMapping[str, List[TeradataTable]] = defaultdict(list)
|
|
722
|
+
_tables_cache_lock = Lock() # Protect shared cache from concurrent access
|
|
723
|
+
_pooled_engine: Optional[Engine] = None # Reusable pooled engine
|
|
724
|
+
_pooled_engine_lock = Lock() # Protect engine creation
|
|
574
725
|
|
|
575
726
|
def __init__(self, config: TeradataConfig, ctx: PipelineContext):
|
|
576
727
|
super().__init__(config, ctx, "teradata")
|
|
577
728
|
|
|
578
729
|
self.report: TeradataReport = TeradataReport()
|
|
579
730
|
self.graph: Optional[DataHubGraph] = ctx.graph
|
|
731
|
+
self._report_lock = Lock() # Thread safety for report counters
|
|
732
|
+
|
|
733
|
+
self.schema_resolver = self._init_schema_resolver()
|
|
580
734
|
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
735
|
+
# Initialize SqlParsingAggregator for modern lineage processing
|
|
736
|
+
logger.info("Initializing SqlParsingAggregator for enhanced lineage processing")
|
|
737
|
+
self.aggregator = SqlParsingAggregator(
|
|
738
|
+
platform="teradata",
|
|
739
|
+
platform_instance=self.config.platform_instance,
|
|
740
|
+
env=self.config.env,
|
|
741
|
+
schema_resolver=self.schema_resolver,
|
|
742
|
+
graph=self.ctx.graph,
|
|
743
|
+
generate_lineage=self.config.include_view_lineage
|
|
744
|
+
or self.config.include_table_lineage,
|
|
745
|
+
generate_queries=self.config.include_queries,
|
|
586
746
|
generate_usage_statistics=self.config.include_usage_statistics,
|
|
587
|
-
|
|
747
|
+
generate_query_usage_statistics=self.config.include_usage_statistics,
|
|
748
|
+
generate_operations=self.config.usage.include_operational_stats
|
|
749
|
+
if self.config.include_usage_statistics
|
|
750
|
+
else False,
|
|
751
|
+
usage_config=self.config.usage
|
|
752
|
+
if self.config.include_usage_statistics
|
|
753
|
+
else None,
|
|
754
|
+
eager_graph_load=False,
|
|
588
755
|
)
|
|
589
|
-
|
|
590
|
-
self.schema_resolver = self._init_schema_resolver()
|
|
756
|
+
self.report.sql_aggregator = self.aggregator.report
|
|
591
757
|
|
|
592
758
|
if self.config.include_tables or self.config.include_views:
|
|
593
|
-
self.
|
|
594
|
-
|
|
759
|
+
with self.report.new_stage("Table and view discovery"):
|
|
760
|
+
self.cache_tables_and_views()
|
|
761
|
+
logger.info(f"Found {len(self._tables_cache)} tables and views")
|
|
595
762
|
setattr(self, "loop_tables", self.cached_loop_tables) # noqa: B010
|
|
596
763
|
setattr(self, "loop_views", self.cached_loop_views) # noqa: B010
|
|
597
764
|
setattr( # noqa: B010
|
|
@@ -721,6 +888,8 @@ ORDER by DataBaseName, TableName;
|
|
|
721
888
|
|
|
722
889
|
logger.debug(f"sql_alchemy_url={url}")
|
|
723
890
|
engine = create_engine(url, **self.config.options)
|
|
891
|
+
|
|
892
|
+
# Get list of databases first
|
|
724
893
|
with engine.connect() as conn:
|
|
725
894
|
inspector = inspect(conn)
|
|
726
895
|
if self.config.database and self.config.database != "":
|
|
@@ -729,13 +898,14 @@ ORDER by DataBaseName, TableName;
|
|
|
729
898
|
databases = self.config.databases
|
|
730
899
|
else:
|
|
731
900
|
databases = inspector.get_schema_names()
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
901
|
+
|
|
902
|
+
# Create separate connections for each database to avoid connection lifecycle issues
|
|
903
|
+
for db in databases:
|
|
904
|
+
if self.config.database_pattern.allowed(db):
|
|
905
|
+
with engine.connect() as conn:
|
|
906
|
+
db_inspector = inspect(conn)
|
|
907
|
+
db_inspector._datahub_database = db
|
|
908
|
+
yield db_inspector
|
|
739
909
|
|
|
740
910
|
def get_db_name(self, inspector: Inspector) -> str:
|
|
741
911
|
if hasattr(inspector, "_datahub_database"):
|
|
@@ -753,14 +923,15 @@ ORDER by DataBaseName, TableName;
|
|
|
753
923
|
inspector: Inspector,
|
|
754
924
|
schema: str,
|
|
755
925
|
sql_config: SQLCommonConfig,
|
|
756
|
-
) -> Iterable[
|
|
926
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
757
927
|
setattr( # noqa: B010
|
|
758
928
|
inspector,
|
|
759
929
|
"get_table_names",
|
|
760
930
|
lambda schema: [
|
|
761
931
|
i.name
|
|
762
932
|
for i in filter(
|
|
763
|
-
lambda t: t.object_type != "View",
|
|
933
|
+
lambda t: t.object_type != "View",
|
|
934
|
+
self._tables_cache.get(schema, []),
|
|
764
935
|
)
|
|
765
936
|
],
|
|
766
937
|
)
|
|
@@ -776,7 +947,8 @@ ORDER by DataBaseName, TableName;
|
|
|
776
947
|
# this method and provide a location.
|
|
777
948
|
location: Optional[str] = None
|
|
778
949
|
|
|
779
|
-
|
|
950
|
+
cache_entries = self._tables_cache.get(schema, [])
|
|
951
|
+
for entry in cache_entries:
|
|
780
952
|
if entry.name == table:
|
|
781
953
|
description = entry.description
|
|
782
954
|
if entry.object_type == "View" and entry.request_text:
|
|
@@ -789,123 +961,734 @@ ORDER by DataBaseName, TableName;
|
|
|
789
961
|
inspector: Inspector,
|
|
790
962
|
schema: str,
|
|
791
963
|
sql_config: SQLCommonConfig,
|
|
792
|
-
) -> Iterable[
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
964
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
965
|
+
start_time = time.time()
|
|
966
|
+
|
|
967
|
+
# Get view names from cache
|
|
968
|
+
view_names = [
|
|
969
|
+
i.name
|
|
970
|
+
for i in filter(
|
|
971
|
+
lambda t: t.object_type == "View", self._tables_cache.get(schema, [])
|
|
972
|
+
)
|
|
973
|
+
]
|
|
974
|
+
actual_view_count = len(view_names)
|
|
975
|
+
|
|
976
|
+
if actual_view_count == 0:
|
|
977
|
+
end_time = time.time()
|
|
978
|
+
processing_time = end_time - start_time
|
|
979
|
+
logger.info(
|
|
980
|
+
f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds (0 views, 0 work units)"
|
|
981
|
+
)
|
|
982
|
+
return
|
|
983
|
+
|
|
984
|
+
# Use custom threading implementation with connection pooling
|
|
985
|
+
work_unit_count = 0
|
|
986
|
+
|
|
987
|
+
for work_unit in self._loop_views_with_connection_pool(
|
|
988
|
+
view_names, schema, sql_config
|
|
989
|
+
):
|
|
990
|
+
work_unit_count += 1
|
|
991
|
+
yield work_unit
|
|
992
|
+
|
|
993
|
+
end_time = time.time()
|
|
994
|
+
processing_time = end_time - start_time
|
|
995
|
+
|
|
996
|
+
logger.info(
|
|
997
|
+
f"View processing for schema '{schema}' completed in {processing_time:.2f} seconds ({actual_view_count} views, {work_unit_count} work units)"
|
|
998
|
+
)
|
|
999
|
+
|
|
1000
|
+
# Update report timing metrics
|
|
1001
|
+
if hasattr(self, "report"):
|
|
1002
|
+
self.report.view_extraction_total_time_seconds += processing_time
|
|
1003
|
+
self.report.num_views_processed += actual_view_count
|
|
1004
|
+
|
|
1005
|
+
# Track slowest view processing at view level (will be updated by individual view processing)
|
|
1006
|
+
# Note: slowest_view_name now tracks individual views, not schemas
|
|
1007
|
+
|
|
1008
|
+
# Calculate average processing time per view
|
|
1009
|
+
if self.report.num_views_processed > 0:
|
|
1010
|
+
self.report.view_extraction_average_time_seconds = (
|
|
1011
|
+
self.report.view_extraction_total_time_seconds
|
|
1012
|
+
/ self.report.num_views_processed
|
|
800
1013
|
)
|
|
801
|
-
|
|
1014
|
+
|
|
1015
|
+
def _loop_views_with_connection_pool(
|
|
1016
|
+
self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
|
|
1017
|
+
) -> Iterable[Union[MetadataWorkUnit, Any]]:
|
|
1018
|
+
"""
|
|
1019
|
+
Process views using individual database connections per thread for true parallelization.
|
|
1020
|
+
|
|
1021
|
+
Each thread gets its own connection from a QueuePool, enabling true concurrent processing.
|
|
1022
|
+
"""
|
|
1023
|
+
if self.config.max_workers == 1:
|
|
1024
|
+
# Single-threaded processing - no need for complexity
|
|
1025
|
+
yield from self._process_views_single_threaded(
|
|
1026
|
+
view_names, schema, sql_config
|
|
1027
|
+
)
|
|
1028
|
+
return
|
|
1029
|
+
|
|
1030
|
+
logger.info(
|
|
1031
|
+            f"Processing {len(view_names)} views with {self.config.max_workers} worker threads"
         )
-        yield from super().loop_views(inspector, schema, sql_config)

-
+        # Get or create reusable pooled engine
+        engine = self._get_or_create_pooled_engine()
+
+        try:
+            # Thread-safe result collection
+            report_lock = Lock()
+
+            def process_single_view(
+                view_name: str,
+            ) -> List[Union[MetadataWorkUnit, Any]]:
+                """Process a single view with its own database connection."""
+                results: List[Union[MetadataWorkUnit, Any]] = []
+
+                # Detailed timing measurements for bottleneck analysis
+                timings = {
+                    "connection_acquire": 0.0,
+                    "view_processing": 0.0,
+                    "work_unit_generation": 0.0,
+                    "total": 0.0,
+                }
+
+                total_start = time.time()
+                try:
+                    # Measure connection acquisition time
+                    conn_start = time.time()
+                    with engine.connect() as conn:
+                        timings["connection_acquire"] = time.time() - conn_start
+
+                        # Update connection pool metrics
+                        with report_lock:
+                            pool_wait_time = timings["connection_acquire"]
+                            self.report.connection_pool_wait_time_seconds += (
+                                pool_wait_time
+                            )
+                            if (
+                                pool_wait_time
+                                > self.report.connection_pool_max_wait_time_seconds
+                            ):
+                                self.report.connection_pool_max_wait_time_seconds = (
+                                    pool_wait_time
+                                )
+
+                        # Measure view processing setup
+                        processing_start = time.time()
+                        thread_inspector = inspect(conn)
+                        # Inherit database information for Teradata two-tier architecture
+                        thread_inspector._datahub_database = schema  # type: ignore
+
+                        dataset_name = self.get_identifier(
+                            schema=schema, entity=view_name, inspector=thread_inspector
+                        )
+
+                        # Thread-safe reporting
+                        with report_lock:
+                            self.report.report_entity_scanned(
+                                dataset_name, ent_type="view"
+                            )
+
+                        if not sql_config.view_pattern.allowed(dataset_name):
+                            with report_lock:
+                                self.report.report_dropped(dataset_name)
+                            return results
+
+                        timings["view_processing"] = time.time() - processing_start
+
+                        # Measure work unit generation
+                        wu_start = time.time()
+                        for work_unit in self._process_view(
+                            dataset_name=dataset_name,
+                            inspector=thread_inspector,
+                            schema=schema,
+                            view=view_name,
+                            sql_config=sql_config,
+                        ):
+                            results.append(work_unit)
+                        timings["work_unit_generation"] = time.time() - wu_start
+
+                        # Track individual view timing
+                        timings["total"] = time.time() - total_start
+
+                        with report_lock:
+                            self.report.slowest_view_name[f"{schema}.{view_name}"] = (
+                                timings["total"]
+                            )
+
+                except Exception as e:
+                    with report_lock:
+                        self.report.num_view_processing_failures += 1
+                        # Log full exception details for debugging
+                        import traceback
+
+                        full_traceback = traceback.format_exc()
+                        logger.error(
+                            f"Failed to process view {schema}.{view_name}: {str(e)}"
+                        )
+                        logger.error(f"Full traceback: {full_traceback}")
+                        self.report.warning(
+                            f"Error processing view {schema}.{view_name}",
+                            context=f"View: {schema}.{view_name}, Error: {str(e)}",
+                            exc=e,
+                        )
+
+                return results
+
+            # Use ThreadPoolExecutor for concurrent processing
+            with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+                # Submit all view processing tasks
+                future_to_view = {
+                    executor.submit(process_single_view, view_name): view_name
+                    for view_name in view_names
+                }
+
+                # Process completed tasks as they finish
+                for future in as_completed(future_to_view):
+                    view_name = future_to_view[future]
+                    try:
+                        results = future.result()
+                        # Yield all results from this view
+                        for result in results:
+                            yield result
+                    except Exception as e:
+                        with report_lock:
+                            self.report.warning(
+                                "Error in thread processing view",
+                                context=f"{schema}.{view_name}",
+                                exc=e,
+                            )
+
+        finally:
+            # Don't dispose the reusable engine here - it will be cleaned up in close()
+            pass
+
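Editorial note: the hunk above replaces the inherited sequential view loop with a thread pool in which each view is processed on its own pooled connection and the resulting work units are yielded as futures complete. Below is a minimal, self-contained sketch of that fan-out/yield pattern; the names (`process_items_concurrently`, `process_one`) are illustrative placeholders and no DataHub or Teradata APIs are assumed.

```python
# Sketch only: generic ThreadPoolExecutor fan-out with lock-protected shared bookkeeping.
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Dict, Iterable, List


def process_items_concurrently(items: List[str], max_workers: int = 4) -> Iterable[str]:
    """Fan work out to a thread pool and yield results as each future completes."""
    timings: Dict[str, float] = {}
    report_lock = Lock()  # protects shared state touched by worker threads

    def process_one(item: str) -> List[str]:
        start = time.time()
        results = [f"processed:{item}"]  # stand-in for per-item work units
        with report_lock:
            timings[item] = time.time() - start
        return results

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_item = {executor.submit(process_one, item): item for item in items}
        for future in as_completed(future_to_item):
            try:
                yield from future.result()
            except Exception as exc:  # one failed item should not stop the rest
                print(f"failed on {future_to_item[future]}: {exc}")


if __name__ == "__main__":
    print(list(process_items_concurrently(["a", "b", "c"])))
```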
+    def _process_views_single_threaded(
+        self, view_names: List[str], schema: str, sql_config: SQLCommonConfig
+    ) -> Iterable[Union[MetadataWorkUnit, Any]]:
+        """Process views sequentially with a single connection."""
         engine = self.get_metadata_engine()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        try:
+            with engine.connect() as conn:
+                inspector = inspect(conn)
+
+                for view_name in view_names:
+                    view_start_time = time.time()
+                    try:
+                        dataset_name = self.get_identifier(
+                            schema=schema, entity=view_name, inspector=inspector
+                        )
+
+                        self.report.report_entity_scanned(dataset_name, ent_type="view")
+
+                        if not sql_config.view_pattern.allowed(dataset_name):
+                            self.report.report_dropped(dataset_name)
+                            continue
+
+                        # Process the view and yield results
+                        for work_unit in self._process_view(
+                            dataset_name=dataset_name,
+                            inspector=inspector,
+                            schema=schema,
+                            view=view_name,
+                            sql_config=sql_config,
+                        ):
+                            yield work_unit
+
+                        # Track individual view timing
+                        view_end_time = time.time()
+                        view_processing_time = view_end_time - view_start_time
+                        self.report.slowest_view_name[f"{schema}.{view_name}"] = (
+                            view_processing_time
+                        )
+
+                    except Exception as e:
+                        # Log full exception details for debugging
+                        import traceback
+
+                        full_traceback = traceback.format_exc()
+                        logger.error(
+                            f"Failed to process view {schema}.{view_name}: {str(e)}"
+                        )
+                        logger.error(f"Full traceback: {full_traceback}")
+                        self.report.warning(
+                            f"Error processing view {schema}.{view_name}",
+                            context=f"View: {schema}.{view_name}, Error: {str(e)}",
+                            exc=e,
+                        )
+
+        finally:
+            engine.dispose()
+
+    def _get_or_create_pooled_engine(self) -> Engine:
+        """Get or create a reusable SQLAlchemy engine with QueuePool for concurrent connections."""
+        with self._pooled_engine_lock:
+            if self._pooled_engine is None:
+                url = self.config.get_sql_alchemy_url()
+
+                # Optimal connection pool sizing to match max_workers exactly
+                # Teradata driver can be sensitive to high connection counts, so cap at reasonable limit
+                max_safe_connections = (
+                    13  # Conservative limit: 8 base + 5 overflow for Teradata stability
+                )
+
+                # Adjust max_workers to match available connection pool capacity
+                effective_max_workers = min(
+                    self.config.max_workers, max_safe_connections
+                )
+
+                # Set pool size to match effective workers for optimal performance
+                base_connections = min(
+                    effective_max_workers, 8
+                )  # Reasonable base connections
+                max_overflow = (
+                    effective_max_workers - base_connections
+                )  # Remaining as overflow
+
+                # Log adjustment if max_workers was reduced
+                if effective_max_workers < self.config.max_workers:
+                    logger.warning(
+                        f"Reduced max_workers from {self.config.max_workers} to {effective_max_workers} to match Teradata connection pool capacity"
+                    )
+
+                # Update the config to reflect the effective value used
+                self.config.max_workers = effective_max_workers
+
+                pool_options = {
+                    **self.config.options,
+                    "poolclass": QueuePool,
+                    "pool_size": base_connections,
+                    "max_overflow": max_overflow,
+                    "pool_pre_ping": True,  # Validate connections
+                    "pool_recycle": 1800,  # Recycle connections after 30 mins (more frequent)
+                    "pool_timeout": 60,  # Longer timeout for connection acquisition
+                    "pool_reset_on_return": "rollback",  # Explicit rollback on connection return
+                }
+
+                # Add Teradata-specific connection options for stability
+                if "connect_args" not in pool_options:
+                    pool_options["connect_args"] = {}
+
+                # Teradata-specific connection arguments for better stability
+                pool_options["connect_args"].update(
+                    {
+                        "connect_timeout": "30000",  # Connection timeout in ms (30 seconds)
+                        "request_timeout": "120000",  # Request timeout in ms (2 minutes)
+                    }
+                )
+
+                self._pooled_engine = create_engine(url, **pool_options)
+                logger.info(
+                    f"Created optimized Teradata connection pool: {base_connections} base + {max_overflow} overflow = {base_connections + max_overflow} max connections (matching {effective_max_workers} workers)"
+                )
+
+            return self._pooled_engine
+
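Editorial note: `_get_or_create_pooled_engine` above sizes a shared SQLAlchemy QueuePool so that base plus overflow connections track the effective worker count. A rough sketch of that sizing follows; `make_pooled_engine`, the placeholder URL, and the 8-connection base are illustrative assumptions, not the source's exact policy (the Teradata-specific connect_args are omitted).

```python
# Sketch only: a QueuePool-backed engine sized to a worker count.
from sqlalchemy import create_engine, text
from sqlalchemy.pool import QueuePool


def make_pooled_engine(url: str, max_workers: int):
    base = min(max_workers, 8)      # persistent connections kept open in the pool
    overflow = max_workers - base   # extra connections allowed under peak load
    return create_engine(
        url,
        poolclass=QueuePool,
        pool_size=base,
        max_overflow=overflow,
        pool_pre_ping=True,   # validate a connection before handing it out
        pool_recycle=1800,    # retire connections after 30 minutes
        pool_timeout=60,      # seconds to wait for a free connection
    )


if __name__ == "__main__":
    engine = make_pooled_engine("sqlite://", max_workers=4)  # placeholder URL
    with engine.connect() as conn:
        print(conn.execute(text("select 1")).scalar())
```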
+    def cache_tables_and_views(self) -> None:
+        with self.report.new_stage("Cache tables and views"):
+            engine = self.get_metadata_engine()
+            try:
+                database_counts: Dict[str, Dict[str, int]] = defaultdict(
+                    lambda: {"tables": 0, "views": 0}
+                )
+
+                for entry in engine.execute(self.TABLES_AND_VIEWS_QUERY):
+                    table = TeradataTable(
+                        database=entry.DataBaseName.strip(),
+                        name=entry.name.strip(),
+                        description=entry.description.strip()
+                        if entry.description
+                        else None,
+                        object_type=entry.object_type,
+                        create_timestamp=entry.CreateTimeStamp,
+                        last_alter_name=entry.LastAlterName,
+                        last_alter_timestamp=entry.LastAlterTimeStamp,
+                        request_text=(
+                            entry.RequestText.strip()
+                            if entry.object_type == "View" and entry.RequestText
+                            else None
+                        ),
+                    )
+
+                    # Count objects per database for metrics
+                    if table.object_type == "View":
+                        database_counts[table.database]["views"] += 1
+                    else:
+                        database_counts[table.database]["tables"] += 1
+
+                    with self._tables_cache_lock:
+                        if table.database not in self._tables_cache:
+                            self._tables_cache[table.database] = []
+                        self._tables_cache[table.database].append(table)
+
+                for database, counts in database_counts.items():
+                    self.report.num_database_tables_to_scan[database] = counts["tables"]
+                    self.report.num_database_views_to_scan[database] = counts["views"]
+
+            finally:
+                engine.dispose()
+
+    def _reconstruct_queries_streaming(
+        self, entries: Iterable[Any]
+    ) -> Iterable[ObservedQuery]:
+        """Reconstruct complete queries from database entries in streaming fashion.
+
+        This method processes entries in order and reconstructs multi-row queries
+        by concatenating rows with the same query_id.
+        """
+        current_query_id = None
+        current_query_parts = []
+        current_query_metadata = None
+
+        for entry in entries:
+            # Count each audit query entry processed
+            self.report.num_audit_query_entries_processed += 1
+
+            query_id = getattr(entry, "query_id", None)
+            query_text = str(getattr(entry, "query_text", ""))
+
+            if query_id != current_query_id:
+                # New query started - yield the previous one if it exists
+                if current_query_id is not None and current_query_parts:
+                    yield self._create_observed_query_from_parts(
+                        current_query_parts, current_query_metadata
+                    )
+
+                # Start new query
+                current_query_id = query_id
+                current_query_parts = [query_text] if query_text else []
+                current_query_metadata = entry
+            else:
+                # Same query - append the text
+                if query_text:
+                    current_query_parts.append(query_text)
+
+        # Yield the last query if it exists
+        if current_query_id is not None and current_query_parts:
+            yield self._create_observed_query_from_parts(
+                current_query_parts, current_query_metadata
             )
-                if table.database not in self._tables_cache:
-                    self._tables_cache[table.database] = []

-
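Editorial note: `_reconstruct_queries_streaming` above stitches multi-row audit-log fragments back into whole queries while holding only one query in memory at a time. A minimal sketch of that contiguous-grouping pattern is shown below; `reconstruct_queries` and the tuple-shaped input are illustrative assumptions, not the source's actual row type.

```python
# Sketch only: group adjacent (query_id, fragment) rows into complete query texts.
from typing import Iterable, Iterator, List, Tuple


def reconstruct_queries(rows: Iterable[Tuple[int, str]]) -> Iterator[str]:
    """Assumes fragments of one query are adjacent, so memory use stays constant."""
    current_id = None
    parts: List[str] = []
    for query_id, fragment in rows:
        if query_id != current_id:
            if current_id is not None and parts:
                yield "".join(parts)  # previous query is complete
            current_id = query_id
            parts = [fragment] if fragment else []
        elif fragment:
            parts.append(fragment)
    if current_id is not None and parts:
        yield "".join(parts)  # flush the final query


if __name__ == "__main__":
    rows = [(1, "SELECT * FROM t WHERE "), (1, "x = 1"), (2, "SELECT 1")]
    print(list(reconstruct_queries(rows)))  # ['SELECT * FROM t WHERE x = 1', 'SELECT 1']
```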
+    def _create_observed_query_from_parts(
+        self, query_parts: List[str], metadata_entry: Any
+    ) -> ObservedQuery:
+        """Create ObservedQuery from reconstructed query parts and metadata."""
+        # Join all parts to form the complete query
+        # Teradata fragments are split at fixed lengths without artificial breaks
+        full_query_text = "".join(query_parts)
+
+        # Extract metadata
+        session_id = getattr(metadata_entry, "session_id", None)
+        timestamp = getattr(metadata_entry, "timestamp", None)
+        user = getattr(metadata_entry, "user", None)
+        default_database = getattr(metadata_entry, "default_database", None)
+
+        # Apply Teradata-specific query transformations
+        cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", full_query_text)
+
+        # For Teradata's two-tier architecture (database.table), we should not set default_db
+        # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+        # The SQL parser will treat database.table references correctly without default_db
+        return ObservedQuery(
+            query=cleaned_query,
+            session_id=session_id,
+            timestamp=timestamp,
+            user=CorpUserUrn(user) if user else None,
+            default_db=DEFAULT_NO_DATABASE_TERADATA,  # Teradata uses two-tier database.table naming without default database prefixing
+            default_schema=default_database,
+        )
+
+    def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
+        """Convert database query entry to ObservedQuery for SqlParsingAggregator.
+
+        DEPRECATED: This method is deprecated in favor of _reconstruct_queries_streaming
+        which properly handles multi-row queries. This method does not handle queries
+        that span multiple rows correctly and should not be used.
+        """
+        # Extract fields from database result
+        query_text = str(entry.query_text).strip()
+        session_id = getattr(entry, "session_id", None)
+        timestamp = getattr(entry, "timestamp", None)
+        user = getattr(entry, "user", None)
+        default_database = getattr(entry, "default_database", None)
+
+        # Apply Teradata-specific query transformations
+        cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", query_text)
+
+        # For Teradata's two-tier architecture (database.table), we should not set default_db
+        # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+        # However, we should set default_schema for unqualified table references
+        return ObservedQuery(
+            query=cleaned_query,
+            session_id=session_id,
+            timestamp=timestamp,
+            user=CorpUserUrn(user) if user else None,
+            default_db=DEFAULT_NO_DATABASE_TERADATA,  # Teradata uses two-tier database.table naming without default database prefixing
+            default_schema=default_database,  # Set default_schema for unqualified table references
+        )
+
+    def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
+        """Fetch lineage entries using server-side cursor to handle large result sets efficiently."""
+        queries = self._make_lineage_queries()
+
+        fetch_engine = self.get_metadata_engine()
+        try:
+            with fetch_engine.connect() as conn:
+                cursor_type = (
+                    "server-side"
+                    if self.config.use_server_side_cursors
+                    else "client-side"
+                )
+
+                total_count_all_queries = 0
+
+                for query_index, query in enumerate(queries, 1):
+                    logger.info(
+                        f"Executing lineage query {query_index}/{len(queries)} for time range {self.config.start_time} to {self.config.end_time} with {cursor_type} cursor..."
+                    )
+
+                    # Use helper method to try server-side cursor with fallback
+                    result = self._execute_with_cursor_fallback(conn, query)
+
+                    # Stream results in batches to avoid memory issues
+                    batch_size = 5000
+                    batch_count = 0
+                    query_total_count = 0
+
+                    while True:
+                        # Fetch a batch of rows
+                        batch = result.fetchmany(batch_size)
+                        if not batch:
+                            break

-
+                        batch_count += 1
+                        query_total_count += len(batch)
+                        total_count_all_queries += len(batch)
+
+                        logger.info(
+                            f"Query {query_index} - Fetched batch {batch_count}: {len(batch)} lineage entries (query total: {query_total_count})"
+                        )
+                        yield from batch
+
+                    logger.info(
+                        f"Completed query {query_index}: {query_total_count} lineage entries in {batch_count} batches"
+                    )
+
+                logger.info(
+                    f"Completed fetching all queries: {total_count_all_queries} total lineage entries from {len(queries)} queries"
+                )
+
+        except Exception as e:
+            logger.error(f"Error fetching lineage entries: {e}")
+            raise
+        finally:
+            fetch_engine.dispose()
+
|
+
def _check_historical_table_exists(self) -> bool:
|
|
1489
|
+
"""
|
|
1490
|
+
Check if the PDCRINFO.DBQLSqlTbl_Hst table exists and is accessible.
|
|
1491
|
+
DBQL rows are periodically moved to history table and audit queries might not exist in DBC already.
|
|
1492
|
+
There is not guarantee that the historical table exists, so we need to check it.
|
|
1493
|
+
|
|
1494
|
+
Returns:
|
|
1495
|
+
bool: True if the historical table exists and is accessible, False otherwise.
|
|
1496
|
+
"""
|
|
828
1497
|
engine = self.get_metadata_engine()
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
1498
|
+
try:
|
|
1499
|
+
# Use a simple query to check if the table exists and is accessible
|
|
1500
|
+
check_query = """
|
|
1501
|
+
SELECT TOP 1 QueryID
|
|
1502
|
+
FROM PDCRINFO.DBQLSqlTbl_Hst
|
|
1503
|
+
WHERE 1=0
|
|
1504
|
+
"""
|
|
1505
|
+
with engine.connect() as conn:
|
|
1506
|
+
conn.execute(text(check_query))
|
|
1507
|
+
logger.info(
|
|
1508
|
+
"Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is available"
|
|
1509
|
+
)
|
|
1510
|
+
return True
|
|
1511
|
+
except Exception as e:
|
|
1512
|
+
logger.info(
|
|
1513
|
+
f"Historical lineage table PDCRINFO.DBQLSqlTbl_Hst is not available: {e}"
|
|
840
1514
|
)
|
|
1515
|
+
return False
|
|
1516
|
+
finally:
|
|
1517
|
+
engine.dispose()
|
|
841
1518
|
|
|
842
|
-
def
|
|
1519
|
+
def _make_lineage_queries(self) -> List[str]:
|
|
843
1520
|
databases_filter = (
|
|
844
1521
|
""
|
|
845
1522
|
if not self.config.databases
|
|
846
|
-
else "and
|
|
1523
|
+
else "and l.DefaultDatabase in ({databases})".format(
|
|
847
1524
|
databases=",".join([f"'{db}'" for db in self.config.databases])
|
|
848
1525
|
)
|
|
849
1526
|
)
|
|
850
1527
|
|
|
851
|
-
|
|
852
|
-
start_time=self.config.start_time,
|
|
853
|
-
end_time=self.config.end_time,
|
|
854
|
-
databases_filter=databases_filter,
|
|
855
|
-
)
|
|
856
|
-
return query
|
|
1528
|
+
queries = []
|
|
857
1529
|
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
f"Error parsing table lineage ({view_urn}):\n{result.debug_info.table_error}"
|
|
1530
|
+
# Check if historical lineage is configured and available
|
|
1531
|
+
if (
|
|
1532
|
+
self.config.include_historical_lineage
|
|
1533
|
+
and self._check_historical_table_exists()
|
|
1534
|
+
):
|
|
1535
|
+
logger.info(
|
|
1536
|
+
"Using UNION query to combine historical and current lineage data to avoid duplicates"
|
|
1537
|
+
)
|
|
1538
|
+
# For historical query, we need the database filter for historical part
|
|
1539
|
+
databases_filter_history = (
|
|
1540
|
+
databases_filter.replace("l.DefaultDatabase", "h.DefaultDatabase")
|
|
1541
|
+
if databases_filter
|
|
1542
|
+
else ""
|
|
1543
|
+
)
|
|
1544
|
+
|
|
1545
|
+
union_query = self.QUERY_TEXT_HISTORICAL_UNION.format(
|
|
1546
|
+
start_time=self.config.start_time,
|
|
1547
|
+
end_time=self.config.end_time,
|
|
1548
|
+
databases_filter=databases_filter,
|
|
1549
|
+
databases_filter_history=databases_filter_history,
|
|
879
1550
|
)
|
|
880
|
-
|
|
1551
|
+
queries.append(union_query)
|
|
881
1552
|
else:
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
1553
|
+
if self.config.include_historical_lineage:
|
|
1554
|
+
logger.warning(
|
|
1555
|
+
"Historical lineage was requested but PDCRINFO.DBQLSqlTbl_Hst table is not available. Falling back to current data only."
|
|
1556
|
+
)
|
|
1557
|
+
|
|
1558
|
+
# Use current-only query when historical data is not available
|
|
1559
|
+
current_query = self.QUERY_TEXT_CURRENT_QUERIES.format(
|
|
1560
|
+
start_time=self.config.start_time,
|
|
1561
|
+
end_time=self.config.end_time,
|
|
1562
|
+
databases_filter=databases_filter,
|
|
889
1563
|
)
|
|
1564
|
+
queries.append(current_query)
|
|
1565
|
+
|
|
1566
|
+
return queries
|
|
890
1567
|
|
|
891
1568
|
def get_metadata_engine(self) -> Engine:
|
|
892
1569
|
url = self.config.get_sql_alchemy_url()
|
|
893
1570
|
logger.debug(f"sql_alchemy_url={url}")
|
|
894
1571
|
return create_engine(url, **self.config.options)
|
|
895
1572
|
|
|
896
|
-
def
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
1573
|
+
def _execute_with_cursor_fallback(
|
|
1574
|
+
self, connection: Connection, query: str, params: Optional[Dict] = None
|
|
1575
|
+
) -> Any:
|
|
1576
|
+
"""
|
|
1577
|
+
Execute query with server-side cursor if enabled and supported, otherwise fall back to regular execution.
|
|
1578
|
+
|
|
1579
|
+
Args:
|
|
1580
|
+
connection: Database connection
|
|
1581
|
+
query: SQL query to execute
|
|
1582
|
+
params: Query parameters
|
|
1583
|
+
|
|
1584
|
+
Returns:
|
|
1585
|
+
Query result object
|
|
1586
|
+
"""
|
|
1587
|
+
if self.config.use_server_side_cursors:
|
|
1588
|
+
try:
|
|
1589
|
+
# Try server-side cursor first
|
|
1590
|
+
if params:
|
|
1591
|
+
result = connection.execution_options(stream_results=True).execute(
|
|
1592
|
+
text(query), params
|
|
1593
|
+
)
|
|
1594
|
+
else:
|
|
1595
|
+
result = connection.execution_options(stream_results=True).execute(
|
|
1596
|
+
text(query)
|
|
1597
|
+
)
|
|
1598
|
+
|
|
1599
|
+
logger.debug(
|
|
1600
|
+
"Successfully using server-side cursor for query execution"
|
|
1601
|
+
)
|
|
1602
|
+
return result
|
|
1603
|
+
|
|
1604
|
+
except Exception as e:
|
|
1605
|
+
logger.warning(
|
|
1606
|
+
f"Server-side cursor failed, falling back to client-side execution: {e}"
|
|
1607
|
+
)
|
|
1608
|
+
# Fall through to regular execution
|
|
1609
|
+
|
|
1610
|
+
# Regular execution (client-side)
|
|
1611
|
+
if params:
|
|
1612
|
+
return connection.execute(text(query), params)
|
|
1613
|
+
else:
|
|
1614
|
+
return connection.execute(text(query))
|
|
1615
|
+
|
|
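Editorial note: `_execute_with_cursor_fallback` above prefers a streaming server-side cursor and silently degrades to a buffered execute when the driver rejects it. A compact sketch of that fallback wrapper is shown below; `execute_streaming_with_fallback` is a hypothetical name and the sketch does not gate on a config flag the way the source does.

```python
# Sketch only: try a streaming cursor, fall back to buffered execution on failure.
import logging
from typing import Any, Dict, Optional

from sqlalchemy import text
from sqlalchemy.engine import Connection

logger = logging.getLogger(__name__)


def execute_streaming_with_fallback(
    conn: Connection, query: str, params: Optional[Dict[str, Any]] = None
) -> Any:
    """Prefer a streaming (server-side) cursor; fall back to a buffered execute."""
    try:
        return conn.execution_options(stream_results=True).execute(
            text(query), params or {}
        )
    except Exception as exc:
        logger.warning("Streaming cursor unavailable, using buffered execution: %s", exc)
        return conn.execute(text(query), params or {})
```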
+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Override to prevent parent class from generating aggregator work units during schema extraction.
+
+        We handle aggregator generation manually after populating it with audit log data.
+        """
+        # Do nothing - we'll call the parent implementation manually after populating the aggregator
+        return iter([])
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Starting Teradata metadata extraction")
+
+        # Step 1: Schema extraction first (parent class will skip aggregator generation due to our override)
+        with self.report.new_stage("Schema metadata extraction"):
+            yield from super().get_workunits_internal()
+            logger.info("Completed schema metadata extraction")
+
+        # Step 2: Lineage extraction after schema extraction
+        # This allows lineage processing to have access to all discovered schema information
+        with self.report.new_stage("Audit log extraction and lineage processing"):
+            self._populate_aggregator_from_audit_logs()
+            # Call parent implementation directly to generate aggregator work units
+            yield from super()._generate_aggregator_workunits()
+            logger.info("Completed lineage processing")
+
+    def _populate_aggregator_from_audit_logs(self) -> None:
+        """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
+        with self.report.new_stage("Lineage extraction from Teradata audit logs"):
+            # Record the lineage query time range in the report
+            self.report.lineage_start_time = self.config.start_time
+            self.report.lineage_end_time = self.config.end_time
+
+            logger.info(
+                f"Starting lineage extraction from Teradata audit logs (time range: {self.config.start_time} to {self.config.end_time})"
+            )
+
+            if (
+                self.config.include_table_lineage
+                or self.config.include_usage_statistics
+            ):
+                # Step 1: Stream query entries from database with memory-efficient processing
+                with self.report.new_stage("Fetching lineage entries from Audit Logs"):
+                    queries_processed = 0
+
+                    # Use streaming query reconstruction for memory efficiency
+                    for observed_query in self._reconstruct_queries_streaming(
+                        self._fetch_lineage_entries_chunked()
+                    ):
+                        self.aggregator.add(observed_query)
+
+                        queries_processed += 1
+                        if queries_processed % 10000 == 0:
+                            logger.info(
+                                f"Processed {queries_processed} queries to aggregator"
+                            )
+
+                    if queries_processed == 0:
+                        logger.info("No lineage entries found")
+                        return
+
+                    logger.info(
+                        f"Completed adding {queries_processed} queries to SqlParsingAggregator"
+                    )
+
+            logger.info("Completed lineage extraction from Teradata audit logs")
+
+    def close(self) -> None:
+        """Clean up resources when source is closed."""
+        logger.info("Closing SqlParsingAggregator")
+        self.aggregator.close()
+
+        # Clean up pooled engine
+        with self._pooled_engine_lock:
+            if self._pooled_engine is not None:
+                logger.info("Disposing pooled engine")
+                self._pooled_engine.dispose()
+                self._pooled_engine = None
+
+        # Report failed views summary
+        super().close()