acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in that public registry.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/sql/trino.py (+11 -1)

@@ -18,6 +18,7 @@ from sqlalchemy.types import TypeEngine
 from trino.sqlalchemy import datatype
 from trino.sqlalchemy.dialect import TrinoDialect
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -36,6 +37,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -221,7 +223,7 @@ class ConnectorDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
 
 class TrinoConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="trino"
+    scheme: HiddenFromDocs[str] = Field(default="trino")
     database: str = Field(description="database (catalog)")
 
     catalog_to_connector_details: Dict[str, ConnectorDetail] = Field(
@@ -249,6 +251,14 @@ class TrinoConfig(BasicSQLAlchemyConfig):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Extract table-level lineage",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
 class TrinoSource(SQLAlchemySource):
     """
 
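
The trino hunks above show two patterns that recur throughout this release: the pydantic `hidden_from_docs=True` field extra becomes a `HiddenFromDocs[...]` type annotation, and `@capability(...)` decorators gain a `subtype_modifier` argument. As a rough, hypothetical sketch of the annotation-marker idea (not the actual implementation of `datahub.configuration.common.HiddenFromDocs`), a `typing.Annotated` alias plus a sentinel object is enough for a docs generator to detect hidden fields:

# Hypothetical sketch of an annotation marker -- NOT the actual implementation of
# datahub.configuration.common.HiddenFromDocs.
from typing import Annotated, List, TypeVar, get_type_hints

T = TypeVar("T")


class _HiddenFromDocsMarker:
    """Sentinel object carried in the Annotated metadata."""


_HIDDEN = _HiddenFromDocsMarker()

# Subscripting substitutes the TypeVar: HiddenFromDocs[str] behaves like Annotated[str, _HIDDEN]
HiddenFromDocs = Annotated[T, _HIDDEN]


class ExampleConfig:
    scheme: HiddenFromDocs[str] = "trino"
    database: str = "default"


def hidden_fields(cls: type) -> List[str]:
    """Names of fields whose annotation carries the hidden marker."""
    hints = get_type_hints(cls, include_extras=True)
    return [
        name
        for name, hint in hints.items()
        if _HIDDEN in getattr(hint, "__metadata__", ())
    ]


print(hidden_fields(ExampleConfig))  # ['scheme']

The same detection idea applies to the documentation generator: fields marked this way stay functional but are skipped when rendering the source's config reference.
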
datahub/ingestion/source/sql/two_tier_sql_source.py (+2 -3)

@@ -7,7 +7,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import URL
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -27,11 +27,10 @@ class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         # The superclass contains a `schema_pattern` field, so we need this here
        # to override the documentation.
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
         description="Deprecated in favour of database_pattern.",
     )
 
datahub/ingestion/source/sql/vertica.py (+14 -7)

@@ -4,7 +4,8 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import pydantic
-
+import pytest
+from pydantic import validator
 from vertica_sqlalchemy_dialect.base import VerticaInspector
 
 from datahub.configuration.common import AllowDenyPattern
@@ -25,6 +26,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -41,7 +46,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -52,6 +56,8 @@ from datahub.utilities import config_clean
 
 if TYPE_CHECKING:
     from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest
+
+pytestmark = pytest.mark.integration_batch_4
 logger: logging.Logger = logging.getLogger(__name__)
 
 
@@ -113,10 +119,14 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default, can be disabled via configuration `include_view_lineage` and `include_projection_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.PROJECTIONS,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):
@@ -493,11 +503,8 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-
-            aspect=SubTypesClass(typeNames=["Projections"]),
+            aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()
 
         if self.config.domain:
datahub/ingestion/source/sql_queries.py (+219 -121)

@@ -2,21 +2,22 @@ import json
 import logging
 import os
 from dataclasses import dataclass
-from datetime import datetime
+from datetime import datetime
 from functools import partial
-from typing import Iterable, List, Optional,
+from typing import ClassVar, Iterable, List, Optional, Union
 
-from pydantic import Field
+from pydantic import BaseModel, Field, validator
 
+from datahub.configuration.common import HiddenFromDocs
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
 from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
-    make_user_urn,
 )
-from datahub.emitter.
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -25,6 +26,10 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+    auto_incremental_lineage,
+)
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     Source,
@@ -35,13 +40,21 @@ from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
+from datahub.metadata.urns import CorpUserUrn, DatasetUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownQueryLineageInfo,
+    ObservedQuery,
+    SqlAggregatorReport,
+    SqlParsingAggregator,
+)
 
 logger = logging.getLogger(__name__)
 
 
-class SqlQueriesSourceConfig(
+class SqlQueriesSourceConfig(
+    PlatformInstanceConfigMixin, EnvConfigMixin, IncrementalLineageConfigMixin
+):
     query_file: str = Field(description="Path to file to ingest")
 
     platform: str = Field(
@@ -53,45 +66,34 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         default=BaseUsageConfig(),
     )
 
-    use_schema_resolver: bool = Field(
+    use_schema_resolver: HiddenFromDocs[bool] = Field(
+        True,
         description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.",
-        default=True,
-        hidden_from_docs=True,
     )
     default_db: Optional[str] = Field(
+        None,
         description="The default database to use for unqualified table names",
-        default=None,
     )
     default_schema: Optional[str] = Field(
+        None,
         description="The default schema to use for unqualified table names",
-        default=None,
    )
-
+    override_dialect: Optional[str] = Field(
+        None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
-        default=None,
     )
 
 
+@dataclass
 class SqlQueriesSourceReport(SourceReport):
-
-
-
-
-    def compute_stats(self) -> None:
-        super().compute_stats()
-        self.table_failure_rate = (
-            f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
-        self.column_failure_rate = (
-            f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
+    num_entries_processed: int = 0
+    num_entries_failed: int = 0
+    num_queries_aggregator_failures: int = 0
 
+    sql_aggregator: Optional[SqlAggregatorReport] = None
 
-
+
+@platform_name("SQL Queries", id="sql-queries")
 @config_class(SqlQueriesSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Parsed from SQL queries")
@@ -107,15 +109,25 @@ class SqlQueriesSource(Source):
     - user (optional): string - The user who ran the query.
         This user value will be directly converted into a DataHub user urn.
     - operation_type (optional): string - Platform-specific operation type, used if the operation type can't be parsed.
+    - session_id (optional): string - Session identifier for temporary table resolution across queries.
     - downstream_tables (optional): string[] - Fallback list of tables that the query writes to,
         used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
        used if the query can't be parsed.
+
+    ### Incremental Lineage
+    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
+    This allows you to add lineage edges without removing existing ones, which is useful for:
+    - Gradually building up lineage from multiple sources
+    - Preserving manually curated lineage
+    - Avoiding conflicts when multiple ingestion processes target the same datasets
+
+    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
+    statistics will still be emitted normally.
     """
 
-
-
-    builder: SqlParsingBuilder
+    schema_resolver: Optional[SchemaResolver]
+    aggregator: SqlParsingAggregator
 
     def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
         if not ctx.graph:
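
The docstring above lists the fields the source expects in its newline-delimited JSON query file. Below is a hypothetical example of such a file, written from Python; only fields named in the docstring are used, and all queries, tables, users, timestamps, and the file name are invented for illustration. With `incremental_lineage: true` in the source config (added via `IncrementalLineageConfigMixin`), the lineage derived from these queries is emitted as patches, as described above.

# Hypothetical query file for the sql-queries source; field names follow the
# docstring above, everything else is made up.
import json

entries = [
    {
        # Both upstream_tables and downstream_tables given: the source can record
        # this as known query lineage instead of relying purely on SQL parsing.
        "query": "INSERT INTO analytics.daily_orders SELECT * FROM raw.orders",
        "timestamp": "2024-06-01T12:00:00Z",
        "user": "etl_service",
        "session_id": "session-42",
        "upstream_tables": ["raw.orders"],
        "downstream_tables": ["analytics.daily_orders"],
    },
    {
        # No explicit lineage: the source falls back to SQL parsing.
        "query": "SELECT count(*) FROM analytics.daily_orders",
        "user": "analyst_1",
    },
]

with open("queries.jsonl", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
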
@@ -128,22 +140,36 @@ class SqlQueriesSource(Source):
         self.config = config
         self.report = SqlQueriesSourceReport()
 
-        self.builder = SqlParsingBuilder(usage_config=self.config.usage)
-
         if self.config.use_schema_resolver:
+            # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
+            # for the given platform, platform instance, and env. Instead this should be configurable:
+            # bulk initialization vs lazy on-demand schema fetching.
             self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
             )
-            self.urns = self.schema_resolver.get_urns()
         else:
-            self.schema_resolver =
-
-
-
-
-            self.
+            self.schema_resolver = None
+
+        self.aggregator = SqlParsingAggregator(
+            platform=self.config.platform,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+            schema_resolver=self.schema_resolver,
+            eager_graph_load=False,
+            generate_lineage=True,  # TODO: make this configurable
+            generate_queries=True,  # TODO: make this configurable
+            generate_query_subject_fields=True,  # TODO: make this configurable
+            generate_query_usage_statistics=True,  # This enables publishing SELECT query entities, otherwise only mutation queries are published
+            generate_usage_statistics=True,
+            generate_operations=True,  # TODO: make this configurable
+            usage_config=self.config.usage,
+            is_temp_table=None,
+            is_allowed_table=None,
+            format_queries=False,
+        )
+        self.report.sql_aggregator = self.aggregator.report
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource":
@@ -154,100 +180,172 @@ class SqlQueriesSource(Source):
         return self.report
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
-        return [
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+            partial(
+                auto_incremental_lineage,
+                self.config.incremental_lineage,
+            ),
+        ]
 
-    def get_workunits_internal(
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")
+
+        with self.report.new_stage("Collecting queries from file"):
+            queries = list(self._parse_query_file())
+            logger.info(f"Collected {len(queries)} queries for processing")
+
+        with self.report.new_stage("Processing queries through SQL parsing aggregator"):
+            for query_entry in queries:
+                self._add_query_to_aggregator(query_entry)
+
+        with self.report.new_stage("Generating metadata work units"):
+            logger.info("Generating workunits from SQL parsing aggregator")
+            yield from self.aggregator.gen_metadata()
+
+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
                     query_dict = json.loads(line, strict=False)
                     entry = QueryEntry.create(query_dict, config=self.config)
-
+                    self.report.num_entries_processed += 1
+                    if self.report.num_entries_processed % 1000 == 0:
+                        logger.info(
+                            f"Processed {self.report.num_entries_processed} query entries"
+                        )
+                    yield entry
                 except Exception as e:
-
-                    self.report.
-
-
-
-
-
-                    self.report.num_queries_parsed += 1
-                    if self.report.num_queries_parsed % 1000 == 0:
-                        logger.info(f"Parsed {self.report.num_queries_parsed} queries")
+                    self.report.num_entries_failed += 1
+                    self.report.warning(
+                        title="Error processing query",
+                        message="Query skipped due to parsing error",
+                        context=line.strip(),
+                        exc=e,
+                    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
+        """Add a query to the SQL parsing aggregator."""
+        try:
+            # If we have both upstream and downstream tables, use explicit lineage
+            if query_entry.upstream_tables and query_entry.downstream_tables:
+                logger.debug("Using explicit lineage from query file")
+                for downstream_table in query_entry.downstream_tables:
+                    known_lineage = KnownQueryLineageInfo(
+                        query_text=query_entry.query,
+                        downstream=str(downstream_table),
+                        upstreams=[str(urn) for urn in query_entry.upstream_tables],
+                        timestamp=query_entry.timestamp,
+                        session_id=query_entry.session_id,
+                    )
+                    self.aggregator.add_known_query_lineage(known_lineage)
+            else:
+                # Warn if only partial lineage information is provided
+                # XOR: true if exactly one of upstream_tables or downstream_tables is provided
+                if bool(query_entry.upstream_tables) ^ bool(
+                    query_entry.downstream_tables
+                ):
+                    query_preview = (
+                        query_entry.query[:150] + "..."
+                        if len(query_entry.query) > 150
+                        else query_entry.query
+                    )
+                    missing_upstream = (
+                        "Missing upstream. " if not query_entry.upstream_tables else ""
+                    )
+                    missing_downstream = (
+                        "Missing downstream. "
+                        if not query_entry.downstream_tables
+                        else ""
+                    )
+                    logger.info(
+                        f"Only partial lineage information provided, falling back to SQL parsing for complete lineage detection. {missing_upstream}{missing_downstream}Query: {query_preview}"
+                    )
+                # No explicit lineage, rely on parsing
+                observed_query = ObservedQuery(
+                    query=query_entry.query,
+                    timestamp=query_entry.timestamp,
+                    user=query_entry.user,
+                    session_id=query_entry.session_id,
+                    default_db=self.config.default_db,
+                    default_schema=self.config.default_schema,
+                    override_dialect=self.config.override_dialect,
                )
-
-
-
-
+                self.aggregator.add_observed_query(observed_query)
+
+        except Exception as e:
+            self.report.num_queries_aggregator_failures += 1
+            self.report.warning(
+                title="Error adding query to aggregator",
+                message="Query skipped due to failure when adding query to SQL parsing aggregator",
+                context=query_entry.query,
+                exc=e,
            )
-            self.report.num_column_parse_failures += 1
-
-            yield from self.builder.process_sql_parsing_result(
-                result,
-                query=entry.query,
-                query_timestamp=entry.timestamp,
-                user=entry.user,
-                custom_operation_type=entry.operation_type,
-                include_urns=self.urns,
-            )
 
 
-
-class QueryEntry:
+class QueryEntry(BaseModel):
     query: str
-    timestamp: Optional[datetime]
-    user: Optional[
-    operation_type: Optional[str]
-    downstream_tables: List[
-    upstream_tables: List[
+    timestamp: Optional[datetime] = None
+    user: Optional[CorpUserUrn] = None
+    operation_type: Optional[str] = None
+    downstream_tables: List[DatasetUrn] = Field(default_factory=list)
+    upstream_tables: List[DatasetUrn] = Field(default_factory=list)
+    session_id: Optional[str] = None
+
+    # Validation context for URN creation
+    _validation_context: ClassVar[Optional[SqlQueriesSourceConfig]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    @validator("timestamp", pre=True)
+    def parse_timestamp(cls, v):
+        return None if v is None else parse_user_datetime(str(v))
+
+    @validator("user", pre=True)
+    def parse_user(cls, v):
+        if v is None:
+            return None
+
+        return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)
+
+    @validator("downstream_tables", "upstream_tables", pre=True)
+    def parse_tables(cls, v):
+        if not v:
+            return []
+
+        result = []
+        for item in v:
+            if isinstance(item, DatasetUrn):
+                result.append(item)
+            elif isinstance(item, str):
+                # Skip empty/whitespace-only strings
+                if item and item.strip():
+                    # Convert to URN using validation context
+                    assert cls._validation_context, (
+                        "Validation context must be set for URN creation"
+                    )
+                    urn_string = make_dataset_urn_with_platform_instance(
+                        name=item,
+                        platform=cls._validation_context.platform,
+                        platform_instance=cls._validation_context.platform_instance,
+                        env=cls._validation_context.env,
+                    )
+                    result.append(DatasetUrn.from_string(urn_string))
+
+        return result
 
     @classmethod
     def create(
         cls, entry_dict: dict, *, config: SqlQueriesSourceConfig
     ) -> "QueryEntry":
-
-
-
-
-
-
-
-            user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None,
-            operation_type=entry_dict.get("operation_type"),
-            downstream_tables=[
-                make_dataset_urn_with_platform_instance(
-                    name=table,
-                    platform=config.platform,
-                    platform_instance=config.platform_instance,
-                    env=config.env,
-                )
-                for table in entry_dict.get("downstream_tables", [])
-            ],
-            upstream_tables=[
-                make_dataset_urn_with_platform_instance(
-                    name=table,
-                    platform=config.platform,
-                    platform_instance=config.platform_instance,
-                    env=config.env,
-                )
-                for table in entry_dict.get("upstream_tables", [])
-            ],
-        )
+        """Create QueryEntry from dict with config context."""
+        # Set validation context for URN creation
+        cls._validation_context = config
+        try:
+            return cls.parse_obj(entry_dict)
+        finally:
+            cls._validation_context = None
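
A sketch of how the new pydantic-based `QueryEntry` resolves bare table names into dataset URNs via the config passed to `create()`, based on the code above. The platform, table names, and the printed URN are illustrative, and the exact output depends on the config's `env` and `platform_instance` defaults.

# Illustrative use of the new QueryEntry model (values are made up); the class and
# module names come from the sql_queries diff above.
from datahub.ingestion.source.sql_queries import QueryEntry, SqlQueriesSourceConfig

config = SqlQueriesSourceConfig(query_file="queries.jsonl", platform="snowflake")

entry = QueryEntry.create(
    {
        "query": "INSERT INTO db.schema.target SELECT * FROM db.schema.source",
        "user": "etl_service",
        "upstream_tables": ["db.schema.source"],
        "downstream_tables": ["db.schema.target"],
    },
    config=config,
)

# Bare table names were resolved into dataset URNs using the config's platform/env.
print([str(urn) for urn in entry.upstream_tables])
# e.g. ['urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.source,PROD)']
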
datahub/ingestion/source/state/checkpoint.py (+8 -29)

@@ -1,10 +1,8 @@
 import base64
 import bz2
-import contextlib
 import functools
 import json
 import logging
-import pickle
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import Callable, Generic, Optional, Type, TypeVar
@@ -70,7 +68,11 @@ class CheckpointStateBase(ConfigModel):
 
     @staticmethod
     def _to_bytes_utf8(model: ConfigModel) -> bytes:
-
+        pydantic_json = model.model_dump_json(exclude={"version", "serde"})
+        # We decode and re-encode so that Python's default whitespace is included.
+        # This is purely to keep tests consistent as we migrate to pydantic v2,
+        # and can be removed once we're fully migrated.
+        return json.dumps(json.loads(pydantic_json)).encode("utf-8")
 
     @staticmethod
     def _to_bytes_base85_json(
@@ -117,10 +119,9 @@ class Checkpoint(Generic[StateType]):
                 checkpoint_aspect, state_class
             )
         elif checkpoint_aspect.state.serde == "base85":
-
-
-
-                state_class,
+            raise ValueError(
+                "The base85 encoding for stateful ingestion has been removed for security reasons. "
+                "You may need to temporarily set `ignore_previous_checkpoint` to true to ignore the outdated checkpoint object."
             )
         elif checkpoint_aspect.state.serde == "base85-bz2-json":
             state_obj = Checkpoint._from_base85_json_bytes(
@@ -164,28 +165,6 @@ class Checkpoint(Generic[StateType]):
         state_as_dict["serde"] = checkpoint_aspect.state.serde
         return state_class.parse_obj(state_as_dict)
 
-    @staticmethod
-    def _from_base85_bytes(
-        checkpoint_aspect: DatahubIngestionCheckpointClass,
-        decompressor: Callable[[bytes], bytes],
-        state_class: Type[StateType],
-    ) -> StateType:
-        state: StateType = pickle.loads(
-            decompressor(base64.b85decode(checkpoint_aspect.state.payload))  # type: ignore
-        )
-
-        with contextlib.suppress(Exception):
-            # When loading from pickle, the pydantic validators don't run.
-            # By re-serializing and re-parsing, we ensure that the state is valid.
-            # However, we also suppress any exceptions to make sure this doesn't blow up.
-            state = state_class.parse_obj(state.dict())
-
-        # Because the base85 method is deprecated in favor of base85-bz2-json,
-        # we will automatically switch the serde.
-        state.serde = "base85-bz2-json"
-
-        return state
-
     @staticmethod
     def _from_base85_json_bytes(
         checkpoint_aspect: DatahubIngestionCheckpointClass,
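
The decode/re-encode step added to `_to_bytes_utf8` only normalizes JSON whitespace: pydantic v2's `model_dump_json()` emits compact JSON, while `json.dumps()` inserts a space after each separator by default, which roughly matches what the old pydantic v1 `.json()` path produced. A standalone illustration (not datahub code):

import json

# Compact JSON as model_dump_json() would emit it, versus the normalized form.
compact = '{"version":1,"urns":["urn:li:dataset:(a,b,PROD)"]}'
normalized = json.dumps(json.loads(compact))
print(normalized)  # {"version": 1, "urns": ["urn:li:dataset:(a,b,PROD)"]}
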
datahub/ingestion/source/state/entity_removal_state.py (+5 -2)

@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable, List, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, Type
 
 import pydantic
 
@@ -8,13 +8,16 @@ from datahub.utilities.checkpoint_state_util import CheckpointStateUtil
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn import guess_entity_type
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 STATEFUL_INGESTION_IGNORED_ENTITY_TYPES = {
     "dataProcessInstance",
     "query",
 }
 
 
-def pydantic_state_migrator(mapping: Dict[str, str]) ->
+def pydantic_state_migrator(mapping: Dict[str, str]) -> "V1RootValidator":
     # mapping would be something like:
     # {
     #     'encoded_view_urns': 'dataset',