acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/athena.py
@@ -16,6 +16,7 @@ from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.types import TypeEngine
 from sqlalchemy_bigquery import STRUCT
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey
 from datahub.ingestion.api.decorators import (
@@ -29,8 +30,14 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
-from datahub.ingestion.source.common.subtypes import
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.athena_properties_extractor import (
+    AthenaPropertiesExtractor,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
@@ -44,12 +51,17 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import (
+    ArrayTypeClass,
+    MapTypeClass,
+    RecordTypeClass,
+)
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
 from datahub.utilities.sqlalchemy_type_converter import (
     MapType,
     get_schema_fields_for_sqlalchemy_column,
 )
+from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
 
 try:
     from typing_extensions import override
@@ -62,6 +74,11 @@ except ImportError:
 
 logger = logging.getLogger(__name__)
 
+# Precompiled regex for SQL identifier validation
+# Athena identifiers can only contain lowercase letters, numbers, underscore, and period (for complex types)
+# Note: Athena automatically converts uppercase to lowercase, but we're being strict for security
+_IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z0-9_.]+$")
+
 assert STRUCT, "required type modules are not available"
 register_custom_type(STRUCT, RecordTypeClass)
 register_custom_type(MapType, MapTypeClass)
@@ -235,7 +252,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
 
 
 class AthenaConfig(SQLCommonConfig):
-    scheme: str = "awsathena+rest"
+    scheme: HiddenFromDocs[str] = "awsathena+rest"
     username: Optional[str] = pydantic.Field(
         default=None,
         description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html",
@@ -281,12 +298,22 @@ class AthenaConfig(SQLCommonConfig):
         description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
     )
 
+    extract_partitions_using_create_statements: bool = pydantic.Field(
+        default=False,
+        description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
+    )
+
     _s3_staging_dir_population = pydantic_renamed_field(
         old_name="s3_staging_dir",
         new_name="query_result_location",
         print_warning=True,
     )
 
+    emit_schema_fieldpaths_as_v1: bool = pydantic.Field(
+        default=False,
+        description="Convert simple field paths to DataHub field path v1 format. Simple column paths are those that do not contain any nested fields.",
+    )
+
     profiling: AthenaProfilingConfig = AthenaProfilingConfig()
 
     def get_sql_alchemy_url(self):
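The two fields added in the hunk above are ordinary pydantic config options, so they can be switched on from a regular Athena ingestion recipe. The sketch below is illustrative only: the connection values (aws_region, work_group, the S3 result location) and the console sink are placeholder assumptions, while the two flags marked "new" come directly from this diff.

# Sketch: enabling the new AthenaConfig flags from an ingestion recipe.
# Connection values and the sink are placeholders; only the two flags
# marked "new" are taken from the diff above.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "athena",
            "config": {
                "aws_region": "us-east-1",  # placeholder
                "work_group": "primary",  # placeholder
                "query_result_location": "s3://my-bucket/athena-results/",  # placeholder
                # New: derive partitions from SHOW CREATE TABLE (needed for Iceberg);
                # falls back to the SQLAlchemy path if parsing fails.
                "extract_partitions_using_create_statements": True,
                # New: emit simple (non-nested) column paths in v1 format.
                "emit_schema_fieldpaths_as_v1": True,
            },
        },
        "sink": {"type": "console"},  # placeholder
    }
)
pipeline.run()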
@@ -321,8 +348,24 @@ class Partitionitem:
 @capability(
     SourceCapability.DATA_PROFILING,
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Supported for S3 tables",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.TABLE,
+    ],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Supported for S3 tables",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.TABLE,
+    ],
 )
-@capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """
@@ -473,11 +516,70 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
+    @classmethod
+    def _sanitize_identifier(cls, identifier: str) -> str:
+        """Sanitize SQL identifiers to prevent injection attacks.
+
+        Args:
+            identifier: The SQL identifier to sanitize
+
+        Returns:
+            Sanitized identifier safe for SQL queries
+
+        Raises:
+            ValueError: If identifier contains unsafe characters
+        """
+        if not identifier:
+            raise ValueError("Identifier cannot be empty")
+
+        # Allow only alphanumeric characters, underscores, and periods for identifiers
+        # This matches Athena's identifier naming rules
+        if not _IDENTIFIER_PATTERN.match(identifier):
+            raise ValueError(
+                f"Identifier '{identifier}' contains unsafe characters. Only alphanumeric characters, underscores, and periods are allowed."
+            )
+
+        return identifier
+
     @classmethod
     def _casted_partition_key(cls, key: str) -> str:
         # We need to cast the partition keys to a VARCHAR, since otherwise
         # Athena may throw an error during concatenation / comparison.
-
+        sanitized_key = cls._sanitize_identifier(key)
+        return f"CAST({sanitized_key} as VARCHAR)"
+
+    @classmethod
+    def _build_max_partition_query(
+        cls, schema: str, table: str, partitions: List[str]
+    ) -> str:
+        """Build SQL query to find the row with maximum partition values.
+
+        Args:
+            schema: Database schema name
+            table: Table name
+            partitions: List of partition column names
+
+        Returns:
+            SQL query string to find the maximum partition
+
+        Raises:
+            ValueError: If any identifier contains unsafe characters
+        """
+        # Sanitize all identifiers to prevent SQL injection
+        sanitized_schema = cls._sanitize_identifier(schema)
+        sanitized_table = cls._sanitize_identifier(table)
+        sanitized_partitions = [
+            cls._sanitize_identifier(partition) for partition in partitions
+        ]
+
+        casted_keys = [cls._casted_partition_key(key) for key in partitions]
+        if len(casted_keys) == 1:
+            part_concat = casted_keys[0]
+        else:
+            separator = "CAST('-' AS VARCHAR)"
+            part_concat = f"CONCAT({f', {separator}, '.join(casted_keys)})"
+
+        return f'select {",".join(sanitized_partitions)} from "{sanitized_schema}"."{sanitized_table}$partitions" where {part_concat} = (select max({part_concat}) from "{sanitized_schema}"."{sanitized_table}$partitions")'
 
     @override
     def get_partitions(
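To make the behaviour of the new helpers above concrete, here is a small standalone sketch that re-implements the same validation regex and query construction outside of AthenaSource. It is an illustration of the logic in the hunk above, not an import of the real class; the sample schema, table, and partition names are invented.

# Sketch: standalone re-implementation of the sanitization and
# max-partition query logic added above, for illustration only.
import re
from typing import List

_IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z0-9_.]+$")


def sanitize_identifier(identifier: str) -> str:
    # Mirrors AthenaSource._sanitize_identifier: only alphanumerics,
    # underscores, and periods are accepted.
    if not identifier or not _IDENTIFIER_PATTERN.match(identifier):
        raise ValueError(f"Identifier {identifier!r} contains unsafe characters")
    return identifier


def casted_partition_key(key: str) -> str:
    return f"CAST({sanitize_identifier(key)} as VARCHAR)"


def build_max_partition_query(schema: str, table: str, partitions: List[str]) -> str:
    schema, table = sanitize_identifier(schema), sanitize_identifier(table)
    columns = [sanitize_identifier(p) for p in partitions]
    casted = [casted_partition_key(p) for p in partitions]
    if len(casted) == 1:
        part_concat = casted[0]
    else:
        # Join the keys with a literal '-' so max() compares one concatenated string.
        joiner = ", CAST('-' AS VARCHAR), "
        part_concat = f"CONCAT({joiner.join(casted)})"
    return (
        f'select {",".join(columns)} from "{schema}"."{table}$partitions" '
        f"where {part_concat} = (select max({part_concat}) "
        f'from "{schema}"."{table}$partitions")'
    )


print(build_max_partition_query("sales", "orders", ["year", "month"]))
# sanitize_identifier("orders; drop table x") raises ValueError instead of
# letting the string reach the generated SQL.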
@@ -489,27 +591,37 @@ class AthenaSource(SQLAlchemySource):
         if not self.cursor:
             return None
 
-
-
-
+        if self.config.extract_partitions_using_create_statements:
+            try:
+                partitions = self._get_partitions_create_table(schema, table)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
+                    exc_info=True,
+                )
+
+                # If we can't get create table statement, we fall back to SQLAlchemy
+                partitions = self._get_partitions_sqlalchemy(schema, table)
+        else:
+            partitions = self._get_partitions_sqlalchemy(schema, table)
 
-        partitions = []
-        for key in metadata.partition_keys:
-            if key.name:
-                partitions.append(key.name)
         if not partitions:
             return []
 
+        if (
+            not self.config.profiling.enabled
+            or not self.config.profiling.partition_profiling_enabled
+        ):
+            return partitions
+
         with self.report.report_exc(
             message="Failed to extract partition details",
             context=f"{schema}.{table}",
             level=StructuredLogLevel.WARN,
         ):
-
-
-                self._casted_partition_key(key) for key in partitions
+            max_partition_query = self._build_max_partition_query(
+                schema, table, partitions
             )
-            max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
             ret = self.cursor.execute(max_partition_query)
             max_partition: Dict[str, str] = {}
             if ret:
@@ -525,6 +637,56 @@ class AthenaSource(SQLAlchemySource):
 
         return partitions
 
+    def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        try:
+            res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
+        except Exception as e:
+            # Athena does not support SHOW CREATE TABLE for views
+            # and will throw an error. We need to handle this case
+            # and caller needs to fallback to sqlalchemy's get partitions call.
+            logger.debug(
+                f"Failed to get table properties for {schema}.{table}: {e}",
+                exc_info=True,
+            )
+            raise e
+        rows = res.fetchall()
+
+        # Concatenate all rows into a single string with newlines
+        create_table_statement = "\n".join(row[0] for row in rows)
+
+        try:
+            athena_table_info = AthenaPropertiesExtractor.get_table_properties(
+                create_table_statement
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
+                exc_info=True,
+            )
+            raise e
+
+        partitions = []
+        if (
+            athena_table_info.partition_info
+            and athena_table_info.partition_info.simple_columns
+        ):
+            partitions = [
+                ci.name for ci in athena_table_info.partition_info.simple_columns
+            ]
+        return partitions
+
+    def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
+            table_name=table, schema_name=schema
+        )
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        return partitions
+
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
         self,
@@ -551,6 +713,18 @@ class AthenaSource(SQLAlchemySource):
             ),
         )
 
+        # Keeping it as individual check to make it more explicit and easier to understand
+        if not self.config.emit_schema_fieldpaths_as_v1:
+            return fields
+
+        if isinstance(
+            fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
+        ):
+            return fields
+        else:
+            fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
+                fields[0].fieldPath
+            )
         return fields
 
     def generate_partition_profiler_query(
@@ -564,16 +738,34 @@ class AthenaSource(SQLAlchemySource):
         ).get(table, None)
 
         if partition and partition.max_partition:
-
-
-
-
+            try:
+                # Sanitize identifiers to prevent SQL injection
+                sanitized_schema = self._sanitize_identifier(schema)
+                sanitized_table = self._sanitize_identifier(table)
+
+                max_partition_filters = []
+                for key, value in partition.max_partition.items():
+                    # Sanitize partition key and properly escape the value
+                    sanitized_key = self._sanitize_identifier(key)
+                    # Escape single quotes in the value to prevent injection
+                    escaped_value = value.replace("'", "''") if value else ""
+                    max_partition_filters.append(
+                        f"{self._casted_partition_key(sanitized_key)} = '{escaped_value}'"
+                    )
+                max_partition = str(partition.max_partition)
+                return (
+                    max_partition,
+                    f'SELECT * FROM "{sanitized_schema}"."{sanitized_table}" WHERE {" AND ".join(max_partition_filters)}',
                 )
-
-
-
-
-
+            except ValueError as e:
+                # If sanitization fails due to malicious identifiers,
+                # return None to disable partition profiling for this table
+                # rather than crashing the entire ingestion
+                logger.warning(
+                    f"Failed to generate partition profiler query for {schema}.{table} due to unsafe identifiers: {e}. "
+                    f"Partition profiling disabled for this table."
+                )
+                return None, None
         return None, None
 
     def close(self):