acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,732 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
|
|
4
|
+
# Add imports for source customization
|
|
5
|
+
from typing import Any, Callable, Dict, Optional, Type, TypeVar
|
|
6
|
+
from urllib.parse import unquote
|
|
7
|
+
|
|
8
|
+
# Don't import TableData at the module level to avoid circular imports
|
|
9
|
+
# from datahub.ingestion.source.s3.source import TableData
|
|
10
|
+
|
|
11
|
+
T = TypeVar("T")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ObjectStoreInterface(ABC):
    """
    Abstract interface for object store operations.

    This interface defines the operations that any object store connector
    (S3, GCS, ABS, etc.) should implement to provide a consistent API.

    All operations are classmethods that take a URI string; concrete
    subclasses must implement every ``@abstractmethod`` below before they
    can be instantiated.
    """

    @classmethod
    @abstractmethod
    def is_uri(cls, uri: str) -> bool:
        """
        Check if the given URI is for this object store.

        Args:
            uri: The URI to check

        Returns:
            True if the URI is for this object store, False otherwise
        """

    @classmethod
    @abstractmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """
        Get the prefix for this object store URI (e.g., 's3://', 'gs://').

        Args:
            uri: The URI to get the prefix from

        Returns:
            The prefix if the URI starts with it, None otherwise
        """

    @classmethod
    @abstractmethod
    def strip_prefix(cls, uri: str) -> str:
        """
        Remove the object store prefix from the URI.

        Args:
            uri: The URI to strip the prefix from

        Returns:
            The URI without the prefix

        Raises:
            ValueError: If the URI does not start with the expected prefix
        """

    @classmethod
    @abstractmethod
    def get_bucket_name(cls, uri: str) -> str:
        """
        Get the bucket name from the URI.

        Args:
            uri: The URI to get the bucket name from

        Returns:
            The bucket name

        Raises:
            ValueError: If the URI is not valid for this object store
        """

    @classmethod
    @abstractmethod
    def get_object_key(cls, uri: str) -> str:
        """
        Get the object key/path (excluding the bucket) from the URI.

        Args:
            uri: The URI to get the object key from

        Returns:
            The object key

        Raises:
            ValueError: If the URI is not valid for this object store
        """

    @classmethod
    def get_object_store_bucket_name(cls, uri: str) -> str:
        """
        Get the bucket name from the URI, handling foreign URIs if supported.

        The default implementation just calls get_bucket_name, but subclasses
        can override this to handle URIs from other object stores.

        Args:
            uri: The URI to get the bucket name from

        Returns:
            The bucket name

        Raises:
            ValueError: If the URI is not supported
        """
        # Not abstract on purpose: subclasses inherit plain delegation to
        # their own get_bucket_name unless they override this method.
        return cls.get_bucket_name(uri)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class S3ObjectStore(ObjectStoreInterface):
    """Implementation of ObjectStoreInterface for Amazon S3.

    A URI is considered an S3 URI when it starts with one of ``PREFIXES``.
    The bucket is the text between the prefix and the first ``/``; the
    object key is everything after that first ``/``.
    """

    PREFIXES = ["s3://", "s3n://", "s3a://"]

    @classmethod
    def is_uri(cls, uri: str) -> bool:
        """Return True if ``uri`` starts with any of the S3 prefixes."""
        # str.startswith accepts a tuple of candidates, so no explicit loop
        # is needed; PREFIXES stays a list for backward compatibility.
        return uri.startswith(tuple(cls.PREFIXES))

    @classmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """Return the first entry of ``PREFIXES`` that ``uri`` starts with, or None."""
        return next(
            (prefix for prefix in cls.PREFIXES if uri.startswith(prefix)), None
        )

    @classmethod
    def strip_prefix(cls, uri: str) -> str:
        """
        Remove the S3 prefix from the URI.

        Args:
            uri: The URI to strip the prefix from

        Returns:
            The URI without its leading S3 prefix

        Raises:
            ValueError: If the URI does not start with one of ``PREFIXES``
        """
        prefix = cls.get_prefix(uri)
        if not prefix:
            raise ValueError(
                f"Not an S3 URI. Must start with one of the following prefixes: {cls.PREFIXES}"
            )
        return uri[len(prefix) :]

    @classmethod
    def get_bucket_name(cls, uri: str) -> str:
        """
        Get the bucket name from an S3 URI.

        Args:
            uri: The URI to get the bucket name from

        Returns:
            The text between the prefix and the first "/" (the whole
            remainder when there is no "/")

        Raises:
            ValueError: If the URI is not an S3 URI
        """
        # strip_prefix performs the S3-URI validation and raises the same
        # ValueError, so no separate is_uri check is required here.
        return cls.strip_prefix(uri).partition("/")[0]

    @classmethod
    def get_object_key(cls, uri: str) -> str:
        """
        Get the object key (the path after the bucket) from an S3 URI.

        Args:
            uri: The URI to get the object key from

        Returns:
            Everything after the first "/" following the prefix, or an
            empty string when the URI contains only a bucket

        Raises:
            ValueError: If the URI is not an S3 URI
        """
        # partition yields "" for the tail when no "/" is present, which
        # matches the bucket-only case without an explicit length check.
        return cls.strip_prefix(uri).partition("/")[2]

    @classmethod
    def get_object_store_bucket_name(cls, uri: str) -> str:
        """
        Get the bucket name from an S3 URI.

        Args:
            uri: The URI to get the bucket name from

        Returns:
            The bucket name

        Raises:
            ValueError: If the URI is not an S3 URI
        """
        return cls.get_bucket_name(uri)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class GCSObjectStore(ObjectStoreInterface):
    """Google Cloud Storage flavour of ObjectStoreInterface."""

    # The only URI scheme recognised as GCS.
    PREFIX = "gs://"

    @classmethod
    def is_uri(cls, uri: str) -> bool:
        """Return True when ``uri`` begins with the GCS prefix."""
        has_prefix = uri.startswith(cls.PREFIX)
        return has_prefix

    @classmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """Return the GCS prefix when ``uri`` starts with it, otherwise None."""
        return cls.PREFIX if uri.startswith(cls.PREFIX) else None

    @classmethod
    def strip_prefix(cls, uri: str) -> str:
        """
        Return ``uri`` with the GCS prefix removed.

        Raises:
            ValueError: If ``uri`` does not start with the GCS prefix.
        """
        matched = cls.get_prefix(uri)
        if matched:
            return uri[len(matched) :]
        raise ValueError(f"Not a GCS URI. Must start with prefix: {cls.PREFIX}")

    @classmethod
    def get_bucket_name(cls, uri: str) -> str:
        """
        Return the bucket name: everything between the prefix and the first "/".

        Raises:
            ValueError: If ``uri`` is not a GCS URI.
        """
        if cls.is_uri(uri):
            bucket, _, _ = cls.strip_prefix(uri).partition("/")
            return bucket
        raise ValueError(f"Not a GCS URI. Must start with prefix: {cls.PREFIX}")

    @classmethod
    def get_object_key(cls, uri: str) -> str:
        """
        Return the object key: everything after the first "/" following the bucket.

        An empty string is returned when the URI names only a bucket.

        Raises:
            ValueError: If ``uri`` is not a GCS URI.
        """
        if not cls.is_uri(uri):
            raise ValueError(f"Not a GCS URI. Must start with prefix: {cls.PREFIX}")
        _, _, key = cls.strip_prefix(uri).partition("/")
        return key

    @classmethod
    def get_object_store_bucket_name(cls, uri: str) -> str:
        """
        Return the bucket name of a GCS URI.

        Args:
            uri: URI to parse.

        Returns:
            Name of the bucket.

        Raises:
            ValueError: If ``uri`` is not a GCS URI.
        """
        bucket_name = cls.get_bucket_name(uri)
        return bucket_name
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class ABSObjectStore(ObjectStoreInterface):
    """Azure Blob Storage flavour of ObjectStoreInterface."""

    # abfss://container@account.dfs.core.windows.net/path
    PREFIX = "abfss://"
    # https://account.blob.core.windows.net/container/path
    # Group 1 captures everything up to and including the host's trailing "/".
    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")

    @classmethod
    def is_uri(cls, uri: str) -> bool:
        """Return True for abfss:// URIs and for blob.core.windows.net HTTP(S) URLs."""
        return cls.get_prefix(uri) is not None

    @classmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """
        Return the leading part of ``uri`` that identifies it as ABS.

        That is the abfss:// scheme, or the scheme-plus-host portion of an
        HTTPS blob URL. None is returned when neither form matches.
        """
        if uri.startswith(cls.PREFIX):
            return cls.PREFIX

        https_match = cls.HTTPS_REGEX.match(uri)
        return https_match.group(1) if https_match else None

    @classmethod
    def strip_prefix(cls, uri: str) -> str:
        """
        Return ``uri`` with the prefix reported by ``get_prefix`` removed.

        Raises:
            ValueError: If ``uri`` is in neither supported ABS form.
        """
        leading = cls.get_prefix(uri)
        if leading is None:
            raise ValueError(
                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
            )
        return uri[len(leading) :]

    @classmethod
    def get_bucket_name(cls, uri: str) -> str:
        """
        Return the container name.

        For abfss URIs the container precedes the "@"; for HTTPS URLs it is
        the first path segment after the host.

        Raises:
            ValueError: If ``uri`` is in neither supported ABS form.
        """
        if not cls.is_uri(uri):
            raise ValueError(
                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
            )

        # abfss://container@account...  vs  https://account.../container/path
        delimiter = "@" if uri.startswith(cls.PREFIX) else "/"
        return cls.strip_prefix(uri).split(delimiter)[0]

    @classmethod
    def get_object_key(cls, uri: str) -> str:
        """
        Return the object path below the container.

        An empty string is returned when the URI carries no path below the
        container (or, for abfss, when the "@" separator is missing).

        Raises:
            ValueError: If ``uri`` is in neither supported ABS form.
        """
        if not cls.is_uri(uri):
            raise ValueError(
                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
            )

        remainder = cls.strip_prefix(uri)
        if uri.startswith(cls.PREFIX):
            # container@account.dfs.core.windows.net/path -> drop "container@",
            # then drop the account host up to the first "/".
            _, _, account_and_path = remainder.partition("@")
            _, _, key = account_and_path.partition("/")
            return key

        # container/path -> drop the container segment.
        _, _, key = remainder.partition("/")
        return key
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# Registry of all object store implementations, keyed by platform identifier.
# get_object_store_for_uri() walks the values in insertion order and returns
# the first class whose is_uri() accepts the URI.
OBJECT_STORE_REGISTRY: Dict[str, Type[ObjectStoreInterface]] = {
    "s3": S3ObjectStore,
    "gcs": GCSObjectStore,
    "abs": ABSObjectStore,
}
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def get_object_store_for_uri(uri: str) -> Optional[Type[ObjectStoreInterface]]:
    """
    Look up the registered object store class that recognises ``uri``.

    Args:
        uri: URI to classify.

    Returns:
        The first class in OBJECT_STORE_REGISTRY whose ``is_uri`` accepts the
        URI, or None when no registered class does.
    """
    return next(
        (
            candidate
            for candidate in OBJECT_STORE_REGISTRY.values()
            if candidate.is_uri(uri)
        ),
        None,
    )
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def get_object_store_bucket_name(uri: str) -> str:
    """
    Return the bucket name for a URI of any supported object store.

    The registry is consulted first. If no registered implementation claims
    the URI, the known URI formats (gs://, the S3 prefixes, abfss:// and the
    Azure Blob HTTPS form) are parsed directly as a fallback.

    Args:
        uri: URI to parse.

    Returns:
        Name of the bucket (container).

    Raises:
        ValueError: If the URI matches none of the supported formats.
    """
    # Preferred path: delegate to the registered implementation.
    store = get_object_store_for_uri(uri)
    if store:
        return store.get_bucket_name(uri)

    # Fallback path: parse the known formats by hand.
    if uri.startswith("gs://"):
        return uri[5:].split("/")[0]

    for s3_prefix in S3ObjectStore.PREFIXES:
        if uri.startswith(s3_prefix):
            return uri[len(s3_prefix) :].split("/")[0]

    if uri.startswith(ABSObjectStore.PREFIX):
        return uri[len(ABSObjectStore.PREFIX) :].split("@")[0]

    # HTTPS Azure Blob Storage URLs: the container is the first path segment.
    https_match = ABSObjectStore.HTTPS_REGEX.match(uri)
    if https_match:
        after_host = uri[len(https_match.group(1)) :]
        return after_host.split("/")[0]

    raise ValueError(f"Unsupported URI format: {uri}")
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def get_object_key(uri: str) -> str:
    """
    Return the object key for a URI of any supported object store.

    Args:
        uri: URI to parse.

    Returns:
        Object key as computed by the matching object store implementation.

    Raises:
        ValueError: If no registered object store recognises the URI.
    """
    store = get_object_store_for_uri(uri)
    if not store:
        raise ValueError(f"Unsupported URI format: {uri}")
    return store.get_object_key(uri)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
class ObjectStoreSourceAdapter:
    """
    Adapter for customizing object store source implementations.

    The adapter collects named callables ("customizations") for a platform
    ("s3", "gcs" or "abs") and installs them as attributes on a source
    instance via ``apply_customizations``, so the source class itself does
    not have to be modified.
    """

    @staticmethod
    def create_s3_path(bucket_name: str, key: str) -> str:
        """
        Build an ``s3://`` URI from a bucket and key.

        Args:
            bucket_name: The bucket name
            key: The object key

        Returns:
            The URL-unquoted ``s3://bucket/key`` string
        """
        return unquote(f"s3://{bucket_name}/{key}")

    @staticmethod
    def create_gcs_path(bucket_name: str, key: str) -> str:
        """
        Build a ``gs://`` URI from a bucket and key.

        Args:
            bucket_name: The bucket name
            key: The object key

        Returns:
            The URL-unquoted ``gs://bucket/key`` string
        """
        return unquote(f"gs://{bucket_name}/{key}")

    @staticmethod
    def create_abs_path(container_name: str, key: str, account_name: str) -> str:
        """
        Build an ``abfss://`` URI from a container, key and storage account.

        Args:
            container_name: The container name
            key: The object key
            account_name: The storage account name

        Returns:
            The URL-unquoted
            ``abfss://container@account.dfs.core.windows.net/key`` string
        """
        return unquote(
            f"abfss://{container_name}@{account_name}.dfs.core.windows.net/{key}"
        )

    @staticmethod
    def get_s3_external_url(
        table_data: Any, region: Optional[str] = None
    ) -> Optional[str]:
        """
        Get the AWS S3 console URL for the given table.

        Args:
            table_data: Object exposing a ``table_path`` attribute
            region: AWS region for the console URL; us-east-1 when falsy

        Returns:
            The AWS console URL, or None if ``table_path`` is not an S3 URI
        """
        if not S3ObjectStore.is_uri(table_data.table_path):
            return None

        # Split the S3 URI into bucket and key.
        bucket_name = get_object_store_bucket_name(table_data.table_path)
        key = get_object_key(table_data.table_path)

        # Use the provided region or default to us-east-1.
        aws_region = region or "us-east-1"

        return f"https://{aws_region}.console.aws.amazon.com/s3/buckets/{bucket_name}?prefix={key}"

    @staticmethod
    def get_gcs_external_url(table_data: Any) -> Optional[str]:
        """
        Get the GCS console URL for the given table.

        Args:
            table_data: Object exposing a ``table_path`` attribute

        Returns:
            The GCS console URL, or None if ``table_path`` is not a GCS URI
        """
        if not GCSObjectStore.is_uri(table_data.table_path):
            return None

        # Split the GCS URI into bucket and key.
        bucket_name = get_object_store_bucket_name(table_data.table_path)
        key = get_object_key(table_data.table_path)

        return f"https://console.cloud.google.com/storage/browser/{bucket_name}/{key}"

    @staticmethod
    def get_abs_external_url(table_data: Any) -> Optional[str]:
        """
        Get the Azure portal URL for the container holding the given table.

        Args:
            table_data: Object exposing a ``table_path`` attribute

        Returns:
            The Azure portal URL, or None if ``table_path`` is not an ABS URI
            or cannot be parsed
        """
        if not ABSObjectStore.is_uri(table_data.table_path):
            return None

        try:
            if table_data.table_path.startswith("abfss://"):
                # URI format: abfss://container@account.dfs.core.windows.net/path
                path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
                parts = path_without_prefix.split("@", 1)
                if len(parts) < 2:
                    return None

                container_name = parts[0]
                account_parts = parts[1].split("/", 1)
                account_domain = account_parts[0]
                # The account name is the first label of the host name.
                account_name = account_domain.split(".")[0]
            else:
                # HTTPS format: https://account.blob.core.windows.net/container/path
                container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
                if "blob.core.windows.net" in table_data.table_path:
                    account_name = table_data.table_path.split("//")[1].split(".")[0]
                else:
                    return None

            # Construct Azure portal URL
            return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"
        except Exception:
            # Deliberately best-effort: a URL that cannot be parsed simply
            # yields no external link instead of failing the caller.
            return None

    def __init__(
        self,
        platform: str,
        platform_name: str,
        aws_region: Optional[str] = None,
        azure_storage_account: Optional[str] = None,
    ):
        """
        Initialize the adapter and register the platform's default customizations.

        Args:
            platform: The platform identifier (e.g., "s3", "gcs", "abs")
            platform_name: The human-readable platform name
            aws_region: AWS region for S3 URLs, defaults to us-east-1 if not specified
            azure_storage_account: Azure Storage account name
        """
        self.platform = platform
        self.platform_name = platform_name
        self.aws_region = aws_region
        self.azure_storage_account = azure_storage_account
        # Maps the attribute name to install on the source to its callable.
        self.customizations: Dict[str, Callable[..., Any]] = {}

        # Register default customizations based on platform. Any other
        # platform value leaves the customization table empty.
        if platform == "gcs":
            self.register_customization("is_s3_platform", lambda: True)
            self.register_customization("create_s3_path", self.create_gcs_path)
            self.register_customization(
                "get_external_url",
                lambda table_data: self.get_gcs_external_url(table_data),
            )
            # Fix URI mismatch issue in pattern matching
            self.register_customization(
                "_normalize_uri_for_pattern_matching",
                self._normalize_gcs_uri_for_pattern_matching,
            )
            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
        elif platform == "s3":
            self.register_customization("is_s3_platform", lambda: True)
            self.register_customization("create_s3_path", self.create_s3_path)
            self.register_customization(
                "get_external_url",
                lambda table_data: self.get_s3_external_url(
                    table_data, self.aws_region
                ),
            )
        elif platform == "abs":
            self.register_customization("is_s3_platform", lambda: True)
            # If we have an Azure storage account, create a specialized path creation function
            if self.azure_storage_account:
                # Local non-optional copy captured by the lambda below.
                storage_account = self.azure_storage_account
                self.register_customization(
                    "create_s3_path",
                    lambda bucket, key: self.create_abs_path(
                        bucket, key, storage_account
                    ),
                )
            else:
                # Fall back to a simpler implementation if no account provided.
                # NOTE(review): this places the key directly after "@", where
                # create_abs_path puts the account host - confirm intended.
                self.register_customization(
                    "create_s3_path", lambda bucket, key: f"abfss://{bucket}@{key}"
                )
            self.register_customization("get_external_url", self.get_abs_external_url)

    def register_customization(
        self, method_name: str, implementation: Callable[..., Any]
    ) -> None:
        """
        Register (or replace) the customization stored under ``method_name``.

        Args:
            method_name: The attribute name to set on the source
            implementation: The callable to install under that name
        """
        self.customizations[method_name] = implementation

    def apply_customizations(self, source: Any) -> Any:
        """
        Apply all registered customizations to the source instance.

        Args:
            source: The source instance to customize

        Returns:
            The same source instance, after its attributes have been set
        """
        # Set the platform, but only when the source exposes the attribute.
        if hasattr(source, "source_config") and hasattr(
            source.source_config, "platform"
        ):
            source.source_config.platform = self.platform

        # Install each callable as an instance attribute. Instance attributes
        # are not bound on lookup, so the callable is invoked exactly as it
        # was registered. Bound methods stay bound to their original owner:
        # calling ``__get__`` on a bound method returns it unchanged, so no
        # rebinding step is needed. Skipping that step also avoids an
        # AttributeError for builtin callables, which carry ``__self__`` but
        # have no ``__get__``.
        for method_name, implementation in self.customizations.items():
            setattr(source, method_name, implementation)

        return source

    # Add a direct method for tests that may call this directly
    def get_external_url(self, table_data: Any) -> Optional[str]:
        """
        Get the external URL for a table based on the adapter's platform.

        Args:
            table_data: Object exposing a ``table_path`` attribute

        Returns:
            The platform-specific URL, or None when the platform is not one
            of "s3", "gcs" or "abs" or the path does not match the platform
        """
        if self.platform == "s3":
            return self.get_s3_external_url(table_data, self.aws_region)
        elif self.platform == "gcs":
            return self.get_gcs_external_url(table_data)
        elif self.platform == "abs":
            return self.get_abs_external_url(table_data)
        return None

    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
        """
        Rewrite a leading ``gs://`` to ``s3://`` for pattern matching.

        Args:
            uri: The URI to normalize

        Returns:
            The URI with its ``gs://`` prefix replaced by ``s3://``, or the
            URI unchanged when it does not start with ``gs://``
        """
        if uri.startswith("gs://"):
            return uri.replace("gs://", "s3://", 1)
        return uri

    def _strip_gcs_prefix(self, uri: str) -> str:
        """
        Remove a leading ``gs://`` from the URI.

        Args:
            uri: The URI to strip the prefix from

        Returns:
            The URI without the ``gs://`` prefix, or the URI unchanged when
            it does not start with ``gs://``
        """
        if uri.startswith("gs://"):
            return uri[5:]  # Remove "gs://" prefix
        return uri
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
# Factory function to create an adapter for a specific platform
|
|
705
|
+
def create_object_store_adapter(
    platform: str,
    aws_region: Optional[str] = None,
    azure_storage_account: Optional[str] = None,
) -> ObjectStoreSourceAdapter:
    """
    Build an ObjectStoreSourceAdapter for the given platform.

    The human-readable platform name is looked up from the platform
    identifier; unrecognised identifiers get an "Unknown (...)" label.

    Args:
        platform: The platform identifier (e.g., "s3", "gcs", "abs")
        aws_region: AWS region for S3 URLs, defaults to us-east-1 if not specified
        azure_storage_account: Azure Storage account name

    Returns:
        An adapter configured for the specified platform
    """
    display_name = {
        "s3": "Amazon S3",
        "gcs": "Google Cloud Storage",
        "abs": "Azure Blob Storage",
    }.get(platform)
    if display_name is None:
        display_name = f"Unknown ({platform})"

    adapter = ObjectStoreSourceAdapter(
        platform=platform,
        platform_name=display_name,
        aws_region=aws_region,
        azure_storage_account=azure_storage_account,
    )
    return adapter
|