acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py
@@ -3,15 +3,14 @@ import functools
 import logging
 import os
 import pathlib
+import posixpath
 import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import
-from urllib.parse import urlparse
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
-from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
@@ -35,14 +34,22 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders_path,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
     strip_s3_prefix,
 )
-from datahub.ingestion.source.
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+from datahub.ingestion.source.data_lake_common.data_lake_utils import (
+    ContainerWUCreator,
+    add_partition_columns_to_schema,
+)
 from datahub.ingestion.source.data_lake_common.object_store import (
     create_object_store_adapter,
 )
@@ -59,9 +66,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    SchemaField,
     SchemaMetadata,
-    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -71,22 +76,15 @@ from datahub.metadata.schema_classes import (
     OtherSchemaClass,
     PartitionsSummaryClass,
     PartitionSummaryClass,
-    SchemaFieldDataTypeClass,
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
-from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer
 
-if TYPE_CHECKING:
-    from mypy_boto3_s3.service_resource import Bucket
-
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
@@ -112,14 +110,7 @@ profiling_flags_to_report = [
     "include_field_sample_values",
 ]
 
-
-# LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
-#     path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
-# )
-#
-# LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
-#     ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
-# )
+URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")
 
 
 def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
@@ -162,6 +153,15 @@ class Folder:
         )
 
 
+@dataclasses.dataclass
+class FolderInfo:
+    objects: List[Any]
+    total_size: int
+    min_time: datetime
+    max_time: datetime
+    latest_obj: Any
+
+
 @dataclasses.dataclass
 class BrowsePath:
     file: str
@@ -188,8 +188,15 @@ class TableData:
 
 @platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
-@support_status(SupportStatus.
-@capability(
+@support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.S3_BUCKET,
+    ],
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(
     SourceCapability.SCHEMA_METADATA, "Can infer schema from supported file types"
@@ -369,7 +376,10 @@ class S3Source(StatefulIngestionSourceBase):
 
     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"
 
         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})
 
@@ -426,9 +436,8 @@ class S3Source(StatefulIngestionSourceBase):
                 self.source_config.verify_ssl
             )
 
-
-
-            )
+            path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+            file = smart_open(path, "rb", transport_params={"client": s3_client})
         else:
             # We still use smart_open here to take advantage of the compression
             # capabilities of smart_open.
@@ -467,7 +476,7 @@ class S3Source(StatefulIngestionSourceBase):
         fields = sorted(fields, key=lambda f: f.fieldPath)
 
         if self.source_config.add_partition_columns_to_schema and table_data.partitions:
-
+            add_partition_columns_to_schema(
                 fields=fields, path_spec=path_spec, full_path=table_data.full_path
             )
 
@@ -503,34 +512,6 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return None
 
-    def add_partition_columns_to_schema(
-        self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
-    ) -> None:
-        is_fieldpath_v2 = False
-        for field in fields:
-            if field.fieldPath.startswith("[version=2.0]"):
-                is_fieldpath_v2 = True
-                break
-        partition_keys = path_spec.get_partition_from_path(full_path)
-        if not partition_keys:
-            return None
-
-        for partition_key in partition_keys:
-            fields.append(
-                SchemaField(
-                    fieldPath=(
-                        f"{partition_key[0]}"
-                        if not is_fieldpath_v2
-                        else f"[version=2.0].[type=string].{partition_key[0]}"
-                    ),
-                    nativeDataType="string",
-                    type=SchemaFieldDataTypeClass(StringTypeClass()),
-                    isPartitioningKey=True,
-                    nullable=True,
-                    recursive=False,
-                )
-            )
-
     def get_table_profile(
         self, table_data: TableData, dataset_urn: str
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,11 +655,9 @@ class S3Source(StatefulIngestionSourceBase):
         aspects: List[Optional[_Aspect]] = []
 
         logger.info(f"Extracting table schema from file: {table_data.full_path}")
-
-
-
-            else table_data.table_path.strip("/")
-        )
+
+        # remove protocol and any leading or trailing slashes
+        browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")
 
         data_platform_urn = make_data_platform_urn(self.source_config.platform)
         logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -812,10 +791,20 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return relative_path
 
-    def
-
-
-
+    def extract_table_name_and_path(
+        self, path_spec: PathSpec, path: str
+    ) -> Tuple[str, str]:
+        # Extract the table name and base path from a path that's been normalized back to the
+        # "s3://" scheme that matches the path_spec
+        table_name, table_path = path_spec.extract_table_name_and_path(
+            self._normalize_uri_for_pattern_matching(path)
+        )
+        # Then convert the table base path back to the original scheme
+        scheme = re.match(URI_SCHEME_REGEX, path)
+        if scheme:
+            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+        return table_name, table_path
 
     def extract_table_data(
         self,
@@ -825,7 +814,7 @@ class S3Source(StatefulIngestionSourceBase):
         path = browse_path.file
         partitions = browse_path.partitions
         logger.debug(f"Getting table data for path: {path}")
-        table_name, table_path =
+        table_name, table_path = self.extract_table_name_and_path(path_spec, path)
         return TableData(
             display_name=table_name,
             is_s3=self.is_s3_platform(),
@@ -849,72 +838,91 @@ class S3Source(StatefulIngestionSourceBase):
             content_type=browse_path.content_type,
         )
 
-    def resolve_templated_folders(self,
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return
 
-
-
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
+            # Ensure proper path joining - folders from list_folders path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
+            remaining_pattern = folder_split[1]
+            if remaining_pattern.startswith("/"):
+                remaining_pattern = remaining_pattern[1:]
+
             yield from self.resolve_templated_folders(
-
+                f"{folder.path}/{remaining_pattern}"
             )
 
     def get_dir_to_process(
         self,
-
-        folder: str,
+        uri: str,
         path_spec: PathSpec,
-        protocol: str,
         min: bool = False,
     ) -> List[str]:
-        #
-        #
-
-
-
-
+        # Add any remaining parts of the path_spec before globs, excluding the
+        # final filename component, to the URI and prefix so that we don't
+        # unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+            "*"
+        )[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Check if we're at the end of the include path. If so, no need to list sub-folders.
+        if path_spec.has_correct_number_of_directory_components(uri):
+            return [uri]
+
+        logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+        iterator = list_folders_path(
+            s3_uri=uri,
+            startswith=prefix,
             aws_config=self.source_config.aws_config,
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        folders
-
-
-
-
-        else:
-            return [f"{protocol}{bucket_name}/{folder}"]
-        return [f"{protocol}{bucket_name}/{folder}"]
+        sorted_dirs = sorted(
+            iterator,
+            key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                dir.name
+            ),
+            reverse=not min,
+        )
+        folders = []
+        for dir in sorted_dirs:
+            if path_spec.dir_allowed(dir.path):
+                folders_list = self.get_dir_to_process(
+                    uri=dir.path,
+                    path_spec=path_spec,
+                    min=min,
+                )
+                folders.extend(folders_list)
+                if path_spec.traversal_method != FolderTraversalMethod.ALL:
+                    return folders
+        if folders:
+            return folders
+        else:
+            return [uri]
 
     def get_folder_info(
         self,
         path_spec: PathSpec,
-
-        prefix: str,
+        uri: str,
     ) -> Iterable[Folder]:
         """
-        Retrieves all the folders in a path by listing all the files
-
+        Retrieves all the folders in a path by recursively listing all the files under the
+        given URI.
 
         A folder has creation and modification times, size, and a sample file path.
         - Creation time is the earliest creation time of all files in the folder.
@@ -924,72 +932,174 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
             path_spec (PathSpec): The path specification used to determine partitioning.
-
-            prefix (str): The prefix path in the S3 bucket to list objects from.
+            uri (str): The path in the S3 bucket to list objects from.
 
         Returns:
             List[Folder]: A list of Folder objects representing the partitions found.
         """
 
         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
            if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
             return allowed
 
-
-
-
-
-
+        # Add any remaining parts of the path_spec before globs to the URI and prefix,
+        # so that we don't unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Process objects in a memory-efficient streaming fashion
+        # Instead of loading all objects into memory, we'll accumulate folder data incrementally
+        folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
+
+        logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+        for obj in list_objects_recursive_path(
+            uri, startswith=prefix, aws_config=self.source_config.aws_config
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            if not _is_allowed_path(path_spec, s3_path):
+                continue
+
+            # Extract the directory name (folder) from the object key
+            dirname = obj.key.rsplit("/", 1)[0]
+
+            # Initialize folder data if we haven't seen this directory before
+            if dirname not in folder_data:
+                folder_data[dirname] = FolderInfo(
+                    objects=[],
+                    total_size=0,
+                    min_time=obj.last_modified,
+                    max_time=obj.last_modified,
+                    latest_obj=obj,
+                )
+
+            # Update folder statistics incrementally
+            folder_info = folder_data[dirname]
+            folder_info.objects.append(obj)
+            folder_info.total_size += obj.size
+
+            # Track min/max times and latest object
+            if obj.last_modified < folder_info.min_time:
+                folder_info.min_time = obj.last_modified
+            if obj.last_modified > folder_info.max_time:
+                folder_info.max_time = obj.last_modified
+                folder_info.latest_obj = obj
+
+        # Yield folders after processing all objects
+        for _dirname, folder_info in folder_data.items():
+            latest_obj = folder_info.latest_obj
+            max_file_s3_path = self.create_s3_path(
+                latest_obj.bucket_name, latest_obj.key
             )
-        )
-        grouped_s3_objects_by_dirname = groupby_unsorted(
-            s3_objects,
-            key=lambda obj: obj.key.rsplit("/", 1)[0],
-        )
-        for _, group in grouped_s3_objects_by_dirname:
-            max_file = max(group, key=lambda x: x.last_modified)
-            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
 
             # If partition_id is None, it means the folder is not a partition
-            partition_id = path_spec.get_partition_from_path(
+            partition_id = path_spec.get_partition_from_path(
+                self._normalize_uri_for_pattern_matching(max_file_s3_path)
+            )
 
             yield Folder(
                 partition_id=partition_id,
                 is_partition=bool(partition_id),
-                creation_time=
-                modification_time=
+                creation_time=folder_info.min_time,
+                modification_time=folder_info.max_time,
                 sample_file=max_file_s3_path,
-                size=
+                size=folder_info.total_size,
             )
 
+    def create_s3_path(self, bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+        """
+        Main entry point for browsing S3 objects and creating table-level datasets.
+
+        This method determines whether to use templated processing (for paths with {table})
+        or simple file-by-file processing (for paths without templates).
+
+        Args:
+            path_spec: Configuration specifying the S3 path pattern to scan
+            sample_size: Number of files to sample (used in simple processing)
+
+        Returns:
+            Iterator of BrowsePath objects representing datasets to be created
+
+        Examples:
+            - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+            - Simple: s3://bucket/data/*.csv -> Processes individual files
+        """
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+
+        logger.info(f"Processing path spec: {path_spec.include}")
+
+        # Check if we have {table} template in the path
+        has_table_template = "{table}" in path_spec.include
+
+        logger.info(f"Has table template: {has_table_template}")
+
+        if has_table_template:
+            logger.info("Using templated path processing")
+            # Always use templated processing when {table} is present
+            # This groups files under table-level datasets
+            yield from self._process_templated_path(path_spec)
+        else:
+            logger.info("Using simple path processing")
+            # Only use simple processing for non-templated paths
+            # This creates individual file-level datasets
+            yield from self._process_simple_path(path_spec)
+
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process S3 paths containing {table} templates to create table-level datasets.
+
+        This method handles complex path patterns with wildcards and templates by:
+        1. Replacing template placeholders with stars (except {table})
+        2. Resolving wildcards in the path up to the {table} marker
+        3. Finding all potential table folders under each resolved path
+        4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+        5. Aggregating files from selected partitions under each table
+        6. Creating one dataset per table (not per file)
+
+        Args:
+            path_spec: Path specification with {table} template
+
+        Yields:
+            BrowsePath: One per table (not per file), containing aggregated metadata
+        """
+
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
         s3 = self.source_config.aws_config.get_s3_resource(
             self.source_config.verify_ssl
         )
-
-
-
-
-
+
+        # Find the part before {table}
+        table_marker = "{table}"
+        if table_marker not in path_spec.include:
+            logger.info("No {table} marker found in path")
+            return
+
+        # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+        # This is the crucial missing logic from the original implementation
        matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
        matches_list = list(matches)
-
-
-        #
-        # For example:
-        # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
-        # We only keep the last template as a marker to know the point util we need to resolve path.
-        # After the marker we can safely get sample files for sampling because it is not used in the
-        # table name, so we don't need all the files.
-        # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+        if matches_list:
+            # Replace all templates with stars except keep {table} as the marker
            max_start: int = -1
            include: str = path_spec.include
            max_match: str = ""
+
            for match in matches_list:
                pos = include.find(match.group())
                if pos > max_start:
@@ -1001,120 +1111,198 @@ class S3Source(StatefulIngestionSourceBase):
                if max_match == "{table}":
                    break
 
-
-
-
-
-
-
-
-
-
-
-
+            logger.info(f"Template replacement: {path_spec.include} -> {include}")
+        else:
+            include = path_spec.include
+
+        # Split the path at {table} to get the prefix that needs wildcard resolution
+        prefix_before_table = include.split(table_marker)[0]
+        logger.info(f"Prefix before table: {prefix_before_table}")
+
+        try:
+            # STEP 2: Resolve ALL wildcards in the path up to {table}
+            # This converts patterns like "s3://data/*/logs/" to actual paths like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
+            resolved_prefixes = list(
+                self.resolve_templated_folders(prefix_before_table)
+            )
+            logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+            # STEP 3: Process each resolved prefix to find table folders
+            for resolved_prefix in resolved_prefixes:
+                logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+                # Get all folders that could be tables under this resolved prefix
+                # These are the actual table names (e.g., "users", "events", "logs")
+                table_folders = list(
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
+                    )
+                )
+                logger.debug(
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
+                )
+
+                # STEP 4: Process each table folder to create a table-level dataset
+                for folder in table_folders:
+                    logger.info(f"Processing table path: {folder.path}")
+
+                    # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
+                    # This uses the compiled regex pattern to extract the table name from the full path
+                    table_name, _ = self.extract_table_name_and_path(
+                        path_spec, folder.path
+                    )
+
+                    # Apply table name filtering if configured
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(f"Table '{table_name}' not allowed and skipping")
+                        continue
+
+                    # STEP 5: Handle partition traversal based on configuration
+                    dirs_to_process = []
+
+                    if path_spec.traversal_method == FolderTraversalMethod.ALL:
+                        # Process ALL partitions (original behavior)
+                        dirs_to_process = [folder.path]
+                        logger.debug(
+                            f"Processing ALL partition folders under: {folder.path}"
                        )
-
-
-
+
+                    else:
+                        # Use the original get_dir_to_process logic for MIN/MAX
+                        if (
+                            path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                            or path_spec.traversal_method == FolderTraversalMethod.MAX
+                        ):
+                            # Get MAX partition using original logic
+                            dirs_to_process_max = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=False,
                            )
-
-
-
-
-                    logger.info(f"Processing folder: {f}")
-                    if path_spec.traversal_method == FolderTraversalMethod.ALL:
-                        dirs_to_process.append(f)
-                    else:
-                        if (
-                            path_spec.traversal_method
-                            == FolderTraversalMethod.MIN_MAX
-                            or path_spec.traversal_method
-                            == FolderTraversalMethod.MAX
-                        ):
-                            protocol = ContainerWUCreator.get_protocol(
-                                path_spec.include
-                            )
-                            dirs_to_process_max = self.get_dir_to_process(
-                                bucket_name=bucket_name,
-                                folder=f + "/",
-                                path_spec=path_spec,
-                                protocol=protocol,
+                            if dirs_to_process_max:
+                                dirs_to_process.extend(dirs_to_process_max)
+                                logger.debug(
+                                    f"Added MAX partition: {dirs_to_process_max}"
                                )
-
-
-
-
-
-
-
-                                bucket_name=bucket_name,
-                                folder=f + "/",
-                                path_spec=path_spec,
-                                protocol=protocol,
-                                min=True,
-                            )
-                            dirs_to_process.append(dirs_to_process_min[0])
-                    folders: List[Folder] = []
-                    for dir in dirs_to_process:
-                        logger.info(f"Getting files from folder: {dir}")
-                        prefix_to_process = urlparse(dir).path.lstrip("/")
-
-                        folders.extend(
-                            self.get_folder_info(
-                                path_spec, bucket, prefix_to_process
-                            )
-                        )
-                    max_folder = None
-                    if folders:
-                        max_folder = max(folders, key=lambda x: x.modification_time)
-                    if not max_folder:
-                        logger.warning(
-                            f"Unable to find any files in the folder {dir}. Skipping..."
+
+                        if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
+                            # Get MIN partition using original logic
+                            dirs_to_process_min = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=True,
                            )
-
+                            if dirs_to_process_min:
+                                dirs_to_process.extend(dirs_to_process_min)
+                                logger.debug(
+                                    f"Added MIN partition: {dirs_to_process_min}"
+                                )
 
-
-
-
-
-
-                    # TODO: Support content type inference for partitions
+                    # Process the selected partitions
+                    all_folders = []
+                    for partition_path in dirs_to_process:
+                        logger.info(f"Scanning files in partition: {partition_path}")
+                        partition_files = list(
+                            self.get_folder_info(path_spec, partition_path)
                        )
-
-
-
-
-
-
-
-
+                        all_folders.extend(partition_files)
+
+                    if all_folders:
+                        # Use the most recent file across all processed partitions
+                        latest_file = max(
+                            all_folders, key=lambda x: x.modification_time
+                        )
+
+                        # Get partition information
+                        partitions = [f for f in all_folders if f.is_partition]
+
+                        # Calculate total size of processed partitions
+                        total_size = sum(f.size for f in all_folders)
+
+                        # Create ONE BrowsePath per table
+                        # The key insight: we need to provide the sample file for schema inference
+                        # but the table path should be extracted correctly by extract_table_name_and_path
+                        yield BrowsePath(
+                            file=latest_file.sample_file,  # Sample file for schema inference
+                            timestamp=latest_file.modification_time,  # Latest timestamp
+                            size=total_size,  # Size of processed partitions
+                            partitions=partitions,  # Partition metadata
                        )
                    else:
-
-
-
-
-
-
-
-
-
-
-                        content_type = None
-                        if self.source_config.use_s3_content_type:
-                            content_type = s3.Object(obj.bucket_name, obj.key).content_type
-
-                        yield BrowsePath(
-                            file=s3_path,
-                            timestamp=obj.last_modified,
-                            size=obj.size,
-                            partitions=[],
-                            content_type=content_type,
+                        logger.warning(
+                            f"No files found in processed partitions for table {table_name}"
+                        )
+
+        except Exception as e:
+            if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
+                self.get_report().report_warning(
+                    "Missing bucket",
+                    f"No bucket found {e.response['Error'].get('BucketName')}",
                )
+                return
+            logger.error(f"Error in _process_templated_path: {e}")
+            raise e
 
-    def
-
+    def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process simple S3 paths without {table} templates to create file-level datasets.
+
+        This method handles straightforward file patterns by:
+        1. Listing all files matching the pattern
+        2. Creating one dataset per file
+        3. No aggregation or grouping is performed
+
+        Use Cases:
+        - Individual file processing: s3://bucket/data/*.csv
+        - Direct file paths: s3://bucket/data/myfile.json
+        - Patterns without table grouping: s3://bucket/logs/*.log
+
+        Args:
+            path_spec: Path specification without {table} template
+
+        Yields:
+            BrowsePath: One per file, containing individual file metadata
+
+        Example Output:
+        - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
+        - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
+        """
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
+        path_spec.sample_files = False  # Disable sampling for simple paths
+
+        # Extract the prefix from the path spec (stops at first wildcard)
+        prefix = self.get_prefix(path_spec.include)
+
+        basename_startswith = prefix.split("/")[-1]
+        dirname = prefix.removesuffix(basename_startswith)
+
+        # Iterate through all objects in the bucket matching the prefix
+        for obj in list_objects_recursive_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            # Get content type if configured
+            content_type = None
+            if self.source_config.use_s3_content_type:
+                content_type = s3.Object(obj.bucket_name, obj.key).content_type
+
+            # Create one BrowsePath per file
+            yield BrowsePath(
+                file=s3_path,
+                timestamp=obj.last_modified,
+                size=obj.size,
+                partitions=[],  # No partitions in simple mode
+                content_type=content_type,
+            )
 
     def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         prefix = self.get_prefix(path_spec.include)
@@ -1158,8 +1346,13 @@ class S3Source(StatefulIngestionSourceBase):
         )
         table_dict: Dict[str, TableData] = {}
         for browse_path in file_browser:
+            # Normalize URI for pattern matching
+            normalized_file_path = self._normalize_uri_for_pattern_matching(
+                browse_path.file
+            )
+
             if not path_spec.allowed(
-
+                normalized_file_path,
                 ignore_ext=self.is_s3_platform()
                 and self.source_config.use_s3_content_type,
             ):
@@ -1235,5 +1428,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"
 
+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
     def get_report(self):
         return self.report
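The refactored get_folder_info above replaces the old groupby_unsorted pass with a single streaming pass that accumulates per-directory statistics in the new FolderInfo dataclass. Below is a minimal, self-contained sketch of that aggregation pattern; the ObjectSummary records, sample keys, and the aggregate_by_folder helper are hypothetical stand-ins and not part of the package (only the fields the aggregation touches are modeled: key, size, last_modified).

# Sketch only: mirrors the per-folder accumulation logic shown in the diff above,
# using plain in-memory records instead of boto3 object summaries.
import dataclasses
from datetime import datetime
from typing import Any, Dict, Iterable, List, NamedTuple


class ObjectSummary(NamedTuple):  # hypothetical stand-in for a boto3 ObjectSummary
    key: str
    size: int
    last_modified: datetime


@dataclasses.dataclass
class FolderInfo:
    objects: List[Any]
    total_size: int
    min_time: datetime
    max_time: datetime
    latest_obj: Any


def aggregate_by_folder(objects: Iterable[ObjectSummary]) -> Dict[str, FolderInfo]:
    """Accumulate folder-level stats in one pass instead of grouping all objects up front."""
    folder_data: Dict[str, FolderInfo] = {}
    for obj in objects:
        dirname = obj.key.rsplit("/", 1)[0]
        info = folder_data.get(dirname)
        if info is None:
            # First object seen for this directory: seed the aggregate with its values.
            info = folder_data[dirname] = FolderInfo(
                objects=[],
                total_size=0,
                min_time=obj.last_modified,
                max_time=obj.last_modified,
                latest_obj=obj,
            )
        info.objects.append(obj)
        info.total_size += obj.size
        # Track earliest/latest modification times and remember the most recent object.
        if obj.last_modified < info.min_time:
            info.min_time = obj.last_modified
        if obj.last_modified > info.max_time:
            info.max_time = obj.last_modified
            info.latest_obj = obj
    return folder_data


if __name__ == "__main__":
    objs = [
        ObjectSummary("data/2024/part-0.csv", 100, datetime(2024, 1, 1)),
        ObjectSummary("data/2024/part-1.csv", 250, datetime(2024, 2, 1)),
        ObjectSummary("data/2023/part-0.csv", 300, datetime(2023, 6, 1)),
    ]
    for dirname, info in aggregate_by_folder(objs).items():
        print(dirname, info.total_size, info.min_time, info.max_time, info.latest_obj.key)

Tracking latest_obj alongside max_time is what lets the source pick a sample file for schema inference without a second pass over the listing.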