acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries and is provided for informational purposes only.
Note: this release of acryl-datahub is flagged as potentially problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
Rendered diff for datahub/ingestion/source/s3/source.py (the +515 -244 entry listed above):

@@ -3,15 +3,14 @@ import functools
 import logging
 import os
 import pathlib
+import posixpath
 import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import …
-from urllib.parse import urlparse
+from typing import Any, Dict, Iterable, List, Optional, Tuple

 import smart_open.compression as so_compression
-from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
@@ -35,14 +34,25 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import …
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders_path,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
     strip_s3_prefix,
 )
-from datahub.ingestion.source. …
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+from datahub.ingestion.source.data_lake_common.data_lake_utils import (
+    ContainerWUCreator,
+    add_partition_columns_to_schema,
+)
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -56,9 +66,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    SchemaField,
     SchemaMetadata,
-    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -68,22 +76,15 @@ from datahub.metadata.schema_classes import (
     OtherSchemaClass,
     PartitionsSummaryClass,
     PartitionSummaryClass,
-    SchemaFieldDataTypeClass,
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
-from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer

-if TYPE_CHECKING:
-    from mypy_boto3_s3.service_resource import Bucket
-
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)

-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])

@@ -109,14 +110,7 @@ profiling_flags_to_report = [
     "include_field_sample_values",
 ]

-
-# LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
-#     path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
-# )
-#
-# LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
-#     ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
-# )
+URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")


 def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
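The new `URI_SCHEME_REGEX` constant is reused by later hunks in this diff to normalize object-store URIs: any lowercase scheme can be rewritten to `s3://` before a file is handed to smart_open, or stripped entirely when deriving a browse path. A standalone sketch of those two substitutions (the example URI is made up):

```python
import re

# Same pattern as the constant added in this release.
URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

uri = "gs://my-bucket/events/2024/part-0.parquet"

# Rewrite whatever scheme is present to s3:// (done before opening the file).
s3_style = re.sub(URI_SCHEME_REGEX, "s3://", uri)
# -> "s3://my-bucket/events/2024/part-0.parquet"

# Strip the scheme and surrounding slashes to build a browse path / dataset name.
browse_path = re.sub(URI_SCHEME_REGEX, "", uri).strip("/")
# -> "my-bucket/events/2024/part-0.parquet"

print(s3_style, browse_path)
```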
@@ -159,6 +153,15 @@ class Folder:
     )


+@dataclasses.dataclass
+class FolderInfo:
+    objects: List[Any]
+    total_size: int
+    min_time: datetime
+    max_time: datetime
+    latest_obj: Any
+
+
 @dataclasses.dataclass
 class BrowsePath:
     file: str
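`FolderInfo` is the accumulator that the reworked `get_folder_info` (further down in this diff) fills in a streaming fashion: objects are grouped by their parent prefix while total size, earliest/latest timestamps, and the most recently modified object are tracked incrementally, instead of materializing the whole listing in memory. A simplified, self-contained sketch of that aggregation pattern; `S3Obj` here is just a stand-in for the object summaries the real code iterates over:

```python
import dataclasses
from datetime import datetime
from typing import Any, Dict, List


@dataclasses.dataclass
class FolderInfo:
    objects: List[Any]
    total_size: int
    min_time: datetime
    max_time: datetime
    latest_obj: Any


@dataclasses.dataclass
class S3Obj:  # stand-in for a boto3 object summary
    key: str
    size: int
    last_modified: datetime


def aggregate_by_folder(objs) -> Dict[str, FolderInfo]:
    folders: Dict[str, FolderInfo] = {}
    for obj in objs:
        dirname = obj.key.rsplit("/", 1)[0]
        info = folders.setdefault(
            dirname,
            FolderInfo([], 0, obj.last_modified, obj.last_modified, obj),
        )
        info.objects.append(obj)
        info.total_size += obj.size
        if obj.last_modified < info.min_time:
            info.min_time = obj.last_modified
        if obj.last_modified > info.max_time:
            info.max_time = obj.last_modified
            info.latest_obj = obj
    return folders
```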
@@ -185,8 +188,15 @@ class TableData:

 @platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
-@support_status(SupportStatus. …
-@capability( …
+@support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.S3_BUCKET,
+    ],
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(
     SourceCapability.SCHEMA_METADATA, "Can infer schema from supported file types"
@@ -197,12 +207,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any

     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
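The constructor block above boils down to a three-way adapter choice. A condensed sketch of that decision, using only the calls visible in the hunk; the helper name `pick_adapter` is ours, and the sketch omits the localstack/test-endpoint region fallback shown above:

```python
from typing import Optional

from datahub.ingestion.source.data_lake_common.object_store import (
    create_object_store_adapter,
)


def pick_adapter(
    platform: str,
    is_s3: bool,
    endpoint_url: Optional[str],
    aws_region: Optional[str],
):
    # Local files: adapter keyed off the configured platform.
    if not is_s3:
        return create_object_store_adapter(platform or "file")
    # S3-compatible endpoint that is really GCS: use the GCS adapter for external
    # URLs, but keep emitting s3:// paths so the rest of the source is unchanged.
    if endpoint_url and "storage.googleapis.com" in endpoint_url.lower():
        adapter = create_object_store_adapter("gcs")
        adapter.register_customization(
            "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
        )
        return adapter
    # Plain S3: region-aware adapter.
    return create_object_store_adapter("s3", aws_region=aws_region)
```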
@@ -319,7 +376,10 @@ class S3Source(StatefulIngestionSourceBase):

     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-        … [removed line not shown by the renderer]
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"

         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})

@@ -376,9 +436,8 @@ class S3Source(StatefulIngestionSourceBase):
             self.source_config.verify_ssl
         )

-            … [two removed lines not shown by the renderer]
-            )
+            path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+            file = smart_open(path, "rb", transport_params={"client": s3_client})
         else:
             # We still use smart_open here to take advantage of the compression
             # capabilities of smart_open.
@@ -417,7 +476,7 @@ class S3Source(StatefulIngestionSourceBase):
         fields = sorted(fields, key=lambda f: f.fieldPath)

         if self.source_config.add_partition_columns_to_schema and table_data.partitions:
-            … [removed line not shown by the renderer]
+            add_partition_columns_to_schema(
                 fields=fields, path_spec=path_spec, full_path=table_data.full_path
             )
@@ -453,34 +512,6 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return None

-    def add_partition_columns_to_schema(
-        self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
-    ) -> None:
-        is_fieldpath_v2 = False
-        for field in fields:
-            if field.fieldPath.startswith("[version=2.0]"):
-                is_fieldpath_v2 = True
-                break
-        partition_keys = path_spec.get_partition_from_path(full_path)
-        if not partition_keys:
-            return None
-
-        for partition_key in partition_keys:
-            fields.append(
-                SchemaField(
-                    fieldPath=(
-                        f"{partition_key[0]}"
-                        if not is_fieldpath_v2
-                        else f"[version=2.0].[type=string].{partition_key[0]}"
-                    ),
-                    nativeDataType="string",
-                    type=SchemaFieldDataTypeClass(StringTypeClass()),
-                    isPartitioningKey=True,
-                    nullable=True,
-                    recursive=False,
-                )
-            )
-
     def get_table_profile(
         self, table_data: TableData, dataset_urn: str
     ) -> Iterable[MetadataWorkUnit]:
@@ -605,17 +636,28 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )

+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
         aspects: List[Optional[_Aspect]] = []

         logger.info(f"Extracting table schema from file: {table_data.full_path}")
-        … [three removed lines not shown by the renderer]
-            else table_data.table_path.strip("/")
-        )
+
+        # remove protocol and any leading or trailing slashes
+        browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")

         data_platform_urn = make_data_platform_urn(self.source_config.platform)
         logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -674,6 +716,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -748,10 +791,20 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return relative_path

-    def … [signature truncated; three more removed lines not shown by the renderer]
+    def extract_table_name_and_path(
+        self, path_spec: PathSpec, path: str
+    ) -> Tuple[str, str]:
+        # Extract the table name and base path from a path that's been normalized back to the
+        # "s3://" scheme that matches the path_spec
+        table_name, table_path = path_spec.extract_table_name_and_path(
+            self._normalize_uri_for_pattern_matching(path)
+        )
+        # Then convert the table base path back to the original scheme
+        scheme = re.match(URI_SCHEME_REGEX, path)
+        if scheme:
+            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+        return table_name, table_path

     def extract_table_data(
         self,
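The scheme round-trip in `extract_table_name_and_path` is easier to see with concrete values: pattern matching happens against an s3://-normalized URI, and the original scheme is put back onto the resulting table path afterwards. A standalone illustration of the restore step (the path values are made up):

```python
import re

URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

original = "gs://analytics/raw/events/2024/part-0.parquet"
table_path = "s3://analytics/raw/events"  # hypothetical result of matching the normalized URI

scheme = re.match(URI_SCHEME_REGEX, original)
if scheme:
    # scheme[0] is the matched prefix, e.g. "gs://"
    table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)

print(table_path)  # gs://analytics/raw/events
```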
@@ -761,7 +814,7 @@ class S3Source(StatefulIngestionSourceBase):
         path = browse_path.file
         partitions = browse_path.partitions
         logger.debug(f"Getting table data for path: {path}")
-        table_name, table_path = …
+        table_name, table_path = self.extract_table_name_and_path(path_spec, path)
         return TableData(
             display_name=table_name,
             is_s3=self.is_s3_platform(),
@@ -785,72 +838,91 @@ class S3Source(StatefulIngestionSourceBase):
             content_type=browse_path.content_type,
         )

-    def resolve_templated_folders(self, …
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return

-        … [two removed lines not shown by the renderer]
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
+            # Ensure proper path joining - folders from list_folders path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
+            remaining_pattern = folder_split[1]
+            if remaining_pattern.startswith("/"):
+                remaining_pattern = remaining_pattern[1:]
+
             yield from self.resolve_templated_folders(
-                …
+                f"{folder.path}/{remaining_pattern}"
             )

     def get_dir_to_process(
         self,
-        … [removed line not shown by the renderer]
-        folder: str,
+        uri: str,
         path_spec: PathSpec,
-        protocol: str,
         min: bool = False,
     ) -> List[str]:
-        # … [old comment and body lines truncated by the renderer]
+        # Add any remaining parts of the path_spec before globs, excluding the
+        # final filename component, to the URI and prefix so that we don't
+        # unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+            "*"
+        )[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Check if we're at the end of the include path. If so, no need to list sub-folders.
+        if path_spec.has_correct_number_of_directory_components(uri):
+            return [uri]
+
+        logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+        iterator = list_folders_path(
+            s3_uri=uri,
+            startswith=prefix,
             aws_config=self.source_config.aws_config,
         )
-        … [removed lines mostly not shown by the renderer; they ended with:]
-        else:
-            return [f"{protocol}{bucket_name}/{folder}"]
-        return [f"{protocol}{bucket_name}/{folder}"]
+        sorted_dirs = sorted(
+            iterator,
+            key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                dir.name
+            ),
+            reverse=not min,
+        )
+        folders = []
+        for dir in sorted_dirs:
+            if path_spec.dir_allowed(dir.path):
+                folders_list = self.get_dir_to_process(
+                    uri=dir.path,
+                    path_spec=path_spec,
+                    min=min,
+                )
+                folders.extend(folders_list)
+                if path_spec.traversal_method != FolderTraversalMethod.ALL:
+                    return folders
+        if folders:
+            return folders
+        else:
+            return [uri]

     def get_folder_info(
         self,
         path_spec: PathSpec,
-        … [removed line not shown by the renderer]
-        prefix: str,
+        uri: str,
     ) -> Iterable[Folder]:
         """
-        Retrieves all the folders in a path by listing all the files
-        … [removed line not shown by the renderer]
+        Retrieves all the folders in a path by recursively listing all the files under the
+        given URI.

         A folder has creation and modification times, size, and a sample file path.
         - Creation time is the earliest creation time of all files in the folder.
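`resolve_templated_folders` expands one `*` at a time and recurses on each folder returned by `list_folders_path`, so a prefix like `s3://bucket/*/logs/*/` fans out into every concrete folder combination. A simplified local sketch of the same recursion, with an in-memory `list_folders` standing in for the S3 listing helper:

```python
from typing import Dict, Iterable, List

# Toy folder tree standing in for list_folders_path() against S3.
TREE: Dict[str, List[str]] = {
    "s3://bucket/": ["2023", "2024"],
    "s3://bucket/2023/logs/": ["app", "web"],
    "s3://bucket/2024/logs/": ["app"],
}


def list_folders(prefix: str) -> List[str]:
    return [f"{prefix.rstrip('/')}/{name}" for name in TREE.get(prefix, [])]


def resolve_templated_folders(prefix: str) -> Iterable[str]:
    head, star, rest = prefix.partition("*")
    if not star:  # no wildcard left: the prefix is fully resolved
        yield prefix
        return
    for folder in list_folders(head):
        # Re-join the resolved folder with whatever pattern remains after the "*".
        yield from resolve_templated_folders(f"{folder}/{rest.lstrip('/')}")


print(list(resolve_templated_folders("s3://bucket/*/logs/*/")))
# ['s3://bucket/2023/logs/app/', 's3://bucket/2023/logs/web/', 's3://bucket/2024/logs/app/']
```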
@@ -860,72 +932,174 @@ class S3Source(StatefulIngestionSourceBase):

         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        … [removed line not shown by the renderer]
-        prefix (str): The prefix path in the S3 bucket to list objects from.
+        uri (str): The path in the S3 bucket to list objects from.

         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """

         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-            … [removed line not shown by the renderer]
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
             if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
             return allowed

-        … [five removed lines not shown by the renderer]
+        # Add any remaining parts of the path_spec before globs to the URI and prefix,
+        # so that we don't unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Process objects in a memory-efficient streaming fashion
+        # Instead of loading all objects into memory, we'll accumulate folder data incrementally
+        folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
+
+        logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+        for obj in list_objects_recursive_path(
+            uri, startswith=prefix, aws_config=self.source_config.aws_config
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            if not _is_allowed_path(path_spec, s3_path):
+                continue
+
+            # Extract the directory name (folder) from the object key
+            dirname = obj.key.rsplit("/", 1)[0]
+
+            # Initialize folder data if we haven't seen this directory before
+            if dirname not in folder_data:
+                folder_data[dirname] = FolderInfo(
+                    objects=[],
+                    total_size=0,
+                    min_time=obj.last_modified,
+                    max_time=obj.last_modified,
+                    latest_obj=obj,
+                )
+
+            # Update folder statistics incrementally
+            folder_info = folder_data[dirname]
+            folder_info.objects.append(obj)
+            folder_info.total_size += obj.size
+
+            # Track min/max times and latest object
+            if obj.last_modified < folder_info.min_time:
+                folder_info.min_time = obj.last_modified
+            if obj.last_modified > folder_info.max_time:
+                folder_info.max_time = obj.last_modified
+                folder_info.latest_obj = obj
+
+        # Yield folders after processing all objects
+        for _dirname, folder_info in folder_data.items():
+            latest_obj = folder_info.latest_obj
+            max_file_s3_path = self.create_s3_path(
+                latest_obj.bucket_name, latest_obj.key
             )
-            )
-        grouped_s3_objects_by_dirname = groupby_unsorted(
-            s3_objects,
-            key=lambda obj: obj.key.rsplit("/", 1)[0],
-        )
-        for _, group in grouped_s3_objects_by_dirname:
-            max_file = max(group, key=lambda x: x.last_modified)
-            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)

             # If partition_id is None, it means the folder is not a partition
-            partition_id = path_spec.get_partition_from_path( …
+            partition_id = path_spec.get_partition_from_path(
+                self._normalize_uri_for_pattern_matching(max_file_s3_path)
+            )

             yield Folder(
                 partition_id=partition_id,
                 is_partition=bool(partition_id),
-                creation_time= …
-                modification_time= …
+                creation_time=folder_info.min_time,
+                modification_time=folder_info.max_time,
                 sample_file=max_file_s3_path,
-                size= …
+                size=folder_info.total_size,
             )

+    def create_s3_path(self, bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+        """
+        Main entry point for browsing S3 objects and creating table-level datasets.
+
+        This method determines whether to use templated processing (for paths with {table})
+        or simple file-by-file processing (for paths without templates).
+
+        Args:
+            path_spec: Configuration specifying the S3 path pattern to scan
+            sample_size: Number of files to sample (used in simple processing)
+
+        Returns:
+            Iterator of BrowsePath objects representing datasets to be created
+
+        Examples:
+            - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+            - Simple: s3://bucket/data/*.csv -> Processes individual files
+        """
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+
+        logger.info(f"Processing path spec: {path_spec.include}")
+
+        # Check if we have {table} template in the path
+        has_table_template = "{table}" in path_spec.include
+
+        logger.info(f"Has table template: {has_table_template}")
+
+        if has_table_template:
+            logger.info("Using templated path processing")
+            # Always use templated processing when {table} is present
+            # This groups files under table-level datasets
+            yield from self._process_templated_path(path_spec)
+        else:
+            logger.info("Using simple path processing")
+            # Only use simple processing for non-templated paths
+            # This creates individual file-level datasets
+            yield from self._process_simple_path(path_spec)
+
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process S3 paths containing {table} templates to create table-level datasets.
+
+        This method handles complex path patterns with wildcards and templates by:
+        1. Replacing template placeholders with stars (except {table})
+        2. Resolving wildcards in the path up to the {table} marker
+        3. Finding all potential table folders under each resolved path
+        4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+        5. Aggregating files from selected partitions under each table
+        6. Creating one dataset per table (not per file)
+
+        Args:
+            path_spec: Path specification with {table} template
+
+        Yields:
+            BrowsePath: One per table (not per file), containing aggregated metadata
+        """
+
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
         s3 = self.source_config.aws_config.get_s3_resource(
             self.source_config.verify_ssl
         )
-        … [five removed lines not shown by the renderer]
+
+        # Find the part before {table}
+        table_marker = "{table}"
+        if table_marker not in path_spec.include:
+            logger.info("No {table} marker found in path")
+            return
+
+        # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+        # This is the crucial missing logic from the original implementation
         matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
         matches_list = list(matches)
-        … [two removed lines not shown by the renderer]
-        #
-        # For example:
-        # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
-        # We only keep the last template as a marker to know the point util we need to resolve path.
-        # After the marker we can safely get sample files for sampling because it is not used in the
-        # table name, so we don't need all the files.
-        # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+        if matches_list:
+            # Replace all templates with stars except keep {table} as the marker
             max_start: int = -1
             include: str = path_spec.include
             max_match: str = ""
+
             for match in matches_list:
                 pos = include.find(match.group())
                 if pos > max_start:
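The placeholder rewrite that begins at the end of this hunk (and continues in the next one) turns every `{...}` template except the `{table}` marker into `*`, exactly as the removed inline comment described: `s3://my-test-bucket/*/{dept}/*/{table}/*/*.*` becomes `s3://my-test-bucket/*/*/*/{table}/*/*.*`. A compact standalone sketch of that rewrite (the function name is ours):

```python
import re


def star_out_templates(include: str, keep: str = "{table}") -> str:
    """Replace every {placeholder} except `keep` with '*', leaving `keep` as a marker."""
    return re.sub(
        r"{\s*\w+\s*}",
        lambda m: m.group() if m.group() == keep else "*",
        include,
    )


print(star_out_templates("s3://my-test-bucket/*/{dept}/*/{table}/*/*.*"))
# s3://my-test-bucket/*/*/*/{table}/*/*.*
```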
@@ -937,109 +1111,198 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_match == "{table}":
                     break

-        … [roughly fifty removed lines not shown by the renderer]
+            logger.info(f"Template replacement: {path_spec.include} -> {include}")
+        else:
+            include = path_spec.include
+
+        # Split the path at {table} to get the prefix that needs wildcard resolution
+        prefix_before_table = include.split(table_marker)[0]
+        logger.info(f"Prefix before table: {prefix_before_table}")
+
+        try:
+            # STEP 2: Resolve ALL wildcards in the path up to {table}
+            # This converts patterns like "s3://data/*/logs/" to actual paths like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
+            resolved_prefixes = list(
+                self.resolve_templated_folders(prefix_before_table)
+            )
+            logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+            # STEP 3: Process each resolved prefix to find table folders
+            for resolved_prefix in resolved_prefixes:
+                logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+                # Get all folders that could be tables under this resolved prefix
+                # These are the actual table names (e.g., "users", "events", "logs")
+                table_folders = list(
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
+                    )
+                )
+                logger.debug(
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
+                )
+
+                # STEP 4: Process each table folder to create a table-level dataset
+                for folder in table_folders:
+                    logger.info(f"Processing table path: {folder.path}")
+
+                    # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
+                    # This uses the compiled regex pattern to extract the table name from the full path
+                    table_name, _ = self.extract_table_name_and_path(
+                        path_spec, folder.path
+                    )
+
+                    # Apply table name filtering if configured
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(f"Table '{table_name}' not allowed and skipping")
+                        continue
+
+                    # STEP 5: Handle partition traversal based on configuration
+                    dirs_to_process = []
+
+                    if path_spec.traversal_method == FolderTraversalMethod.ALL:
+                        # Process ALL partitions (original behavior)
+                        dirs_to_process = [folder.path]
+                        logger.debug(
+                            f"Processing ALL partition folders under: {folder.path}"
+                        )
+
+                    else:
+                        # Use the original get_dir_to_process logic for MIN/MAX
+                        if (
+                            path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                            or path_spec.traversal_method == FolderTraversalMethod.MAX
+                        ):
+                            # Get MAX partition using original logic
+                            dirs_to_process_max = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=False,
                             )
-        … [six removed lines not shown by the renderer]
+                            if dirs_to_process_max:
+                                dirs_to_process.extend(dirs_to_process_max)
+                                logger.debug(

[the registry's rendered diff is truncated at this point]
|
+
dirs_to_process.extend(dirs_to_process_max)
|
|
1184
|
+
logger.debug(
|
|
1185
|
+
f"Added MAX partition: {dirs_to_process_max}"
|
|
1186
|
+
)
|
|
1187
|
+
|
|
1188
|
+
if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
|
|
1189
|
+
# Get MIN partition using original logic
|
|
1190
|
+
dirs_to_process_min = self.get_dir_to_process(
|
|
1191
|
+
uri=folder.path,
|
|
1192
|
+
path_spec=path_spec,
|
|
1193
|
+
min=True,
|
|
998
1194
|
)
|
|
999
|
-
|
|
1195
|
+
if dirs_to_process_min:
|
|
1196
|
+
dirs_to_process.extend(dirs_to_process_min)
|
|
1197
|
+
logger.debug(
|
|
1198
|
+
f"Added MIN partition: {dirs_to_process_min}"
|
|
1199
|
+
)
|
|
1000
1200
|
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
# TODO: Support content type inference for partitions
|
|
1201
|
+
# Process the selected partitions
|
|
1202
|
+
all_folders = []
|
|
1203
|
+
for partition_path in dirs_to_process:
|
|
1204
|
+
logger.info(f"Scanning files in partition: {partition_path}")
|
|
1205
|
+
partition_files = list(
|
|
1206
|
+
self.get_folder_info(path_spec, partition_path)
|
|
1008
1207
|
)
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1208
|
+
all_folders.extend(partition_files)
|
|
1209
|
+
|
|
1210
|
+
if all_folders:
|
|
1211
|
+
# Use the most recent file across all processed partitions
|
|
1212
|
+
latest_file = max(
|
|
1213
|
+
all_folders, key=lambda x: x.modification_time
|
|
1214
|
+
)
|
|
1215
|
+
|
|
1216
|
+
# Get partition information
|
|
1217
|
+
partitions = [f for f in all_folders if f.is_partition]
|
|
1218
|
+
|
|
1219
|
+
# Calculate total size of processed partitions
|
|
1220
|
+
total_size = sum(f.size for f in all_folders)
|
|
1221
|
+
|
|
1222
|
+
# Create ONE BrowsePath per table
|
|
1223
|
+
# The key insight: we need to provide the sample file for schema inference
|
|
1224
|
+
# but the table path should be extracted correctly by extract_table_name_and_path
|
|
1225
|
+
yield BrowsePath(
|
|
1226
|
+
file=latest_file.sample_file, # Sample file for schema inference
|
|
1227
|
+
timestamp=latest_file.modification_time, # Latest timestamp
|
|
1228
|
+
size=total_size, # Size of processed partitions
|
|
1229
|
+
partitions=partitions, # Partition metadata
|
|
1017
1230
|
)
|
|
1018
1231
|
else:
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
content_type = None
|
|
1030
|
-
if self.source_config.use_s3_content_type:
|
|
1031
|
-
content_type = s3.Object(obj.bucket_name, obj.key).content_type
|
|
1032
|
-
|
|
1033
|
-
yield BrowsePath(
|
|
1034
|
-
file=s3_path,
|
|
1035
|
-
timestamp=obj.last_modified,
|
|
1036
|
-
size=obj.size,
|
|
1037
|
-
partitions=[],
|
|
1038
|
-
content_type=content_type,
|
|
1232
|
+
logger.warning(
|
|
1233
|
+
f"No files found in processed partitions for table {table_name}"
|
|
1234
|
+
)
|
|
1235
|
+
|
|
1236
|
+
except Exception as e:
|
|
1237
|
+
if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
|
|
1238
|
+
self.get_report().report_warning(
|
|
1239
|
+
"Missing bucket",
|
|
1240
|
+
f"No bucket found {e.response['Error'].get('BucketName')}",
|
|
1039
1241
|
)
|
|
1242
|
+
return
|
|
1243
|
+
logger.error(f"Error in _process_templated_path: {e}")
|
|
1244
|
+
raise e
|
|
1040
1245
|
|
|
1041
|
-
def
|
|
1042
|
-
|
|
1246
|
+
def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
|
|
1247
|
+
"""
|
|
1248
|
+
Process simple S3 paths without {table} templates to create file-level datasets.
|
|
1249
|
+
|
|
1250
|
+
This method handles straightforward file patterns by:
|
|
1251
|
+
1. Listing all files matching the pattern
|
|
1252
|
+
2. Creating one dataset per file
|
|
1253
|
+
3. No aggregation or grouping is performed
|
|
1254
|
+
|
|
1255
|
+
Use Cases:
|
|
1256
|
+
- Individual file processing: s3://bucket/data/*.csv
|
|
1257
|
+
- Direct file paths: s3://bucket/data/myfile.json
|
|
1258
|
+
- Patterns without table grouping: s3://bucket/logs/*.log
|
|
1259
|
+
|
|
1260
|
+
Args:
|
|
1261
|
+
path_spec: Path specification without {table} template
|
|
1262
|
+
|
|
1263
|
+
Yields:
|
|
1264
|
+
BrowsePath: One per file, containing individual file metadata
|
|
1265
|
+
|
|
1266
|
+
Example Output:
|
|
1267
|
+
- BrowsePath(file="data/file1.csv", size=1000, partitions=[])
|
|
1268
|
+
- BrowsePath(file="data/file2.csv", size=2000, partitions=[])
|
|
1269
|
+
"""
|
|
1270
|
+
|
|
1271
|
+
if self.source_config.aws_config is None:
|
|
1272
|
+
raise ValueError("aws_config not set")
|
|
1273
|
+
s3 = self.source_config.aws_config.get_s3_resource(
|
|
1274
|
+
self.source_config.verify_ssl
|
|
1275
|
+
)
|
|
1276
|
+
|
|
1277
|
+
path_spec.sample_files = False # Disable sampling for simple paths
|
|
1278
|
+
|
|
1279
|
+
# Extract the prefix from the path spec (stops at first wildcard)
|
|
1280
|
+
prefix = self.get_prefix(path_spec.include)
|
|
1281
|
+
|
|
1282
|
+
basename_startswith = prefix.split("/")[-1]
|
|
1283
|
+
dirname = prefix.removesuffix(basename_startswith)
|
|
1284
|
+
|
|
1285
|
+
# Iterate through all objects in the bucket matching the prefix
|
|
1286
|
+
for obj in list_objects_recursive_path(
|
|
1287
|
+
dirname,
|
|
1288
|
+
startswith=basename_startswith,
|
|
1289
|
+
aws_config=self.source_config.aws_config,
|
|
1290
|
+
):
|
|
1291
|
+
s3_path = self.create_s3_path(obj.bucket_name, obj.key)
|
|
1292
|
+
|
|
1293
|
+
# Get content type if configured
|
|
1294
|
+
content_type = None
|
|
1295
|
+
if self.source_config.use_s3_content_type:
|
|
1296
|
+
content_type = s3.Object(obj.bucket_name, obj.key).content_type
|
|
1297
|
+
|
|
1298
|
+
# Create one BrowsePath per file
|
|
1299
|
+
yield BrowsePath(
|
|
1300
|
+
file=s3_path,
|
|
1301
|
+
timestamp=obj.last_modified,
|
|
1302
|
+
size=obj.size,
|
|
1303
|
+
partitions=[], # No partitions in simple mode
|
|
1304
|
+
content_type=content_type,
|
|
1305
|
+
)
|
|
1043
1306
|
|
|
1044
1307
|
def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
|
|
1045
1308
|
prefix = self.get_prefix(path_spec.include)
|
|
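STEP 5 in the hunk above selects which partition folders to scan based on path_spec.traversal_method (ALL, MAX, or MIN_MAX) via get_dir_to_process. The self-contained sketch below mirrors only that selection rule; the enum is redeclared locally so the snippet runs on its own, and pick_partitions is a made-up helper, not an API of the source.

    from enum import Enum
    from typing import List

    class FolderTraversalMethod(Enum):
        ALL = "ALL"
        MAX = "MAX"
        MIN_MAX = "MIN_MAX"

    def pick_partitions(partition_uris: List[str], method: FolderTraversalMethod) -> List[str]:
        # Assumes partition_uris is sorted ascending (oldest first), as
        # date-style partition folders usually are lexicographically.
        if not partition_uris:
            return []
        if method == FolderTraversalMethod.ALL:
            return list(partition_uris)        # scan every partition
        picked = [partition_uris[-1]]          # MAX and MIN_MAX: newest partition
        if method == FolderTraversalMethod.MIN_MAX:
            picked.append(partition_uris[0])   # MIN_MAX: oldest partition as well
        return picked

    print(pick_partitions(["year=2022/", "year=2023/", "year=2024/"], FolderTraversalMethod.MIN_MAX))
    # ['year=2024/', 'year=2022/']

Scanning only the newest (and optionally the oldest) partition keeps one representative sample file and timestamp per table without listing every object under it, at the cost of less precise size and file counts.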
@@ -1071,11 +1334,6 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
@@ -1088,8 +1346,13 @@ class S3Source(StatefulIngestionSourceBase):
                 )
                 table_dict: Dict[str, TableData] = {}
                 for browse_path in file_browser:
+                    # Normalize URI for pattern matching
+                    normalized_file_path = self._normalize_uri_for_pattern_matching(
+                        browse_path.file
+                    )
+
                     if not path_spec.allowed(
-
+                        normalized_file_path,
                         ignore_ext=self.is_s3_platform()
                         and self.source_config.use_s3_content_type,
                     ):
@@ -1165,5 +1428,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"
 
+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
     def get_report(self):
         return self.report