acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries and is provided for informational purposes only.
Note: this release of acryl-datahub is flagged as potentially problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
Rendered diff for datahub/ingestion/source/s3/source.py (the +515 -244 entry listed above):

@@ -3,15 +3,14 @@ import functools
 import logging
 import os
 import pathlib
+import posixpath
 import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import …
-from urllib.parse import urlparse
+from typing import Any, Dict, Iterable, List, Optional, Tuple

 import smart_open.compression as so_compression
-from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
@@ -35,14 +34,25 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import …
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders_path,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
     strip_s3_prefix,
 )
-from datahub.ingestion.source. …
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+from datahub.ingestion.source.data_lake_common.data_lake_utils import (
+    ContainerWUCreator,
+    add_partition_columns_to_schema,
+)
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -56,9 +66,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    SchemaField,
     SchemaMetadata,
-    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -68,22 +76,15 @@ from datahub.metadata.schema_classes import (
     OtherSchemaClass,
     PartitionsSummaryClass,
     PartitionSummaryClass,
-    SchemaFieldDataTypeClass,
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
-from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer

-if TYPE_CHECKING:
-    from mypy_boto3_s3.service_resource import Bucket
-
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)

-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])

@@ -109,14 +110,7 @@ profiling_flags_to_report = [
     "include_field_sample_values",
 ]

-
-# LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
-#     path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
-# )
-#
-# LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
-#     ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
-# )
+URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")


 def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
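The new `URI_SCHEME_REGEX` constant is reused by later hunks in this diff to normalize object-store URIs: any lowercase scheme can be rewritten to `s3://` before a file is handed to smart_open, or stripped entirely when deriving a browse path. A standalone sketch of those two substitutions (the example URI is made up):

```python
import re

# Same pattern as the constant added in this release.
URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

uri = "gs://my-bucket/events/2024/part-0.parquet"

# Rewrite whatever scheme is present to s3:// (done before opening the file).
s3_style = re.sub(URI_SCHEME_REGEX, "s3://", uri)
# -> "s3://my-bucket/events/2024/part-0.parquet"

# Strip the scheme and surrounding slashes to build a browse path / dataset name.
browse_path = re.sub(URI_SCHEME_REGEX, "", uri).strip("/")
# -> "my-bucket/events/2024/part-0.parquet"

print(s3_style, browse_path)
```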
@@ -159,6 +153,15 @@ class Folder:
     )


+@dataclasses.dataclass
+class FolderInfo:
+    objects: List[Any]
+    total_size: int
+    min_time: datetime
+    max_time: datetime
+    latest_obj: Any
+
+
 @dataclasses.dataclass
 class BrowsePath:
     file: str
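`FolderInfo` is the accumulator that the reworked `get_folder_info` (further down in this diff) fills in a streaming fashion: objects are grouped by their parent prefix while total size, earliest/latest timestamps, and the most recently modified object are tracked incrementally, instead of materializing the whole listing in memory. A simplified, self-contained sketch of that aggregation pattern; `S3Obj` here is just a stand-in for the object summaries the real code iterates over:

```python
import dataclasses
from datetime import datetime
from typing import Any, Dict, List


@dataclasses.dataclass
class FolderInfo:
    objects: List[Any]
    total_size: int
    min_time: datetime
    max_time: datetime
    latest_obj: Any


@dataclasses.dataclass
class S3Obj:  # stand-in for a boto3 object summary
    key: str
    size: int
    last_modified: datetime


def aggregate_by_folder(objs) -> Dict[str, FolderInfo]:
    folders: Dict[str, FolderInfo] = {}
    for obj in objs:
        dirname = obj.key.rsplit("/", 1)[0]
        info = folders.setdefault(
            dirname,
            FolderInfo([], 0, obj.last_modified, obj.last_modified, obj),
        )
        info.objects.append(obj)
        info.total_size += obj.size
        if obj.last_modified < info.min_time:
            info.min_time = obj.last_modified
        if obj.last_modified > info.max_time:
            info.max_time = obj.last_modified
            info.latest_obj = obj
    return folders
```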
@@ -185,8 +188,15 @@ class TableData:

 @platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
-@support_status(SupportStatus. …
-@capability( …
+@support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.S3_BUCKET,
+    ],
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(
     SourceCapability.SCHEMA_METADATA, "Can infer schema from supported file types"
@@ -197,12 +207,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any

     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
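The constructor block above boils down to a three-way adapter choice. A condensed sketch of that decision, using only the calls visible in the hunk; the helper name `pick_adapter` is ours, and the sketch omits the localstack/test-endpoint region fallback shown above:

```python
from typing import Optional

from datahub.ingestion.source.data_lake_common.object_store import (
    create_object_store_adapter,
)


def pick_adapter(
    platform: str,
    is_s3: bool,
    endpoint_url: Optional[str],
    aws_region: Optional[str],
):
    # Local files: adapter keyed off the configured platform.
    if not is_s3:
        return create_object_store_adapter(platform or "file")
    # S3-compatible endpoint that is really GCS: use the GCS adapter for external
    # URLs, but keep emitting s3:// paths so the rest of the source is unchanged.
    if endpoint_url and "storage.googleapis.com" in endpoint_url.lower():
        adapter = create_object_store_adapter("gcs")
        adapter.register_customization(
            "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
        )
        return adapter
    # Plain S3: region-aware adapter.
    return create_object_store_adapter("s3", aws_region=aws_region)
```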
@@ -319,7 +376,10 @@ class S3Source(StatefulIngestionSourceBase):

     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-        … [removed line not shown by the renderer]
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"

         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})

@@ -376,9 +436,8 @@ class S3Source(StatefulIngestionSourceBase):
             self.source_config.verify_ssl
         )

-            … [two removed lines not shown by the renderer]
-            )
+            path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+            file = smart_open(path, "rb", transport_params={"client": s3_client})
         else:
             # We still use smart_open here to take advantage of the compression
             # capabilities of smart_open.
@@ -417,7 +476,7 @@ class S3Source(StatefulIngestionSourceBase):
         fields = sorted(fields, key=lambda f: f.fieldPath)

         if self.source_config.add_partition_columns_to_schema and table_data.partitions:
-            … [removed line not shown by the renderer]
+            add_partition_columns_to_schema(
                 fields=fields, path_spec=path_spec, full_path=table_data.full_path
             )
@@ -453,34 +512,6 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return None

-    def add_partition_columns_to_schema(
-        self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
-    ) -> None:
-        is_fieldpath_v2 = False
-        for field in fields:
-            if field.fieldPath.startswith("[version=2.0]"):
-                is_fieldpath_v2 = True
-                break
-        partition_keys = path_spec.get_partition_from_path(full_path)
-        if not partition_keys:
-            return None
-
-        for partition_key in partition_keys:
-            fields.append(
-                SchemaField(
-                    fieldPath=(
-                        f"{partition_key[0]}"
-                        if not is_fieldpath_v2
-                        else f"[version=2.0].[type=string].{partition_key[0]}"
-                    ),
-                    nativeDataType="string",
-                    type=SchemaFieldDataTypeClass(StringTypeClass()),
-                    isPartitioningKey=True,
-                    nullable=True,
-                    recursive=False,
-                )
-            )
-
     def get_table_profile(
         self, table_data: TableData, dataset_urn: str
     ) -> Iterable[MetadataWorkUnit]:
@@ -605,17 +636,28 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )

+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
         aspects: List[Optional[_Aspect]] = []

         logger.info(f"Extracting table schema from file: {table_data.full_path}")
-        … [three removed lines not shown by the renderer]
-            else table_data.table_path.strip("/")
-        )
+
+        # remove protocol and any leading or trailing slashes
+        browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")

         data_platform_urn = make_data_platform_urn(self.source_config.platform)
         logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -674,6 +716,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -748,10 +791,20 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return relative_path

-    def … [signature truncated; three more removed lines not shown by the renderer]
+    def extract_table_name_and_path(
+        self, path_spec: PathSpec, path: str
+    ) -> Tuple[str, str]:
+        # Extract the table name and base path from a path that's been normalized back to the
+        # "s3://" scheme that matches the path_spec
+        table_name, table_path = path_spec.extract_table_name_and_path(
+            self._normalize_uri_for_pattern_matching(path)
+        )
+        # Then convert the table base path back to the original scheme
+        scheme = re.match(URI_SCHEME_REGEX, path)
+        if scheme:
+            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+        return table_name, table_path

     def extract_table_data(
         self,
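The scheme round-trip in `extract_table_name_and_path` is easier to see with concrete values: pattern matching happens against an s3://-normalized URI, and the original scheme is put back onto the resulting table path afterwards. A standalone illustration of the restore step (the path values are made up):

```python
import re

URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

original = "gs://analytics/raw/events/2024/part-0.parquet"
table_path = "s3://analytics/raw/events"  # hypothetical result of matching the normalized URI

scheme = re.match(URI_SCHEME_REGEX, original)
if scheme:
    # scheme[0] is the matched prefix, e.g. "gs://"
    table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)

print(table_path)  # gs://analytics/raw/events
```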
@@ -761,7 +814,7 @@ class S3Source(StatefulIngestionSourceBase):
         path = browse_path.file
         partitions = browse_path.partitions
         logger.debug(f"Getting table data for path: {path}")
-        table_name, table_path = …
+        table_name, table_path = self.extract_table_name_and_path(path_spec, path)
         return TableData(
             display_name=table_name,
             is_s3=self.is_s3_platform(),
@@ -785,72 +838,91 @@ class S3Source(StatefulIngestionSourceBase):
             content_type=browse_path.content_type,
         )

-    def resolve_templated_folders(self, …
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return

-        … [two removed lines not shown by the renderer]
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
+            # Ensure proper path joining - folders from list_folders path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
+            remaining_pattern = folder_split[1]
+            if remaining_pattern.startswith("/"):
+                remaining_pattern = remaining_pattern[1:]
+
             yield from self.resolve_templated_folders(
-                …
+                f"{folder.path}/{remaining_pattern}"
             )

     def get_dir_to_process(
         self,
-        … [removed line not shown by the renderer]
-        folder: str,
+        uri: str,
         path_spec: PathSpec,
-        protocol: str,
         min: bool = False,
     ) -> List[str]:
-        # … [old comment and body lines truncated by the renderer]
+        # Add any remaining parts of the path_spec before globs, excluding the
+        # final filename component, to the URI and prefix so that we don't
+        # unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+            "*"
+        )[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Check if we're at the end of the include path. If so, no need to list sub-folders.
+        if path_spec.has_correct_number_of_directory_components(uri):
+            return [uri]
+
+        logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+        iterator = list_folders_path(
+            s3_uri=uri,
+            startswith=prefix,
             aws_config=self.source_config.aws_config,
         )
-        … [removed lines mostly not shown by the renderer; they ended with:]
-        else:
-            return [f"{protocol}{bucket_name}/{folder}"]
-        return [f"{protocol}{bucket_name}/{folder}"]
+        sorted_dirs = sorted(
+            iterator,
+            key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                dir.name
+            ),
+            reverse=not min,
+        )
+        folders = []
+        for dir in sorted_dirs:
+            if path_spec.dir_allowed(dir.path):
+                folders_list = self.get_dir_to_process(
+                    uri=dir.path,
+                    path_spec=path_spec,
+                    min=min,
+                )
+                folders.extend(folders_list)
+                if path_spec.traversal_method != FolderTraversalMethod.ALL:
+                    return folders
+        if folders:
+            return folders
+        else:
+            return [uri]

     def get_folder_info(
         self,
         path_spec: PathSpec,
-        … [removed line not shown by the renderer]
-        prefix: str,
+        uri: str,
     ) -> Iterable[Folder]:
         """
-        Retrieves all the folders in a path by listing all the files
-        … [removed line not shown by the renderer]
+        Retrieves all the folders in a path by recursively listing all the files under the
+        given URI.

         A folder has creation and modification times, size, and a sample file path.
         - Creation time is the earliest creation time of all files in the folder.
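`resolve_templated_folders` expands one `*` at a time and recurses on each folder returned by `list_folders_path`, so a prefix like `s3://bucket/*/logs/*/` fans out into every concrete folder combination. A simplified local sketch of the same recursion, with an in-memory `list_folders` standing in for the S3 listing helper:

```python
from typing import Dict, Iterable, List

# Toy folder tree standing in for list_folders_path() against S3.
TREE: Dict[str, List[str]] = {
    "s3://bucket/": ["2023", "2024"],
    "s3://bucket/2023/logs/": ["app", "web"],
    "s3://bucket/2024/logs/": ["app"],
}


def list_folders(prefix: str) -> List[str]:
    return [f"{prefix.rstrip('/')}/{name}" for name in TREE.get(prefix, [])]


def resolve_templated_folders(prefix: str) -> Iterable[str]:
    head, star, rest = prefix.partition("*")
    if not star:  # no wildcard left: the prefix is fully resolved
        yield prefix
        return
    for folder in list_folders(head):
        # Re-join the resolved folder with whatever pattern remains after the "*".
        yield from resolve_templated_folders(f"{folder}/{rest.lstrip('/')}")


print(list(resolve_templated_folders("s3://bucket/*/logs/*/")))
# ['s3://bucket/2023/logs/app/', 's3://bucket/2023/logs/web/', 's3://bucket/2024/logs/app/']
```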
@@ -860,72 +932,174 @@ class S3Source(StatefulIngestionSourceBase):

         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        … [removed line not shown by the renderer]
-        prefix (str): The prefix path in the S3 bucket to list objects from.
+        uri (str): The path in the S3 bucket to list objects from.

         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """

         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-            … [removed line not shown by the renderer]
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
             if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
             return allowed

-        … [five removed lines not shown by the renderer]
+        # Add any remaining parts of the path_spec before globs to the URI and prefix,
+        # so that we don't unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Process objects in a memory-efficient streaming fashion
+        # Instead of loading all objects into memory, we'll accumulate folder data incrementally
+        folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
+
+        logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+        for obj in list_objects_recursive_path(
+            uri, startswith=prefix, aws_config=self.source_config.aws_config
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            if not _is_allowed_path(path_spec, s3_path):
+                continue
+
+            # Extract the directory name (folder) from the object key
+            dirname = obj.key.rsplit("/", 1)[0]
+
+            # Initialize folder data if we haven't seen this directory before
+            if dirname not in folder_data:
+                folder_data[dirname] = FolderInfo(
+                    objects=[],
+                    total_size=0,
+                    min_time=obj.last_modified,
+                    max_time=obj.last_modified,
+                    latest_obj=obj,
+                )
+
+            # Update folder statistics incrementally
+            folder_info = folder_data[dirname]
+            folder_info.objects.append(obj)
+            folder_info.total_size += obj.size
+
+            # Track min/max times and latest object
+            if obj.last_modified < folder_info.min_time:
+                folder_info.min_time = obj.last_modified
+            if obj.last_modified > folder_info.max_time:
+                folder_info.max_time = obj.last_modified
+                folder_info.latest_obj = obj
+
+        # Yield folders after processing all objects
+        for _dirname, folder_info in folder_data.items():
+            latest_obj = folder_info.latest_obj
+            max_file_s3_path = self.create_s3_path(
+                latest_obj.bucket_name, latest_obj.key
             )
-            )
-        grouped_s3_objects_by_dirname = groupby_unsorted(
-            s3_objects,
-            key=lambda obj: obj.key.rsplit("/", 1)[0],
-        )
-        for _, group in grouped_s3_objects_by_dirname:
-            max_file = max(group, key=lambda x: x.last_modified)
-            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)

             # If partition_id is None, it means the folder is not a partition
-            partition_id = path_spec.get_partition_from_path( …
+            partition_id = path_spec.get_partition_from_path(
+                self._normalize_uri_for_pattern_matching(max_file_s3_path)
+            )

             yield Folder(
                 partition_id=partition_id,
                 is_partition=bool(partition_id),
-                creation_time= …
-                modification_time= …
+                creation_time=folder_info.min_time,
+                modification_time=folder_info.max_time,
                 sample_file=max_file_s3_path,
-                size= …
+                size=folder_info.total_size,
             )

+    def create_s3_path(self, bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+        """
+        Main entry point for browsing S3 objects and creating table-level datasets.
+
+        This method determines whether to use templated processing (for paths with {table})
+        or simple file-by-file processing (for paths without templates).
+
+        Args:
+            path_spec: Configuration specifying the S3 path pattern to scan
+            sample_size: Number of files to sample (used in simple processing)
+
+        Returns:
+            Iterator of BrowsePath objects representing datasets to be created
+
+        Examples:
+            - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+            - Simple: s3://bucket/data/*.csv -> Processes individual files
+        """
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+
+        logger.info(f"Processing path spec: {path_spec.include}")
+
+        # Check if we have {table} template in the path
+        has_table_template = "{table}" in path_spec.include
+
+        logger.info(f"Has table template: {has_table_template}")
+
+        if has_table_template:
+            logger.info("Using templated path processing")
+            # Always use templated processing when {table} is present
+            # This groups files under table-level datasets
+            yield from self._process_templated_path(path_spec)
+        else:
+            logger.info("Using simple path processing")
+            # Only use simple processing for non-templated paths
+            # This creates individual file-level datasets
+            yield from self._process_simple_path(path_spec)
+
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process S3 paths containing {table} templates to create table-level datasets.
+
+        This method handles complex path patterns with wildcards and templates by:
+        1. Replacing template placeholders with stars (except {table})
+        2. Resolving wildcards in the path up to the {table} marker
+        3. Finding all potential table folders under each resolved path
+        4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+        5. Aggregating files from selected partitions under each table
+        6. Creating one dataset per table (not per file)
+
+        Args:
+            path_spec: Path specification with {table} template
+
+        Yields:
+            BrowsePath: One per table (not per file), containing aggregated metadata
+        """
+
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
         s3 = self.source_config.aws_config.get_s3_resource(
             self.source_config.verify_ssl
         )
-        … [five removed lines not shown by the renderer]
+
+        # Find the part before {table}
+        table_marker = "{table}"
+        if table_marker not in path_spec.include:
+            logger.info("No {table} marker found in path")
+            return
+
+        # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+        # This is the crucial missing logic from the original implementation
         matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
         matches_list = list(matches)
-        … [two removed lines not shown by the renderer]
-        #
-        # For example:
-        # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
-        # We only keep the last template as a marker to know the point util we need to resolve path.
-        # After the marker we can safely get sample files for sampling because it is not used in the
-        # table name, so we don't need all the files.
-        # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+        if matches_list:
+            # Replace all templates with stars except keep {table} as the marker
             max_start: int = -1
             include: str = path_spec.include
             max_match: str = ""
+
             for match in matches_list:
                 pos = include.find(match.group())
                 if pos > max_start:
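The placeholder rewrite that begins at the end of this hunk (and continues in the next one) turns every `{...}` template except the `{table}` marker into `*`, exactly as the removed inline comment described: `s3://my-test-bucket/*/{dept}/*/{table}/*/*.*` becomes `s3://my-test-bucket/*/*/*/{table}/*/*.*`. A compact standalone sketch of that rewrite (the function name is ours):

```python
import re


def star_out_templates(include: str, keep: str = "{table}") -> str:
    """Replace every {placeholder} except `keep` with '*', leaving `keep` as a marker."""
    return re.sub(
        r"{\s*\w+\s*}",
        lambda m: m.group() if m.group() == keep else "*",
        include,
    )


print(star_out_templates("s3://my-test-bucket/*/{dept}/*/{table}/*/*.*"))
# s3://my-test-bucket/*/*/*/{table}/*/*.*
```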
@@ -937,109 +1111,198 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_match == "{table}":
                     break

-        … [roughly fifty removed lines not shown by the renderer]
+            logger.info(f"Template replacement: {path_spec.include} -> {include}")
+        else:
+            include = path_spec.include
+
+        # Split the path at {table} to get the prefix that needs wildcard resolution
+        prefix_before_table = include.split(table_marker)[0]
+        logger.info(f"Prefix before table: {prefix_before_table}")
+
+        try:
+            # STEP 2: Resolve ALL wildcards in the path up to {table}
+            # This converts patterns like "s3://data/*/logs/" to actual paths like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
+            resolved_prefixes = list(
+                self.resolve_templated_folders(prefix_before_table)
+            )
+            logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+            # STEP 3: Process each resolved prefix to find table folders
+            for resolved_prefix in resolved_prefixes:
+                logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+                # Get all folders that could be tables under this resolved prefix
+                # These are the actual table names (e.g., "users", "events", "logs")
+                table_folders = list(
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
+                    )
+                )
+                logger.debug(
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
+                )
+
+                # STEP 4: Process each table folder to create a table-level dataset
+                for folder in table_folders:
+                    logger.info(f"Processing table path: {folder.path}")
+
+                    # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
+                    # This uses the compiled regex pattern to extract the table name from the full path
+                    table_name, _ = self.extract_table_name_and_path(
+                        path_spec, folder.path
+                    )
+
+                    # Apply table name filtering if configured
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(f"Table '{table_name}' not allowed and skipping")
+                        continue
+
+                    # STEP 5: Handle partition traversal based on configuration
+                    dirs_to_process = []
+
+                    if path_spec.traversal_method == FolderTraversalMethod.ALL:
+                        # Process ALL partitions (original behavior)
+                        dirs_to_process = [folder.path]
+                        logger.debug(
+                            f"Processing ALL partition folders under: {folder.path}"
+                        )
+
+                    else:
+                        # Use the original get_dir_to_process logic for MIN/MAX
+                        if (
+                            path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                            or path_spec.traversal_method == FolderTraversalMethod.MAX
+                        ):
+                            # Get MAX partition using original logic
+                            dirs_to_process_max = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=False,
                             )
-        … [six removed lines not shown by the renderer]
+                            if dirs_to_process_max:
+                                dirs_to_process.extend(dirs_to_process_max)
+                                logger.debug(

[the registry's rendered diff is truncated at this point]
|
+
dirs_to_process.extend(dirs_to_process_max)
|
|
1184
|
+
logger.debug(
|
|
1185
|
+
f"Added MAX partition: {dirs_to_process_max}"
|
|
1186
|
+
)
|
|
1187
|
+
|
|
1188
|
+
if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
|
|
1189
|
+
# Get MIN partition using original logic
|
|
1190
|
+
dirs_to_process_min = self.get_dir_to_process(
|
|
1191
|
+
uri=folder.path,
|
|
1192
|
+
path_spec=path_spec,
|
|
1193
|
+
min=True,
|
|
998
1194
|
)
|
|
999
|
-
|
|
1195
|
+
if dirs_to_process_min:
|
|
1196
|
+
dirs_to_process.extend(dirs_to_process_min)
|
|
1197
|
+
logger.debug(
|
|
1198
|
+
f"Added MIN partition: {dirs_to_process_min}"
|
|
1199
|
+
)
|
|
1000
1200
|
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
# TODO: Support content type inference for partitions
|
|
1201
|
+
# Process the selected partitions
|
|
1202
|
+
all_folders = []
|
|
1203
|
+
for partition_path in dirs_to_process:
|
|
1204
|
+
logger.info(f"Scanning files in partition: {partition_path}")
|
|
1205
|
+
partition_files = list(
|
|
1206
|
+
self.get_folder_info(path_spec, partition_path)
|
|
1008
1207
|
)
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1208
|
+
all_folders.extend(partition_files)
|
|
1209
|
+
|
|
1210
|
+
if all_folders:
|
|
1211
|
+
# Use the most recent file across all processed partitions
|
|
1212
|
+
latest_file = max(
|
|
1213
|
+
all_folders, key=lambda x: x.modification_time
|
|
1214
|
+
)
|
|
1215
|
+
|
|
1216
|
+
# Get partition information
|
|
1217
|
+
partitions = [f for f in all_folders if f.is_partition]
|
|
1218
|
+
|
|
1219
|
+
# Calculate total size of processed partitions
|
|
1220
|
+
total_size = sum(f.size for f in all_folders)
|
|
1221
|
+
|
|
1222
|
+
# Create ONE BrowsePath per table
|
|
1223
|
+
# The key insight: we need to provide the sample file for schema inference
|
|
1224
|
+
# but the table path should be extracted correctly by extract_table_name_and_path
|
|
1225
|
+
yield BrowsePath(
|
|
1226
|
+
file=latest_file.sample_file, # Sample file for schema inference
|
|
1227
|
+
timestamp=latest_file.modification_time, # Latest timestamp
|
|
1228
|
+
size=total_size, # Size of processed partitions
|
|
1229
|
+
partitions=partitions, # Partition metadata
|
|
1017
1230
|
)
|
|
1018
1231
|
else:
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
content_type = None
|
|
1030
|
-
if self.source_config.use_s3_content_type:
|
|
1031
|
-
content_type = s3.Object(obj.bucket_name, obj.key).content_type
|
|
1032
|
-
|
|
1033
|
-
yield BrowsePath(
|
|
1034
|
-
file=s3_path,
|
|
1035
|
-
timestamp=obj.last_modified,
|
|
1036
|
-
size=obj.size,
|
|
1037
|
-
partitions=[],
|
|
1038
|
-
content_type=content_type,
|
|
1232
|
+
logger.warning(
|
|
1233
|
+
f"No files found in processed partitions for table {table_name}"
|
|
1234
|
+
)
|
|
1235
|
+
|
|
1236
|
+
except Exception as e:
|
|
1237
|
+
if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
|
|
1238
|
+
self.get_report().report_warning(
|
|
1239
|
+
"Missing bucket",
|
|
1240
|
+
f"No bucket found {e.response['Error'].get('BucketName')}",
|
|
1039
1241
|
)
|
|
1242
|
+
return
|
|
1243
|
+
logger.error(f"Error in _process_templated_path: {e}")
|
|
1244
|
+
raise e
|
|
1040
1245
|
|
|
1041
|
-
def
|
|
1042
|
-
|
|
1246
|
+
def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
|
|
1247
|
+
"""
|
|
1248
|
+
Process simple S3 paths without {table} templates to create file-level datasets.
|
|
1249
|
+
|
|
1250
|
+
This method handles straightforward file patterns by:
|
|
1251
|
+
1. Listing all files matching the pattern
|
|
1252
|
+
2. Creating one dataset per file
|
|
1253
|
+
3. No aggregation or grouping is performed
|
|
1254
|
+
|
|
1255
|
+
Use Cases:
|
|
1256
|
+
- Individual file processing: s3://bucket/data/*.csv
|
|
1257
|
+
- Direct file paths: s3://bucket/data/myfile.json
|
|
1258
|
+
- Patterns without table grouping: s3://bucket/logs/*.log
|
|
1259
|
+
|
|
1260
|
+
Args:
|
|
1261
|
+
path_spec: Path specification without {table} template
|
|
1262
|
+
|
|
1263
|
+
Yields:
|
|
1264
|
+
BrowsePath: One per file, containing individual file metadata
|
|
1265
|
+
|
|
1266
|
+
Example Output:
|
|
1267
|
+
- BrowsePath(file="data/file1.csv", size=1000, partitions=[])
|
|
1268
|
+
- BrowsePath(file="data/file2.csv", size=2000, partitions=[])
|
|
1269
|
+
"""
|
|
1270
|
+
|
|
1271
|
+
if self.source_config.aws_config is None:
|
|
1272
|
+
raise ValueError("aws_config not set")
|
|
1273
|
+
s3 = self.source_config.aws_config.get_s3_resource(
|
|
1274
|
+
self.source_config.verify_ssl
|
|
1275
|
+
)
|
|
1276
|
+
|
|
1277
|
+
path_spec.sample_files = False # Disable sampling for simple paths
|
|
1278
|
+
|
|
1279
|
+
# Extract the prefix from the path spec (stops at first wildcard)
|
|
1280
|
+
prefix = self.get_prefix(path_spec.include)
|
|
1281
|
+
|
|
1282
|
+
basename_startswith = prefix.split("/")[-1]
|
|
1283
|
+
dirname = prefix.removesuffix(basename_startswith)
|
|
1284
|
+
|
|
1285
|
+
# Iterate through all objects in the bucket matching the prefix
|
|
1286
|
+
for obj in list_objects_recursive_path(
|
|
1287
|
+
dirname,
|
|
1288
|
+
startswith=basename_startswith,
|
|
1289
|
+
aws_config=self.source_config.aws_config,
|
|
1290
|
+
):
|
|
1291
|
+
s3_path = self.create_s3_path(obj.bucket_name, obj.key)
|
|
1292
|
+
|
|
1293
|
+
# Get content type if configured
|
|
1294
|
+
content_type = None
|
|
1295
|
+
if self.source_config.use_s3_content_type:
|
|
1296
|
+
content_type = s3.Object(obj.bucket_name, obj.key).content_type
|
|
1297
|
+
|
|
1298
|
+
# Create one BrowsePath per file
|
|
1299
|
+
yield BrowsePath(
|
|
1300
|
+
file=s3_path,
|
|
1301
|
+
timestamp=obj.last_modified,
|
|
1302
|
+
size=obj.size,
|
|
1303
|
+
partitions=[], # No partitions in simple mode
|
|
1304
|
+
content_type=content_type,
|
|
1305
|
+
)
|
|
1043
1306
|
|
|
1044
1307
|
def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
|
|
1045
1308
|
prefix = self.get_prefix(path_spec.include)
|
|
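STEP 5 in the hunk above selects which partition folders to scan based on path_spec.traversal_method (ALL, MAX, or MIN_MAX) via get_dir_to_process. The self-contained sketch below mirrors only that selection rule; the enum is redeclared locally so the snippet runs on its own, and pick_partitions is a made-up helper, not an API of the source.

    from enum import Enum
    from typing import List

    class FolderTraversalMethod(Enum):
        ALL = "ALL"
        MAX = "MAX"
        MIN_MAX = "MIN_MAX"

    def pick_partitions(partition_uris: List[str], method: FolderTraversalMethod) -> List[str]:
        # Assumes partition_uris is sorted ascending (oldest first), as
        # date-style partition folders usually are lexicographically.
        if not partition_uris:
            return []
        if method == FolderTraversalMethod.ALL:
            return list(partition_uris)        # scan every partition
        picked = [partition_uris[-1]]          # MAX and MIN_MAX: newest partition
        if method == FolderTraversalMethod.MIN_MAX:
            picked.append(partition_uris[0])   # MIN_MAX: oldest partition as well
        return picked

    print(pick_partitions(["year=2022/", "year=2023/", "year=2024/"], FolderTraversalMethod.MIN_MAX))
    # ['year=2024/', 'year=2022/']

Scanning only the newest (and optionally the oldest) partition keeps one representative sample file and timestamp per table without listing every object under it, at the cost of less precise size and file counts.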
@@ -1071,11 +1334,6 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
@@ -1088,8 +1346,13 @@ class S3Source(StatefulIngestionSourceBase):
                 )
                 table_dict: Dict[str, TableData] = {}
                 for browse_path in file_browser:
+                    # Normalize URI for pattern matching
+                    normalized_file_path = self._normalize_uri_for_pattern_matching(
+                        browse_path.file
+                    )
+
                     if not path_spec.allowed(
-
+                        normalized_file_path,
                         ignore_ext=self.is_s3_platform()
                         and self.source_config.use_s3_content_type,
                     ):
@@ -1165,5 +1428,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"
 
+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
     def get_report(self):
         return self.report