acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/common/gcp_credentials_config.py (+3 -1):

@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s
 
 
 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
     private_key_id: str = Field(description="Private key id")
     private_key: str = Field(
         description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
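A minimal runnable sketch of the effect of this change, using plain pydantic BaseModel as a stand-in for DataHub's ConfigModel (the two-field model is illustrative, not the full class): project_id can now be omitted and defaults to None.

    from typing import Optional
    from pydantic import BaseModel, Field

    class GCPCredential(BaseModel):  # stand-in for datahub's ConfigModel
        project_id: Optional[str] = Field(
            None, description="Project id to set the credentials"
        )
        private_key_id: str = Field(description="Private key id")

    cred = GCPCredential(private_key_id="abc123")
    assert cred.project_id is None  # omitted project_id now defaults to None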
datahub/ingestion/source/common/subtypes.py (+53 -0):

@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -25,7 +30,12 @@ class DatasetSubTypes(StrEnum):
     NEO4J_NODE = "Neo4j Node"
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
+    DYNAMIC_TABLE = "Dynamic Table"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
+    GOOGLE_SHEETS = "Google Sheets"
+    GOOGLE_SHEETS_NAMED_RANGE = "Google Sheets Named Range"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -46,13 +56,18 @@ class DatasetContainerSubTypes(StrEnum):
     ABS_CONTAINER = "ABS container"
     KEYSPACE = "Keyspace"  # Cassandra
     NAMESPACE = "Namespace"  # Iceberg
+    DREMIO_SPACE = "Dremio Space"
+    DREMIO_SOURCE = "Dremio Source"
 
 
 class BIContainerSubTypes(StrEnum):
     LOOKER_FOLDER = "Folder"
     LOOKML_PROJECT = "LookML Project"
     LOOKML_MODEL = "LookML Model"
+    TABLEAU_SITE = "Site"
+    TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
+    POWERBI_WORKSPACE = "Workspace"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
     QLIK_SPACE = "Qlik Space"
@@ -60,6 +75,8 @@ class BIContainerSubTypes(StrEnum):
     SIGMA_WORKSPACE = "Sigma Workspace"
     SIGMA_WORKBOOK = "Sigma Workbook"
     MODE_COLLECTION = "Collection"
+    GRAFANA_FOLDER = "Folder"
+    GRAFANA_DASHBOARD = "Dashboard"
 
 
 class FlowContainerSubTypes(StrEnum):
@@ -74,6 +91,9 @@ class JobContainerSubTypes(StrEnum):
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 
@@ -116,3 +136,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.debug(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f'    {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
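The exec-based merge above builds one StrEnum out of the member lists of all the subtype enums, keeping the first definition when two enums share a member name. A self-contained toy version of the same technique (A, B, and Merged are hypothetical stand-ins, not DataHub's real enums):

    from enum import Enum

    class StrEnum(str, Enum):  # stand-in for datahub.utilities.str_enum.StrEnum
        pass

    class A(StrEnum):
        TABLE = "Table"

    class B(StrEnum):
        TABLE = "Table"  # duplicate name: the first definition wins
        FOLDER = "Folder"

    merged: dict = {}
    for enum_class in (A, B):
        for member in enum_class:
            merged.setdefault(member.name, member.value)

    code = "class Merged(StrEnum):\n" + "".join(
        f'    {name} = "{value}"\n' for name, value in merged.items()
    )
    exec(code, globals())
    assert Merged.TABLE == "Table" and Merged.FOLDER == "Folder"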
datahub/ingestion/source/data_lake_common/data_lake_utils.py (+37 -0):

@@ -25,10 +25,16 @@ from datahub.ingestion.source.data_lake_common.object_store import (
     get_object_store_bucket_name,
     get_object_store_for_uri,
 )
+from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.gcs.gcs_utils import (
     get_gcs_prefix,
     is_gcs_uri,
 )
+from datahub.metadata.schema_classes import (
+    SchemaFieldClass,
+    SchemaFieldDataTypeClass,
+    StringTypeClass,
+)
 
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -39,6 +45,37 @@ PLATFORM_GCS = "gcs"
 PLATFORM_ABS = "abs"
 
 
+def add_partition_columns_to_schema(
+    path_spec: PathSpec, full_path: str, fields: List[SchemaFieldClass]
+) -> None:
+    # Check if using fieldPath v2 format
+    is_fieldpath_v2 = any(
+        field.fieldPath.startswith("[version=2.0]") for field in fields
+    )
+
+    # Extract partition information from path
+    partition_keys = path_spec.get_partition_from_path(full_path)
+    if not partition_keys:
+        return
+
+    # Add partition fields to schema
+    for partition_key in partition_keys:
+        fields.append(
+            SchemaFieldClass(
+                fieldPath=(
+                    f"{partition_key[0]}"
+                    if not is_fieldpath_v2
+                    else f"[version=2.0].[type=string].{partition_key[0]}"
+                ),
+                nativeDataType="string",
+                type=SchemaFieldDataTypeClass(StringTypeClass()),
+                isPartitioningKey=True,
+                nullable=False,
+                recursive=False,
+            )
+        )
+
+
 class ContainerWUCreator:
     processed_containers: List[str]
 
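add_partition_columns_to_schema appends one string-typed, non-nullable partitioning field per key/value pair found in the file path, using the v2 fieldPath prefix when the existing fields already use it. A hedged, self-contained sketch of that behavior (toy SchemaField dataclass and add_partition_fields helper instead of DataHub's real classes):

    from dataclasses import dataclass
    from typing import List, Tuple

    @dataclass
    class SchemaField:  # toy stand-in for SchemaFieldClass
        fieldPath: str
        nativeDataType: str = "string"
        isPartitioningKey: bool = True
        nullable: bool = False

    def add_partition_fields(
        partition_keys: List[Tuple[str, str]], fields: List[SchemaField]
    ) -> None:
        # Match the v2 fieldPath convention if the existing schema uses it.
        is_fieldpath_v2 = any(f.fieldPath.startswith("[version=2.0]") for f in fields)
        for key, _value in partition_keys:
            prefix = "[version=2.0].[type=string]." if is_fieldpath_v2 else ""
            fields.append(SchemaField(fieldPath=f"{prefix}{key}"))

    # e.g. partition keys parsed from ".../year=2024/month=06/file.parquet"
    fields = [SchemaField("[version=2.0].[type=string].id", isPartitioningKey=False)]
    add_partition_fields([("year", "2024"), ("month", "06")], fields)
    assert fields[-1].fieldPath == "[version=2.0].[type=string].month"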
datahub/ingestion/source/data_lake_common/object_store.py (+115 -27; removed lines whose text was not captured appear as bare "-"):

@@ -1,3 +1,4 @@
+import re
 from abc import ABC, abstractmethod
 
 # Add imports for source customization
@@ -236,42 +237,76 @@ class ABSObjectStore(ObjectStoreInterface):
     """Implementation of ObjectStoreInterface for Azure Blob Storage."""
 
     PREFIX = "abfss://"
+    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")
 
     @classmethod
     def is_uri(cls, uri: str) -> bool:
-        return uri.startswith(cls.PREFIX)
+        return uri.startswith(cls.PREFIX) or bool(cls.HTTPS_REGEX.match(uri))
 
     @classmethod
     def get_prefix(cls, uri: str) -> Optional[str]:
         if uri.startswith(cls.PREFIX):
             return cls.PREFIX
+
+        # Check for HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return match.group(1)
+
         return None
 
     @classmethod
     def strip_prefix(cls, uri: str) -> str:
-
-
-
-
+        if uri.startswith(cls.PREFIX):
+            return uri[len(cls.PREFIX) :]
+
+        # Handle HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return uri[len(match.group(1)) :]
+
+        raise ValueError(
+            f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+        )
 
     @classmethod
     def get_bucket_name(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(
-
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            return cls.strip_prefix(uri).split("@")[0]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            return cls.strip_prefix(uri).split("/")[0]
 
     @classmethod
     def get_object_key(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(
-
-
-
-
-
-
-
-
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            parts = cls.strip_prefix(uri).split("@", 1)
+            if len(parts) < 2:
+                return ""
+            account_path = parts[1]
+            path_parts = account_path.split("/", 1)
+            if len(path_parts) < 2:
+                return ""
+            return path_parts[1]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            stripped = cls.strip_prefix(uri)
+            parts = stripped.split("/", 1)
+            if len(parts) < 2:
+                return ""
+            return parts[1]
 
 
 # Registry of all object store implementations
@@ -331,6 +366,12 @@ def get_object_store_bucket_name(uri: str) -> str:
         return uri[prefix_length:].split("/")[0]
     elif uri.startswith(ABSObjectStore.PREFIX):
         return uri[len(ABSObjectStore.PREFIX) :].split("@")[0]
+    elif ABSObjectStore.HTTPS_REGEX.match(uri):
+        # Handle HTTPS Azure Blob Storage URLs
+        match = ABSObjectStore.HTTPS_REGEX.match(uri)
+        if match:
+            stripped = uri[len(match.group(1)) :]
+            return stripped.split("/")[0]
 
     raise ValueError(f"Unsupported URI format: {uri}")
 
@@ -470,18 +511,25 @@ class ObjectStoreSourceAdapter:
         if not ABSObjectStore.is_uri(table_data.table_path):
             return None
 
-        # Parse the ABS URI
         try:
-
-
-
-
-
-
-
-
-
+            if table_data.table_path.startswith("abfss://"):
+                # URI format: abfss://container@account.dfs.core.windows.net/path
+                path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
+                parts = path_without_prefix.split("@", 1)
+                if len(parts) < 2:
+                    return None
+
+                container_name = parts[0]
+                account_parts = parts[1].split("/", 1)
+                account_domain = account_parts[0]
+                account_name = account_domain.split(".")[0]
+            else:
+                # Handle HTTPS format: https://account.blob.core.windows.net/container/path
+                container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
+                if "blob.core.windows.net" in table_data.table_path:
+                    account_name = table_data.table_path.split("//")[1].split(".")[0]
+                else:
+                    return None
 
             # Construct Azure portal URL
             return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"
@@ -519,6 +567,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +667,39 @@ class ObjectStoreSourceAdapter:
             return self.get_abs_external_url(table_data)
         return None
 
+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+
 
 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
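As a quick sanity check on the two URI shapes the patched ABSObjectStore now accepts, here is a standalone sketch (a re-implementation for illustration, not the class itself), showing that both forms resolve to the same container name:

    import re

    PREFIX = "abfss://"
    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")

    def get_bucket_name(uri: str) -> str:
        if uri.startswith(PREFIX):
            # abfss://container@account.dfs.core.windows.net/path
            return uri[len(PREFIX):].split("@")[0]
        match = HTTPS_REGEX.match(uri)
        if match:
            # https://account.blob.core.windows.net/container/path
            return uri[len(match.group(1)):].split("/")[0]
        raise ValueError(f"Unsupported URI format: {uri}")

    assert get_bucket_name("abfss://data@myacct.dfs.core.windows.net/raw") == "data"
    assert get_bucket_name("https://myacct.blob.core.windows.net/data/raw") == "data"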
datahub/ingestion/source/data_lake_common/path_spec.py (+72 -43; removed lines whose text was not captured appear as bare "-"):

@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -62,7 +62,6 @@ class SortKey(ConfigModel):
 
     date_format: Optional[str] = Field(
         default=None,
-        type=str,
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )
 
@@ -90,63 +89,62 @@ class PathSpec(ConfigModel):
         description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
     )
     exclude: Optional[List[str]] = Field(
-
+        [],
         description="list of paths in glob pattern which will be excluded while scanning for the datasets",
     )
     file_types: List[str] = Field(
-
+        SUPPORTED_FILE_TYPES,
         description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
     )
 
     default_extension: Optional[str] = Field(
-
+        None,
         description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
     )
 
     table_name: Optional[str] = Field(
-
+        None,
         description="Display name of the dataset.Combination of named variables from include path and strings",
     )
 
     # This is not used yet, but will be used in the future to sort the partitions
-    sort_key: Optional[SortKey] = Field(
-
-        default=None,
+    sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
+        None,
         description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
     )
 
     enable_compression: bool = Field(
-
+        True,
         description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
     )
 
     sample_files: bool = Field(
-
+        True,
         description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
     )
 
     allow_double_stars: bool = Field(
-
+        False,
         description="Allow double stars in the include path. This can affect performance significantly if enabled",
     )
 
     autodetect_partitions: bool = Field(
-
+        True,
        description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
     )
 
     traversal_method: FolderTraversalMethod = Field(
-
+        FolderTraversalMethod.MAX,
         description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
     )
 
     include_hidden_folders: bool = Field(
-
+        False,
         description="Include hidden folders in the traversal (folders starting with . or _",
     )
 
     tables_filter_pattern: AllowDenyPattern = Field(
-
+        AllowDenyPattern.allow_all(),
         description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
     )
 
@@ -166,7 +164,6 @@ class PathSpec(ConfigModel):
         return False
 
     def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-        logger.debug(f"Checking file to inclusion: {path}")
         if self.is_path_hidden(path) and not self.include_hidden_folders:
             return False
 
@@ -174,19 +171,17 @@ class PathSpec(ConfigModel):
             self.glob_include, flags=pathlib.GLOBSTAR
         ):
             return False
-
+
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path).globmatch(
                     exclude_path, flags=pathlib.GLOBSTAR
                 ):
                     return False
-        logger.debug(f"{path} is not excluded")
 
         table_name, _ = self.extract_table_name_and_path(path)
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
 
         ext = os.path.splitext(path)[1].strip(".")
 
@@ -196,11 +191,12 @@ class PathSpec(ConfigModel):
         ):
             return False
 
-        logger.debug(f"{path} had selected extension {ext}")
-        logger.debug(f"{path} allowed for dataset creation")
         return True
 
     def dir_allowed(self, path: str) -> bool:
+        if not path.endswith("/"):
+            path += "/"
+
         if self.glob_include.endswith("**"):
             return self.allowed(path, ignore_ext=True)
 
@@ -219,10 +215,8 @@ class PathSpec(ConfigModel):
         for _ in range(slash_to_remove_from_glob):
             glob_include = glob_include.rsplit("/", 1)[0]
 
-        logger.debug(f"Checking dir to inclusion: {path}")
         if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
             return False
-        logger.debug(f"{path} matched include ")
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -230,13 +224,12 @@ class PathSpec(ConfigModel):
                 ):
                     return False
 
-        file_name_pattern = self.include.rsplit("/", 1)[1]
         table_name, _ = self.extract_table_name_and_path(
-
+            path + self.get_remaining_glob_include(path)
         )
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
+        # logger.debug(f"{path} is passed table name check")
 
         return True
 
@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
         if parsable_include.endswith("/{table}/**"):
             # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
             parsable_include = parsable_include[:-2]
-
-
-
-
+
+        # Replace all * with {folder[i]} to make it parsable
+        for i in range(parsable_include.count("*")):
+            parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
         return parsable_include
 
     def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
@@ -267,7 +260,7 @@ class PathSpec(ConfigModel):
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)
 
-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def validate_no_double_stars(cls, values: Dict) -> Dict:
         if "include" not in values:
             return values
@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
         if "{table}" in values["include"]:
             v = "{table}"
         else:
-            logger.debug(f"include fields: {compiled_include.named_fields}")
-            logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
             if not all(
                 x in compiled_include.named_fields
                 for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
     @cached_property
     def compiled_include(self):
         parsable_include = PathSpec.get_parsable_include(self.include)
-        logger.debug(f"parsable_include: {parsable_include}")
         compiled_include = parse.compile(parsable_include)
-        logger.debug(f"Setting compiled_include: {compiled_include}")
         return compiled_include
 
     @cached_property
@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
         parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
             "/", 1
         )[0]
-        logger.debug(f"parsable_folder_include: {parsable_folder_include}")
         compiled_folder_include = parse.compile(parsable_folder_include)
-
+
         return compiled_folder_include
 
     @cached_property
@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
         # Regular expression to find all substrings enclosed in {}
         pattern = r"\{(.*?)\}"
         # Find all matches
-
+        split_parts = self.include.split("{table}/")
+        matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
        return matches
 
     def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
@@ -467,7 +456,11 @@ class PathSpec(ConfigModel):
             partition = partition.rsplit("/", 1)[0]
         for partition_key in partition.split("/"):
             if partition_key.find("=") != -1:
-
+                key_value = partition_key.split(
+                    "=", 1
+                )  # Split into at most 2 parts
+                if len(key_value) == 2:
+                    partition_keys.append((key_value[0], key_value[1]))
             else:
                 partition_split = partition.rsplit("/", 1)
                 if len(partition_split) == 1:
@@ -487,7 +480,8 @@ class PathSpec(ConfigModel):
         return glob_include
 
     @pydantic.root_validator(skip_on_failure=True)
-
+    @staticmethod
+    def validate_path_spec(values: Dict) -> Dict[str, Any]:
         # validate that main fields are populated
         required_fields = ["include", "file_types", "default_extension"]
         for f in required_fields:
@@ -563,7 +557,7 @@ class PathSpec(ConfigModel):
                         f"{{{template_key}}}", var[key]
                     )
                 else:
-                    partition_format.replace(f"{{{var_key}}}", var)
+                    partition_format = partition_format.replace(f"{{{var_key}}}", var)
         return datetime.datetime.strptime(partition_format, datetime_format).replace(
             tzinfo=datetime.timezone.utc
         )
@@ -571,7 +565,7 @@ class PathSpec(ConfigModel):
     def extract_table_name_and_path(self, path: str) -> Tuple[str, str]:
         parsed_vars = self.get_named_vars(path)
         if parsed_vars is None or "table" not in parsed_vars.named:
-            return os.path.basename(path), path
+            return os.path.basename(path.removesuffix("/")), path
         else:
             include = self.include
             depth = include.count("/", 0, include.find("{table}"))
@@ -579,3 +573,38 @@ class PathSpec(ConfigModel):
                 "/".join(path.split("/")[:depth]) + "/" + parsed_vars.named["table"]
             )
             return self._extract_table_name(parsed_vars.named), table_path
+
+    def has_correct_number_of_directory_components(self, path: str) -> bool:
+        """
+        Checks that a given path has the same number of components as the path spec
+        has directory components. Useful for checking if a path needs to descend further
+        into child directories or if the source can switch into file listing mode. If the
+        glob form of the path spec ends in "**", this always returns False.
+        """
+        if self.glob_include.endswith("**"):
+            return False
+
+        if not path.endswith("/"):
+            path += "/"
+        path_slash = path.count("/")
+        glob_slash = self.glob_include.count("/")
+        if path_slash == glob_slash:
+            return True
+        return False
+
+    def get_remaining_glob_include(self, path: str) -> str:
+        """
+        Given a path, return the remaining components of the path spec (if any
+        exist) in glob form. If the glob form of the path spec ends in "**", this
+        function's return value also always ends in "**", regardless of how
+        many components the input path has.
+        """
+        if not path.endswith("/"):
+            path += "/"
+        path_slash = path.count("/")
+        remainder = "/".join(self.glob_include.split("/")[path_slash:])
+        if remainder:
+            return remainder
+        if self.glob_include.endswith("**"):
+            return "**"
+        return ""