acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -11,7 +11,7 @@ from cached_property import cached_property
|
|
|
11
11
|
from pydantic.fields import Field
|
|
12
12
|
from wcmatch import pathlib
|
|
13
13
|
|
|
14
|
-
from datahub.configuration.common import ConfigModel
|
|
14
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
|
|
15
15
|
from datahub.ingestion.source.aws.s3_util import is_s3_uri
|
|
16
16
|
from datahub.ingestion.source.azure.abs_utils import is_abs_uri
|
|
17
17
|
from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
|
|
@@ -62,7 +62,6 @@ class SortKey(ConfigModel):
|
|
|
62
62
|
|
|
63
63
|
date_format: Optional[str] = Field(
|
|
64
64
|
default=None,
|
|
65
|
-
type=str,
|
|
66
65
|
description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
|
|
67
66
|
)
|
|
68
67
|
|
|
@@ -90,61 +89,65 @@ class PathSpec(ConfigModel):
|
|
|
90
89
|
description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
|
|
91
90
|
)
|
|
92
91
|
exclude: Optional[List[str]] = Field(
|
|
93
|
-
|
|
92
|
+
[],
|
|
94
93
|
description="list of paths in glob pattern which will be excluded while scanning for the datasets",
|
|
95
94
|
)
|
|
96
95
|
file_types: List[str] = Field(
|
|
97
|
-
|
|
96
|
+
SUPPORTED_FILE_TYPES,
|
|
98
97
|
description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
|
|
99
98
|
)
|
|
100
99
|
|
|
101
100
|
default_extension: Optional[str] = Field(
|
|
102
|
-
|
|
101
|
+
None,
|
|
103
102
|
description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
|
|
104
103
|
)
|
|
105
104
|
|
|
106
105
|
table_name: Optional[str] = Field(
|
|
107
|
-
|
|
106
|
+
None,
|
|
108
107
|
description="Display name of the dataset.Combination of named variables from include path and strings",
|
|
109
108
|
)
|
|
110
109
|
|
|
111
110
|
# This is not used yet, but will be used in the future to sort the partitions
|
|
112
|
-
sort_key: Optional[SortKey] = Field(
|
|
113
|
-
|
|
114
|
-
default=None,
|
|
111
|
+
sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
|
|
112
|
+
None,
|
|
115
113
|
description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
|
|
116
114
|
)
|
|
117
115
|
|
|
118
116
|
enable_compression: bool = Field(
|
|
119
|
-
|
|
117
|
+
True,
|
|
120
118
|
description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
|
|
121
119
|
)
|
|
122
120
|
|
|
123
121
|
sample_files: bool = Field(
|
|
124
|
-
|
|
122
|
+
True,
|
|
125
123
|
description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
|
|
126
124
|
)
|
|
127
125
|
|
|
128
126
|
allow_double_stars: bool = Field(
|
|
129
|
-
|
|
127
|
+
False,
|
|
130
128
|
description="Allow double stars in the include path. This can affect performance significantly if enabled",
|
|
131
129
|
)
|
|
132
130
|
|
|
133
131
|
autodetect_partitions: bool = Field(
|
|
134
|
-
|
|
132
|
+
True,
|
|
135
133
|
description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
|
|
136
134
|
)
|
|
137
135
|
|
|
138
136
|
traversal_method: FolderTraversalMethod = Field(
|
|
139
|
-
|
|
137
|
+
FolderTraversalMethod.MAX,
|
|
140
138
|
description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
|
|
141
139
|
)
|
|
142
140
|
|
|
143
141
|
include_hidden_folders: bool = Field(
|
|
144
|
-
|
|
142
|
+
False,
|
|
145
143
|
description="Include hidden folders in the traversal (folders starting with . or _",
|
|
146
144
|
)
|
|
147
145
|
|
|
146
|
+
tables_filter_pattern: AllowDenyPattern = Field(
|
|
147
|
+
AllowDenyPattern.allow_all(),
|
|
148
|
+
description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
|
|
149
|
+
)
|
|
150
|
+
|
|
148
151
|
def is_path_hidden(self, path: str) -> bool:
|
|
149
152
|
# Split the path into directories and filename
|
|
150
153
|
dirs, filename = os.path.split(path)
|
|
@@ -161,7 +164,6 @@ class PathSpec(ConfigModel):
|
|
|
161
164
|
return False
|
|
162
165
|
|
|
163
166
|
def allowed(self, path: str, ignore_ext: bool = False) -> bool:
|
|
164
|
-
logger.debug(f"Checking file to inclusion: {path}")
|
|
165
167
|
if self.is_path_hidden(path) and not self.include_hidden_folders:
|
|
166
168
|
return False
|
|
167
169
|
|
|
@@ -169,14 +171,18 @@ class PathSpec(ConfigModel):
|
|
|
169
171
|
self.glob_include, flags=pathlib.GLOBSTAR
|
|
170
172
|
):
|
|
171
173
|
return False
|
|
172
|
-
|
|
174
|
+
|
|
173
175
|
if self.exclude:
|
|
174
176
|
for exclude_path in self.exclude:
|
|
175
177
|
if pathlib.PurePath(path).globmatch(
|
|
176
178
|
exclude_path, flags=pathlib.GLOBSTAR
|
|
177
179
|
):
|
|
178
180
|
return False
|
|
179
|
-
|
|
181
|
+
|
|
182
|
+
table_name, _ = self.extract_table_name_and_path(path)
|
|
183
|
+
if not self.tables_filter_pattern.allowed(table_name):
|
|
184
|
+
return False
|
|
185
|
+
|
|
180
186
|
ext = os.path.splitext(path)[1].strip(".")
|
|
181
187
|
|
|
182
188
|
if not ignore_ext:
|
|
@@ -185,11 +191,12 @@ class PathSpec(ConfigModel):
|
|
|
185
191
|
):
|
|
186
192
|
return False
|
|
187
193
|
|
|
188
|
-
logger.debug(f"{path} had selected extension {ext}")
|
|
189
|
-
logger.debug(f"{path} allowed for dataset creation")
|
|
190
194
|
return True
|
|
191
195
|
|
|
192
196
|
def dir_allowed(self, path: str) -> bool:
|
|
197
|
+
if not path.endswith("/"):
|
|
198
|
+
path += "/"
|
|
199
|
+
|
|
193
200
|
if self.glob_include.endswith("**"):
|
|
194
201
|
return self.allowed(path, ignore_ext=True)
|
|
195
202
|
|
|
@@ -208,16 +215,22 @@ class PathSpec(ConfigModel):
|
|
|
208
215
|
for _ in range(slash_to_remove_from_glob):
|
|
209
216
|
glob_include = glob_include.rsplit("/", 1)[0]
|
|
210
217
|
|
|
211
|
-
logger.debug(f"Checking dir to inclusion: {path}")
|
|
212
218
|
if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
|
|
213
219
|
return False
|
|
214
|
-
logger.debug(f"{path} matched include ")
|
|
215
220
|
if self.exclude:
|
|
216
221
|
for exclude_path in self.exclude:
|
|
217
222
|
if pathlib.PurePath(path.rstrip("/")).globmatch(
|
|
218
223
|
exclude_path.rstrip("/"), flags=pathlib.GLOBSTAR
|
|
219
224
|
):
|
|
220
225
|
return False
|
|
226
|
+
|
|
227
|
+
table_name, _ = self.extract_table_name_and_path(
|
|
228
|
+
path + self.get_remaining_glob_include(path)
|
|
229
|
+
)
|
|
230
|
+
if not self.tables_filter_pattern.allowed(table_name):
|
|
231
|
+
return False
|
|
232
|
+
# logger.debug(f"{path} is passed table name check")
|
|
233
|
+
|
|
221
234
|
return True
|
|
222
235
|
|
|
223
236
|
@classmethod
|
|
@@ -226,10 +239,10 @@ class PathSpec(ConfigModel):
|
|
|
226
239
|
if parsable_include.endswith("/{table}/**"):
|
|
227
240
|
# Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
|
|
228
241
|
parsable_include = parsable_include[:-2]
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
242
|
+
|
|
243
|
+
# Replace all * with {folder[i]} to make it parsable
|
|
244
|
+
for i in range(parsable_include.count("*")):
|
|
245
|
+
parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
|
|
233
246
|
return parsable_include
|
|
234
247
|
|
|
235
248
|
def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
|
|
@@ -247,7 +260,7 @@ class PathSpec(ConfigModel):
|
|
|
247
260
|
) -> Union[None, parse.Result, parse.Match]:
|
|
248
261
|
return self.compiled_folder_include.parse(path)
|
|
249
262
|
|
|
250
|
-
@pydantic.root_validator()
|
|
263
|
+
@pydantic.root_validator(skip_on_failure=True)
|
|
251
264
|
def validate_no_double_stars(cls, values: Dict) -> Dict:
|
|
252
265
|
if "include" not in values:
|
|
253
266
|
return values
|
|
@@ -310,8 +323,6 @@ class PathSpec(ConfigModel):
|
|
|
310
323
|
if "{table}" in values["include"]:
|
|
311
324
|
v = "{table}"
|
|
312
325
|
else:
|
|
313
|
-
logger.debug(f"include fields: {compiled_include.named_fields}")
|
|
314
|
-
logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
|
|
315
326
|
if not all(
|
|
316
327
|
x in compiled_include.named_fields
|
|
317
328
|
for x in parse.compile(v).named_fields
|
|
@@ -336,9 +347,7 @@ class PathSpec(ConfigModel):
|
|
|
336
347
|
@cached_property
def compiled_include(self):
    """Return the `parse`-compiled form of this spec's include pattern.

    The include string is first converted with
    `PathSpec.get_parsable_include` and the result is handed to
    `parse.compile`. Being a `cached_property`, the compiled object is
    built on first access and reused afterwards.
    """
    return parse.compile(PathSpec.get_parsable_include(self.include))
|
|
343
352
|
|
|
344
353
|
@cached_property
|
|
@@ -346,9 +355,8 @@ class PathSpec(ConfigModel):
|
|
|
346
355
|
parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
|
|
347
356
|
"/", 1
|
|
348
357
|
)[0]
|
|
349
|
-
logger.debug(f"parsable_folder_include: {parsable_folder_include}")
|
|
350
358
|
compiled_folder_include = parse.compile(parsable_folder_include)
|
|
351
|
-
|
|
359
|
+
|
|
352
360
|
return compiled_folder_include
|
|
353
361
|
|
|
354
362
|
@cached_property
|
|
@@ -356,7 +364,8 @@ class PathSpec(ConfigModel):
|
|
|
356
364
|
# Regular expression to find all substrings enclosed in {}
|
|
357
365
|
pattern = r"\{(.*?)\}"
|
|
358
366
|
# Find all matches
|
|
359
|
-
|
|
367
|
+
split_parts = self.include.split("{table}/")
|
|
368
|
+
matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
|
|
360
369
|
return matches
|
|
361
370
|
|
|
362
371
|
def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
|
|
@@ -447,7 +456,11 @@ class PathSpec(ConfigModel):
|
|
|
447
456
|
partition = partition.rsplit("/", 1)[0]
|
|
448
457
|
for partition_key in partition.split("/"):
|
|
449
458
|
if partition_key.find("=") != -1:
|
|
450
|
-
|
|
459
|
+
key_value = partition_key.split(
|
|
460
|
+
"=", 1
|
|
461
|
+
) # Split into at most 2 parts
|
|
462
|
+
if len(key_value) == 2:
|
|
463
|
+
partition_keys.append((key_value[0], key_value[1]))
|
|
451
464
|
else:
|
|
452
465
|
partition_split = partition.rsplit("/", 1)
|
|
453
466
|
if len(partition_split) == 1:
|
|
@@ -467,7 +480,8 @@ class PathSpec(ConfigModel):
|
|
|
467
480
|
return glob_include
|
|
468
481
|
|
|
469
482
|
@pydantic.root_validator(skip_on_failure=True)
|
|
470
|
-
|
|
483
|
+
@staticmethod
|
|
484
|
+
def validate_path_spec(values: Dict) -> Dict[str, Any]:
|
|
471
485
|
# validate that main fields are populated
|
|
472
486
|
required_fields = ["include", "file_types", "default_extension"]
|
|
473
487
|
for f in required_fields:
|
|
@@ -543,7 +557,7 @@ class PathSpec(ConfigModel):
|
|
|
543
557
|
f"{{{template_key}}}", var[key]
|
|
544
558
|
)
|
|
545
559
|
else:
|
|
546
|
-
partition_format.replace(f"{{{var_key}}}", var)
|
|
560
|
+
partition_format = partition_format.replace(f"{{{var_key}}}", var)
|
|
547
561
|
return datetime.datetime.strptime(partition_format, datetime_format).replace(
|
|
548
562
|
tzinfo=datetime.timezone.utc
|
|
549
563
|
)
|
|
@@ -551,7 +565,7 @@ class PathSpec(ConfigModel):
|
|
|
551
565
|
def extract_table_name_and_path(self, path: str) -> Tuple[str, str]:
|
|
552
566
|
parsed_vars = self.get_named_vars(path)
|
|
553
567
|
if parsed_vars is None or "table" not in parsed_vars.named:
|
|
554
|
-
return os.path.basename(path), path
|
|
568
|
+
return os.path.basename(path.removesuffix("/")), path
|
|
555
569
|
else:
|
|
556
570
|
include = self.include
|
|
557
571
|
depth = include.count("/", 0, include.find("{table}"))
|
|
@@ -559,3 +573,38 @@ class PathSpec(ConfigModel):
|
|
|
559
573
|
"/".join(path.split("/")[:depth]) + "/" + parsed_vars.named["table"]
|
|
560
574
|
)
|
|
561
575
|
return self._extract_table_name(parsed_vars.named), table_path
|
|
576
|
+
|
|
577
|
+
def has_correct_number_of_directory_components(self, path: str) -> bool:
    """
    Tell whether `path` is exactly as deep as the path spec's glob.

    Depth is measured by counting "/" separators, after making sure `path`
    ends with a trailing slash. A matching count means the path has the
    same number of directory components as the glob form of the path spec,
    which callers can use to decide between descending into child
    directories and listing files. A glob that ends in "**" has no fixed
    depth, so the answer is always False in that case.
    """
    glob_pattern = self.glob_include
    if glob_pattern.endswith("**"):
        return False

    # Treat the input as a directory: guarantee the trailing separator
    # before counting so "a/b" and "a/b/" are measured identically.
    directory = path if path.endswith("/") else path + "/"
    return directory.count("/") == glob_pattern.count("/")
|
|
594
|
+
|
|
595
|
+
def get_remaining_glob_include(self, path: str) -> str:
    """
    Return the part of the glob-form path spec that lies below `path`.

    The number of "/" separators in `path` (counted as if it ended with a
    trailing slash) determines how many leading components of the glob are
    dropped; whatever is left is joined back with "/". When nothing is left
    and the glob ends in "**", "**" is returned so the result still ends in
    "**"; otherwise an empty string is returned.
    """
    # Same count the path would have with a trailing slash appended.
    consumed = path.count("/") + (0 if path.endswith("/") else 1)
    glob_pattern = self.glob_include
    leftover = "/".join(glob_pattern.split("/")[consumed:])
    if not leftover and glob_pattern.endswith("**"):
        return "**"
    return leftover
|
|
@@ -4,7 +4,7 @@ from typing import Optional, Set
|
|
|
4
4
|
import pydantic
|
|
5
5
|
from pydantic import Field, root_validator
|
|
6
6
|
|
|
7
|
-
from datahub.configuration.common import AllowDenyPattern
|
|
7
|
+
from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
|
|
8
8
|
from datahub.configuration.kafka import KafkaConsumerConnectionConfig
|
|
9
9
|
from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
|
|
10
10
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
|
|
|
98
98
|
),
|
|
99
99
|
)
|
|
100
100
|
|
|
101
|
-
pull_from_datahub_api: bool = Field(
|
|
101
|
+
pull_from_datahub_api: HiddenFromDocs[bool] = Field(
|
|
102
102
|
default=False,
|
|
103
103
|
description="Use the DataHub API to fetch versioned aspects.",
|
|
104
|
-
hidden_from_docs=True,
|
|
105
104
|
)
|
|
106
105
|
|
|
107
|
-
max_workers: int = Field(
|
|
106
|
+
max_workers: HiddenFromDocs[int] = Field(
|
|
108
107
|
default=5 * (os.cpu_count() or 4),
|
|
109
108
|
description="Number of worker threads to use for datahub api ingestion.",
|
|
110
|
-
hidden_from_docs=True,
|
|
111
109
|
)
|
|
112
110
|
|
|
113
111
|
urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
|
|
@@ -118,6 +116,22 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
|
|
|
118
116
|
"Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
|
|
119
117
|
)
|
|
120
118
|
|
|
119
|
+
structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
|
|
120
|
+
Field(
|
|
121
|
+
default=60,
|
|
122
|
+
description="Interval in seconds to invalidate the structured properties template cache.",
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
query_timeout: Optional[int] = Field(
|
|
127
|
+
default=None,
|
|
128
|
+
description="Timeout for each query in seconds. ",
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
preserve_system_metadata: bool = Field(
|
|
132
|
+
default=True, description="Copy system metadata from the source system"
|
|
133
|
+
)
|
|
134
|
+
|
|
121
135
|
@root_validator(skip_on_failure=True)
|
|
122
136
|
def check_ingesting_data(cls, values):
|
|
123
137
|
if (
|