acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py CHANGED

@@ -13,6 +13,7 @@ from datahub.configuration.common import (
 from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import EmitMode
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
 from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
 from datahub.ingestion.api.sink import NoopWriteCallback, Sink

@@ -111,6 +112,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
     def __init__(self, sink: Sink, report_recipe: bool, ctx: PipelineContext) -> None:
         assert ctx.pipeline_config is not None

+        self.ctx = ctx
         self.sink: Sink = sink
         self.report_recipe = report_recipe
         ingestion_source_key = self.generate_unique_key(ctx.pipeline_config)

@@ -191,18 +193,25 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
         )
         return json.dumps(converted_recipe)

-    def _emit_aspect(
-        self
-
-
-
-
-                ),
-                metadata={},
-            ),
-            NoopWriteCallback(),
+    def _emit_aspect(
+        self, entity_urn: Urn, aspect_value: _Aspect, try_sync: bool = False
+    ) -> None:
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=str(entity_urn),
+            aspect=aspect_value,
         )

+        if try_sync and self.ctx.graph:
+            self.ctx.graph.emit_mcp(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
+        else:
+            self.sink.write_record_async(
+                RecordEnvelope(
+                    record=mcp,
+                    metadata={},
+                ),
+                NoopWriteCallback(),
+            )
+
     def on_start(self, ctx: PipelineContext) -> None:
         assert ctx.pipeline_config is not None
         # Construct the dataHubExecutionRequestInput aspect

@@ -223,6 +232,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
         self._emit_aspect(
             entity_urn=self.execution_request_input_urn,
             aspect_value=execution_input_aspect,
+            try_sync=True,
         )

     def on_completion(

@@ -258,4 +268,4 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
             entity_urn=self.execution_request_input_urn,
             aspect_value=execution_result_aspect,
         )
-
+        # Note: sink.close() is handled by the pipeline's context manager
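Note: the `_emit_aspect` rewrite above splits emission into a synchronous path (through the graph client with `EmitMode.SYNC_PRIMARY`) and the pre-existing asynchronous sink path. A minimal standalone sketch of that split, using only names visible in the hunks; the `graph`, `sink`, `urn`, and `aspect` arguments are illustrative placeholders, not part of the diff:

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import EmitMode
from datahub.ingestion.api.common import RecordEnvelope
from datahub.ingestion.api.sink import NoopWriteCallback


def emit_aspect(graph, sink, urn, aspect, try_sync: bool = False) -> None:
    mcp = MetadataChangeProposalWrapper(entityUrn=str(urn), aspect=aspect)
    if try_sync and graph is not None:
        # Synchronous path: block until the primary store has accepted the write.
        graph.emit_mcp(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
    else:
        # Asynchronous path: hand the record to the configured sink.
        sink.write_record_async(
            RecordEnvelope(record=mcp, metadata={}), NoopWriteCallback()
        )
```

The `on_start` hunk passes `try_sync=True` for the execution request input, so that aspect takes the synchronous path whenever a graph client is configured.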
datahub/ingestion/run/pipeline.py CHANGED

@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
+from datahub.upgrade.upgrade import (
+    is_server_default_cli_ahead,
+    retrieve_version_stats,
+)
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,

@@ -171,7 +175,10 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()

-        with
+        with (
+            contextlib.ExitStack() as exit_stack,
+            contextlib.ExitStack() as inner_exit_stack,
+        ):
             self.graph: Optional[DataHubGraph] = None
             with _add_init_error_context("connect to DataHub"):
                 if self.config.datahub_api:

@@ -258,6 +265,11 @@ class Pipeline:
             with _add_init_error_context("configure transformers"):
                 self._configure_transforms()

+            # Register completion callback with sink to handle final reporting
+            self.sink.register_pre_shutdown_callback(
+                self._notify_reporters_on_ingestion_completion
+            )
+
             # If all of the initialization succeeds, we can preserve the exit stack until the pipeline run.
             # We need to use an exit stack so that if we have an exception during initialization,
             # things that were already initialized are still cleaned up.

@@ -337,8 +349,48 @@ class Pipeline:
         for reporter in self.reporters:
             try:
                 reporter.on_start(ctx=self.ctx)
-            except Exception
-                logger.warning("Reporting failed on start", exc_info=
+            except Exception:
+                logger.warning("Reporting failed on start", exc_info=True)
+
+    def _warn_old_cli_version(self) -> None:
+        """
+        Check if the server default CLI version is ahead of the CLI version being used.
+        If so, add a warning to the report.
+        """
+
+        try:
+            version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
+        except RuntimeError as e:
+            # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
+            if "no current event loop" in str(e):
+                logger.debug("Skipping version check - no event loop available")
+                return
+            raise
+
+        if not version_stats or not self.graph:
+            return
+
+        if is_server_default_cli_ahead(version_stats):
+            server_default_version = (
+                version_stats.server.current_server_default_cli_version.version
+                if version_stats.server.current_server_default_cli_version
+                else None
+            )
+            current_version = version_stats.client.current.version
+
+            logger.debug(
+                f"""
+                client_version: {current_version}
+                server_default_version: {server_default_version}
+                server_default_cli_ahead: True
+                """
+            )
+
+            self.source.get_report().warning(
+                title="Server default CLI version is ahead of CLI version",
+                message="Please upgrade the CLI version being used",
+                context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
+            )

     def _notify_reporters_on_ingestion_completion(self) -> None:
         for reporter in self.reporters:

@@ -360,8 +412,8 @@ class Pipeline:
                     report=self._get_structured_report(),
                     ctx=self.ctx,
                 )
-            except Exception
-                logger.warning("Reporting failed on completion", exc_info=
+            except Exception:
+                logger.warning("Reporting failed on completion", exc_info=True)

     @classmethod
     def create(

@@ -395,7 +447,20 @@ class Pipeline:
                 return True
         return False

+    def _set_platform(self) -> None:
+        platform = self.source.infer_platform()
+        if platform:
+            self.source.get_report().set_platform(platform)
+        else:
+            self.source.get_report().warning(
+                message="Platform not found",
+                title="Platform not found",
+                context="Platform not found",
+            )
+
     def run(self) -> None:
+        self._set_platform()
+        self._warn_old_cli_version()
         with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray

@@ -461,10 +526,10 @@ class Pipeline:

             except (RuntimeError, SystemExit):
                 raise
-            except Exception
+            except Exception:
                 logger.error(
                     "Failed to process some records. Continuing.",
-                    exc_info=
+                    exc_info=True,
                 )
                 # TODO: Transformer errors should be reported more loudly / as part of the pipeline report.

@@ -493,9 +558,9 @@ class Pipeline:

            self.process_commits()
            self.final_status = PipelineStatus.COMPLETED
-        except (SystemExit, KeyboardInterrupt)
+        except (SystemExit, KeyboardInterrupt):
            self.final_status = PipelineStatus.CANCELLED
-            logger.error("Caught error", exc_info=
+            logger.error("Caught error", exc_info=True)
            raise
        except Exception as exc:
            self.final_status = PipelineStatus.ERROR

@@ -503,8 +568,6 @@ class Pipeline:
         finally:
             clear_global_warnings()

-            self._notify_reporters_on_ingestion_completion()
-
     def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
         """
         Transforms the given sequence of records by passing the records through the transformers

@@ -578,15 +641,22 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )

         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
                 ],
+                "extractor_type": self.config.source.extractor,
                 "records_written": stats.discretize(
                     self.sink.get_report().total_records_written
                 ),
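Note: the pipeline no longer calls `_notify_reporters_on_ingestion_completion()` in its `finally` block; it registers it as a pre-shutdown callback on the sink, and `DatahubRestSink.close()` (further down) now calls `super().close()` so those callbacks run before the sink tears down its executor. A minimal sketch of that callback pattern, assuming (this is not the actual `Sink` base class) that the base `close()` simply drains the registered callbacks:

```python
from typing import Callable, List


class CallbackAwareSink:
    """Illustrative stand-in for a sink base class with pre-shutdown callbacks."""

    def __init__(self) -> None:
        self._pre_shutdown_callbacks: List[Callable[[], None]] = []

    def register_pre_shutdown_callback(self, callback: Callable[[], None]) -> None:
        self._pre_shutdown_callbacks.append(callback)

    def close(self) -> None:
        # Run completion reporting while the sink can still accept records...
        for callback in self._pre_shutdown_callbacks:
            callback()
        # ...subclasses then release their own resources (e.g. executor shutdown).
```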
datahub/ingestion/run/pipeline_config.py CHANGED

@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional

 from pydantic import Field, validator

-from datahub.configuration.common import ConfigModel, DynamicTypedConfig
+from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig

@@ -85,7 +85,7 @@ class PipelineConfig(ConfigModel):
     source: SourceConfig
     sink: Optional[DynamicTypedConfig] = None
     transformers: Optional[List[DynamicTypedConfig]] = None
-    flags: FlagsConfig =
+    flags: HiddenFromDocs[FlagsConfig] = FlagsConfig()
     reporting: List[ReporterConfig] = []
     run_id: str = DEFAULT_RUN_ID
     datahub_api: Optional[DatahubClientConfig] = None
datahub/ingestion/sink/datahub_rest.py CHANGED

@@ -3,7 +3,6 @@ import contextlib
 import dataclasses
 import functools
 import logging
-import os
 import threading
 import uuid
 from enum import auto

@@ -16,6 +15,10 @@ from datahub.configuration.common import (
     ConfigurationError,
     OperationalError,
 )
+from datahub.configuration.env_vars import (
+    get_rest_sink_default_max_threads,
+    get_rest_sink_default_mode,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (

@@ -47,9 +50,7 @@ from datahub.utilities.server_config_util import set_gms_config

 logger = logging.getLogger(__name__)

-_DEFAULT_REST_SINK_MAX_THREADS =
-    os.getenv("DATAHUB_REST_SINK_DEFAULT_MAX_THREADS", 15)
-)
+_DEFAULT_REST_SINK_MAX_THREADS = get_rest_sink_default_max_threads()


 class RestSinkMode(ConfigEnum):

@@ -63,13 +64,14 @@ class RestSinkMode(ConfigEnum):


 _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
-    RestSinkMode,
+    RestSinkMode, get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
 )


 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
     endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
+    server_config_refresh_interval: Optional[int] = None

     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS

@@ -90,6 +92,7 @@ class DatahubRestSinkConfig(DatahubClientConfig):
 @dataclasses.dataclass
 class DataHubRestSinkReport(SinkReport):
     mode: Optional[RestSinkMode] = None
+    endpoint: Optional[RestSinkEndpoint] = None
     max_threads: Optional[int] = None
     gms_version: Optional[str] = None
     pending_requests: int = 0

@@ -140,6 +143,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):

         self.report.gms_version = gms_config.service_version
         self.report.mode = self.config.mode
+        self.report.endpoint = self.config.endpoint
         self.report.max_threads = self.config.max_threads
         logger.debug("Setting env variables to override config")
         logger.debug("Setting gms config")

@@ -346,6 +350,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         )

     def close(self):
+        # Execute pre-shutdown callbacks first (handled by parent class)
+        super().close()
+
+        # Then perform sink-specific shutdown
         with self.report.main_thread_blocking_timer:
             self.executor.shutdown()
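Note: the hard-coded `os.getenv` defaults move into `datahub.configuration.env_vars` helpers. Assuming the new helpers keep reading the same variable as the removed `os.getenv` call, the rest sink's default thread count can still be overridden from the environment, set before the module is imported. A minimal sketch; the server URL is illustrative:

```python
# In the shell, before running ingestion:
#   export DATAHUB_REST_SINK_DEFAULT_MAX_THREADS=30

from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

config = DatahubRestSinkConfig(server="http://localhost:8080")
# 30 if the variable above is set; otherwise the built-in default (15 in the removed code).
print(config.max_threads)
```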
datahub/ingestion/sink/file.py CHANGED
datahub/ingestion/source/abs/config.py CHANGED

@@ -151,7 +151,7 @@ class DataLakeSourceConfig(
             raise ValueError("platform must not be empty")
         return platform

-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def ensure_profiling_pattern_is_passed_to_profiling(
         cls, values: Dict[str, Any]
     ) -> Dict[str, Any]:
datahub/ingestion/source/abs/datalake_profiler_config.py CHANGED

@@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel):
         description="Whether to profile for the sample values for all columns.",
     )

-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def ensure_field_level_settings_are_normalized(
         cls: "DataLakeProfilerConfig", values: Dict[str, Any]
     ) -> Dict[str, Any]:
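Note: both root validators now pass `skip_on_failure=True`. In pydantic v1 a root validator with the defaults still runs when a field validator has already failed, so `values` can be missing keys and the user sees a confusing `KeyError` instead of the real validation error; pydantic 2's compatibility shim additionally rejects `pre=False` root validators unless `skip_on_failure=True` is set. A minimal sketch of the behaviour; the model and fields are illustrative, not the datahub config classes:

```python
import pydantic


class ProfilingConfig(pydantic.BaseModel):
    enabled: bool = False
    table_pattern: str = ".*"

    @pydantic.root_validator(skip_on_failure=True)
    def pattern_requires_profiling(cls, values):
        # Only runs when the fields above validated, so direct key access is safe.
        if values["table_pattern"] != ".*" and not values["enabled"]:
            raise ValueError("table_pattern is set but profiling is not enabled")
        return values


# ProfilingConfig(enabled="not-a-bool") reports the boolean parsing error directly;
# the root validator is skipped instead of raising KeyError.
```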
datahub/ingestion/source/abs/source.py CHANGED

@@ -44,7 +44,11 @@ from datahub.ingestion.source.azure.abs_utils import (
     get_key_prefix,
     strip_abs_prefix,
 )
-from datahub.ingestion.source.
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+from datahub.ingestion.source.data_lake_common.data_lake_utils import (
+    ContainerWUCreator,
+    add_partition_columns_to_schema,
+)
 from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,

@@ -53,10 +57,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    SchemaField,
-    SchemaFieldDataType,
     SchemaMetadata,
-    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,

@@ -128,6 +129,14 @@ class TableData:
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.TAGS, "Can extract ABS object/container tags if enabled")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Extract ABS containers and folders",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.ABS_CONTAINER,
+    ],
+)
 class ABSSource(StatefulIngestionSourceBase):
     source_config: DataLakeSourceConfig
     report: DataLakeSourceReport

@@ -223,36 +232,12 @@ class ABSSource(StatefulIngestionSourceBase):
         fields = sorted(fields, key=lambda f: f.fieldPath)

         if self.source_config.add_partition_columns_to_schema:
-
+            add_partition_columns_to_schema(
                 fields=fields, path_spec=path_spec, full_path=table_data.full_path
             )

         return fields

-    def add_partition_columns_to_schema(
-        self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
-    ) -> None:
-        vars = path_spec.get_named_vars(full_path)
-        if vars is not None and "partition" in vars:
-            for partition in vars["partition"].values():
-                partition_arr = partition.split("=")
-                if len(partition_arr) != 2:
-                    logger.debug(
-                        f"Could not derive partition key from partition field {partition}"
-                    )
-                    continue
-                partition_key = partition_arr[0]
-                fields.append(
-                    SchemaField(
-                        fieldPath=f"{partition_key}",
-                        nativeDataType="string",
-                        type=SchemaFieldDataType(StringTypeClass()),
-                        isPartitioningKey=True,
-                        nullable=True,
-                        recursive=False,
-                    )
-                )
-
     def _create_table_operation_aspect(self, table_data: TableData) -> OperationClass:
         reported_time = int(time.time() * 1000)

@@ -533,7 +518,7 @@ class ABSSource(StatefulIngestionSourceBase):
         )
         path_spec.sample_files = False
         for obj in container_client.list_blobs(
-
+            name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
         ):
             abs_path = self.create_abs_path(obj.name)
             logger.debug(f"Path: {abs_path}")