acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -3,7 +3,6 @@ import contextlib
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import functools
|
|
5
5
|
import logging
|
|
6
|
-
import os
|
|
7
6
|
import threading
|
|
8
7
|
import uuid
|
|
9
8
|
from enum import auto
|
|
@@ -16,15 +15,18 @@ from datahub.configuration.common import (
|
|
|
16
15
|
ConfigurationError,
|
|
17
16
|
OperationalError,
|
|
18
17
|
)
|
|
18
|
+
from datahub.configuration.env_vars import (
|
|
19
|
+
get_rest_sink_default_max_threads,
|
|
20
|
+
get_rest_sink_default_mode,
|
|
21
|
+
)
|
|
19
22
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
20
23
|
from datahub.emitter.mcp_builder import mcps_from_mce
|
|
21
24
|
from datahub.emitter.rest_emitter import (
|
|
22
25
|
BATCH_INGEST_MAX_PAYLOAD_LENGTH,
|
|
23
|
-
|
|
24
|
-
DEFAULT_REST_TRACE_MODE,
|
|
26
|
+
DEFAULT_REST_EMITTER_ENDPOINT,
|
|
25
27
|
DataHubRestEmitter,
|
|
28
|
+
EmitMode,
|
|
26
29
|
RestSinkEndpoint,
|
|
27
|
-
RestTraceMode,
|
|
28
30
|
)
|
|
29
31
|
from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
|
|
30
32
|
from datahub.ingestion.api.sink import (
|
|
@@ -34,7 +36,7 @@ from datahub.ingestion.api.sink import (
|
|
|
34
36
|
WriteCallback,
|
|
35
37
|
)
|
|
36
38
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
37
|
-
from datahub.ingestion.graph.
|
|
39
|
+
from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
|
|
38
40
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
|
|
39
41
|
MetadataChangeEvent,
|
|
40
42
|
MetadataChangeProposal,
|
|
@@ -48,9 +50,7 @@ from datahub.utilities.server_config_util import set_gms_config
|
|
|
48
50
|
|
|
49
51
|
logger = logging.getLogger(__name__)
|
|
50
52
|
|
|
51
|
-
_DEFAULT_REST_SINK_MAX_THREADS =
|
|
52
|
-
os.getenv("DATAHUB_REST_SINK_DEFAULT_MAX_THREADS", 15)
|
|
53
|
-
)
|
|
53
|
+
_DEFAULT_REST_SINK_MAX_THREADS = get_rest_sink_default_max_threads()
|
|
54
54
|
|
|
55
55
|
|
|
56
56
|
class RestSinkMode(ConfigEnum):
|
|
@@ -64,14 +64,14 @@ class RestSinkMode(ConfigEnum):
|
|
|
64
64
|
|
|
65
65
|
|
|
66
66
|
_DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
|
|
67
|
-
RestSinkMode,
|
|
67
|
+
RestSinkMode, get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
|
|
68
68
|
)
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class DatahubRestSinkConfig(DatahubClientConfig):
|
|
72
72
|
mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
|
|
73
|
-
endpoint: RestSinkEndpoint =
|
|
74
|
-
|
|
73
|
+
endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
|
|
74
|
+
server_config_refresh_interval: Optional[int] = None
|
|
75
75
|
|
|
76
76
|
# These only apply in async modes.
|
|
77
77
|
max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
|
|
@@ -92,6 +92,7 @@ class DatahubRestSinkConfig(DatahubClientConfig):
|
|
|
92
92
|
@dataclasses.dataclass
|
|
93
93
|
class DataHubRestSinkReport(SinkReport):
|
|
94
94
|
mode: Optional[RestSinkMode] = None
|
|
95
|
+
endpoint: Optional[RestSinkEndpoint] = None
|
|
95
96
|
max_threads: Optional[int] = None
|
|
96
97
|
gms_version: Optional[str] = None
|
|
97
98
|
pending_requests: int = 0
|
|
@@ -134,18 +135,15 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
134
135
|
self._emitter_thread_local = threading.local()
|
|
135
136
|
|
|
136
137
|
try:
|
|
137
|
-
gms_config = self.emitter.
|
|
138
|
+
gms_config = self.emitter.server_config
|
|
138
139
|
except Exception as exc:
|
|
139
140
|
raise ConfigurationError(
|
|
140
141
|
f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
|
|
141
142
|
) from exc
|
|
142
143
|
|
|
143
|
-
self.report.gms_version =
|
|
144
|
-
gms_config.get("versions", {})
|
|
145
|
-
.get("acryldata/datahub", {})
|
|
146
|
-
.get("version", None)
|
|
147
|
-
)
|
|
144
|
+
self.report.gms_version = gms_config.service_version
|
|
148
145
|
self.report.mode = self.config.mode
|
|
146
|
+
self.report.endpoint = self.config.endpoint
|
|
149
147
|
self.report.max_threads = self.config.max_threads
|
|
150
148
|
logger.debug("Setting env variables to override config")
|
|
151
149
|
logger.debug("Setting gms config")
|
|
@@ -179,7 +177,8 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
179
177
|
client_certificate_path=config.client_certificate_path,
|
|
180
178
|
disable_ssl_verification=config.disable_ssl_verification,
|
|
181
179
|
openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
|
|
182
|
-
|
|
180
|
+
client_mode=config.client_mode,
|
|
181
|
+
datahub_component=config.datahub_component,
|
|
183
182
|
)
|
|
184
183
|
|
|
185
184
|
@property
|
|
@@ -190,6 +189,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
190
189
|
# https://github.com/psf/requests/issues/1871#issuecomment-32751346
|
|
191
190
|
thread_local = self._emitter_thread_local
|
|
192
191
|
if not hasattr(thread_local, "emitter"):
|
|
192
|
+
self.config.client_mode = ClientMode.INGESTION
|
|
193
193
|
thread_local.emitter = DatahubRestSink._make_emitter(self.config)
|
|
194
194
|
return thread_local.emitter
|
|
195
195
|
|
|
@@ -253,9 +253,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
253
253
|
MetadataChangeProposal,
|
|
254
254
|
MetadataChangeProposalWrapper,
|
|
255
255
|
],
|
|
256
|
+
emit_mode: EmitMode,
|
|
256
257
|
) -> None:
|
|
257
258
|
# TODO: Add timing metrics
|
|
258
|
-
self.emitter.emit(record)
|
|
259
|
+
self.emitter.emit(record, emit_mode=emit_mode)
|
|
259
260
|
|
|
260
261
|
def _emit_batch_wrapper(
|
|
261
262
|
self,
|
|
@@ -270,8 +271,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
270
271
|
],
|
|
271
272
|
) -> None:
|
|
272
273
|
events: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]] = []
|
|
274
|
+
|
|
273
275
|
for record in records:
|
|
274
276
|
event = record[0]
|
|
277
|
+
|
|
275
278
|
if isinstance(event, MetadataChangeEvent):
|
|
276
279
|
# Unpack MCEs into MCPs.
|
|
277
280
|
mcps = mcps_from_mce(event)
|
|
@@ -279,7 +282,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
279
282
|
else:
|
|
280
283
|
events.append(event)
|
|
281
284
|
|
|
282
|
-
chunks = self.emitter.emit_mcps(events)
|
|
285
|
+
chunks = self.emitter.emit_mcps(events, emit_mode=EmitMode.ASYNC)
|
|
283
286
|
self.report.async_batches_prepared += 1
|
|
284
287
|
if chunks > 1:
|
|
285
288
|
self.report.async_batches_split += chunks
|
|
@@ -310,6 +313,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
310
313
|
partition_key,
|
|
311
314
|
self._emit_wrapper,
|
|
312
315
|
record,
|
|
316
|
+
EmitMode.ASYNC,
|
|
313
317
|
done_callback=functools.partial(
|
|
314
318
|
self._write_done_callback, record_envelope, write_callback
|
|
315
319
|
),
|
|
@@ -321,6 +325,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
321
325
|
self.executor.submit(
|
|
322
326
|
partition_key,
|
|
323
327
|
record,
|
|
328
|
+
EmitMode.ASYNC,
|
|
324
329
|
done_callback=functools.partial(
|
|
325
330
|
self._write_done_callback, record_envelope, write_callback
|
|
326
331
|
),
|
|
@@ -329,7 +334,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
329
334
|
else:
|
|
330
335
|
# execute synchronously
|
|
331
336
|
try:
|
|
332
|
-
self._emit_wrapper(record)
|
|
337
|
+
self._emit_wrapper(record, emit_mode=EmitMode.SYNC_PRIMARY)
|
|
333
338
|
write_callback.on_success(record_envelope, success_metadata={})
|
|
334
339
|
except Exception as e:
|
|
335
340
|
write_callback.on_failure(record_envelope, e, failure_metadata={})
|
|
@@ -341,11 +346,14 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
341
346
|
],
|
|
342
347
|
) -> None:
|
|
343
348
|
return self.write_record_async(
|
|
344
|
-
RecordEnvelope(item, metadata={}),
|
|
345
|
-
NoopWriteCallback(),
|
|
349
|
+
RecordEnvelope(item, metadata={}), NoopWriteCallback()
|
|
346
350
|
)
|
|
347
351
|
|
|
348
352
|
def close(self):
|
|
353
|
+
# Execute pre-shutdown callbacks first (handled by parent class)
|
|
354
|
+
super().close()
|
|
355
|
+
|
|
356
|
+
# Then perform sink-specific shutdown
|
|
349
357
|
with self.report.main_thread_blocking_timer:
|
|
350
358
|
self.executor.shutdown()
|
|
351
359
|
|
datahub/ingestion/sink/file.py
CHANGED
|
@@ -151,7 +151,7 @@ class DataLakeSourceConfig(
|
|
|
151
151
|
raise ValueError("platform must not be empty")
|
|
152
152
|
return platform
|
|
153
153
|
|
|
154
|
-
@pydantic.root_validator()
|
|
154
|
+
@pydantic.root_validator(skip_on_failure=True)
|
|
155
155
|
def ensure_profiling_pattern_is_passed_to_profiling(
|
|
156
156
|
cls, values: Dict[str, Any]
|
|
157
157
|
) -> Dict[str, Any]:
|
|
@@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel):
|
|
|
72
72
|
description="Whether to profile for the sample values for all columns.",
|
|
73
73
|
)
|
|
74
74
|
|
|
75
|
-
@pydantic.root_validator()
|
|
75
|
+
@pydantic.root_validator(skip_on_failure=True)
|
|
76
76
|
def ensure_field_level_settings_are_normalized(
|
|
77
77
|
cls: "DataLakeProfilerConfig", values: Dict[str, Any]
|
|
78
78
|
) -> Dict[str, Any]:
|
|
@@ -44,7 +44,11 @@ from datahub.ingestion.source.azure.abs_utils import (
|
|
|
44
44
|
get_key_prefix,
|
|
45
45
|
strip_abs_prefix,
|
|
46
46
|
)
|
|
47
|
-
from datahub.ingestion.source.
|
|
47
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
48
|
+
from datahub.ingestion.source.data_lake_common.data_lake_utils import (
|
|
49
|
+
ContainerWUCreator,
|
|
50
|
+
add_partition_columns_to_schema,
|
|
51
|
+
)
|
|
48
52
|
from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
|
|
49
53
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
50
54
|
StaleEntityRemovalHandler,
|
|
@@ -53,10 +57,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
53
57
|
StatefulIngestionSourceBase,
|
|
54
58
|
)
|
|
55
59
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
56
|
-
SchemaField,
|
|
57
|
-
SchemaFieldDataType,
|
|
58
60
|
SchemaMetadata,
|
|
59
|
-
StringTypeClass,
|
|
60
61
|
)
|
|
61
62
|
from datahub.metadata.schema_classes import (
|
|
62
63
|
DataPlatformInstanceClass,
|
|
@@ -128,6 +129,14 @@ class TableData:
|
|
|
128
129
|
@support_status(SupportStatus.INCUBATING)
|
|
129
130
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
130
131
|
@capability(SourceCapability.TAGS, "Can extract ABS object/container tags if enabled")
|
|
132
|
+
@capability(
|
|
133
|
+
SourceCapability.CONTAINERS,
|
|
134
|
+
"Extract ABS containers and folders",
|
|
135
|
+
subtype_modifier=[
|
|
136
|
+
SourceCapabilityModifier.FOLDER,
|
|
137
|
+
SourceCapabilityModifier.ABS_CONTAINER,
|
|
138
|
+
],
|
|
139
|
+
)
|
|
131
140
|
class ABSSource(StatefulIngestionSourceBase):
|
|
132
141
|
source_config: DataLakeSourceConfig
|
|
133
142
|
report: DataLakeSourceReport
|
|
@@ -223,36 +232,12 @@ class ABSSource(StatefulIngestionSourceBase):
|
|
|
223
232
|
fields = sorted(fields, key=lambda f: f.fieldPath)
|
|
224
233
|
|
|
225
234
|
if self.source_config.add_partition_columns_to_schema:
|
|
226
|
-
|
|
235
|
+
add_partition_columns_to_schema(
|
|
227
236
|
fields=fields, path_spec=path_spec, full_path=table_data.full_path
|
|
228
237
|
)
|
|
229
238
|
|
|
230
239
|
return fields
|
|
231
240
|
|
|
232
|
-
def add_partition_columns_to_schema(
|
|
233
|
-
self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
|
|
234
|
-
) -> None:
|
|
235
|
-
vars = path_spec.get_named_vars(full_path)
|
|
236
|
-
if vars is not None and "partition" in vars:
|
|
237
|
-
for partition in vars["partition"].values():
|
|
238
|
-
partition_arr = partition.split("=")
|
|
239
|
-
if len(partition_arr) != 2:
|
|
240
|
-
logger.debug(
|
|
241
|
-
f"Could not derive partition key from partition field {partition}"
|
|
242
|
-
)
|
|
243
|
-
continue
|
|
244
|
-
partition_key = partition_arr[0]
|
|
245
|
-
fields.append(
|
|
246
|
-
SchemaField(
|
|
247
|
-
fieldPath=f"{partition_key}",
|
|
248
|
-
nativeDataType="string",
|
|
249
|
-
type=SchemaFieldDataType(StringTypeClass()),
|
|
250
|
-
isPartitioningKey=True,
|
|
251
|
-
nullable=True,
|
|
252
|
-
recursive=False,
|
|
253
|
-
)
|
|
254
|
-
)
|
|
255
|
-
|
|
256
241
|
def _create_table_operation_aspect(self, table_data: TableData) -> OperationClass:
|
|
257
242
|
reported_time = int(time.time() * 1000)
|
|
258
243
|
|
|
@@ -533,7 +518,7 @@ class ABSSource(StatefulIngestionSourceBase):
|
|
|
533
518
|
)
|
|
534
519
|
path_spec.sample_files = False
|
|
535
520
|
for obj in container_client.list_blobs(
|
|
536
|
-
|
|
521
|
+
name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
|
|
537
522
|
):
|
|
538
523
|
abs_path = self.create_abs_path(obj.name)
|
|
539
524
|
logger.debug(f"Path: {abs_path}")
|
|
@@ -18,6 +18,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, Sour
|
|
|
18
18
|
from datahub.ingestion.api.source_helpers import auto_workunit_reporter
|
|
19
19
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
20
20
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
21
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
21
22
|
from datahub.metadata.schema_classes import (
|
|
22
23
|
DomainsClass,
|
|
23
24
|
GlossaryTermAssociationClass,
|
|
@@ -48,7 +49,7 @@ def apply_association_to_container(
|
|
|
48
49
|
"""
|
|
49
50
|
urns: List[str] = [container_urn]
|
|
50
51
|
if not graph:
|
|
51
|
-
graph = get_default_graph()
|
|
52
|
+
graph = get_default_graph(ClientMode.INGESTION)
|
|
52
53
|
logger.info(f"Using {graph}")
|
|
53
54
|
urns.extend(
|
|
54
55
|
graph.get_urns_by_filter(
|
|
@@ -95,7 +96,7 @@ def apply_association_to_container(
|
|
|
95
96
|
class DomainApplyConfig(ConfigModel):
|
|
96
97
|
assets: List[str] = Field(
|
|
97
98
|
default_factory=list,
|
|
98
|
-
description="List of assets to apply domain
|
|
99
|
+
description="List of assets to apply domain hierarchically. Currently only containers and datasets are supported",
|
|
99
100
|
)
|
|
100
101
|
domain_urn: str = Field(default="")
|
|
101
102
|
|
|
@@ -103,7 +104,7 @@ class DomainApplyConfig(ConfigModel):
|
|
|
103
104
|
class TagApplyConfig(ConfigModel):
|
|
104
105
|
assets: List[str] = Field(
|
|
105
106
|
default_factory=list,
|
|
106
|
-
description="List of assets to apply tag
|
|
107
|
+
description="List of assets to apply tag hierarchically. Currently only containers and datasets are supported",
|
|
107
108
|
)
|
|
108
109
|
tag_urn: str = Field(default="")
|
|
109
110
|
|
|
@@ -111,7 +112,7 @@ class TagApplyConfig(ConfigModel):
|
|
|
111
112
|
class TermApplyConfig(ConfigModel):
|
|
112
113
|
assets: List[str] = Field(
|
|
113
114
|
default_factory=list,
|
|
114
|
-
description="List of assets to apply term
|
|
115
|
+
description="List of assets to apply term hierarchically. Currently only containers and datasets are supported",
|
|
115
116
|
)
|
|
116
117
|
term_urn: str = Field(default="")
|
|
117
118
|
|
|
@@ -119,7 +120,7 @@ class TermApplyConfig(ConfigModel):
|
|
|
119
120
|
class OwnerApplyConfig(ConfigModel):
|
|
120
121
|
assets: List[str] = Field(
|
|
121
122
|
default_factory=list,
|
|
122
|
-
description="List of assets to apply owner
|
|
123
|
+
description="List of assets to apply owner hierarchically. Currently only containers and datasets are supported",
|
|
123
124
|
)
|
|
124
125
|
owner_urn: str = Field(default="")
|
|
125
126
|
|
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import os
|
|
3
2
|
from datetime import datetime, timedelta, timezone
|
|
4
3
|
from enum import Enum
|
|
5
4
|
from http import HTTPStatus
|
|
6
5
|
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
|
|
6
|
+
from urllib.parse import parse_qs, urlparse
|
|
7
7
|
|
|
8
8
|
import boto3
|
|
9
9
|
import requests
|
|
10
10
|
from boto3.session import Session
|
|
11
11
|
from botocore.config import DEFAULT_TIMEOUT, Config
|
|
12
|
+
from botocore.exceptions import ClientError, NoCredentialsError
|
|
12
13
|
from botocore.utils import fix_s3_host
|
|
13
14
|
from pydantic.fields import Field
|
|
14
15
|
|
|
@@ -17,6 +18,16 @@ from datahub.configuration.common import (
|
|
|
17
18
|
ConfigModel,
|
|
18
19
|
PermissiveConfigModel,
|
|
19
20
|
)
|
|
21
|
+
from datahub.configuration.env_vars import (
|
|
22
|
+
get_aws_app_runner_service_id,
|
|
23
|
+
get_aws_execution_env,
|
|
24
|
+
get_aws_lambda_function_name,
|
|
25
|
+
get_aws_role_arn,
|
|
26
|
+
get_aws_web_identity_token_file,
|
|
27
|
+
get_ecs_container_metadata_uri,
|
|
28
|
+
get_ecs_container_metadata_uri_v4,
|
|
29
|
+
get_elastic_beanstalk_environment_name,
|
|
30
|
+
)
|
|
20
31
|
from datahub.configuration.source_common import EnvConfigMixin
|
|
21
32
|
|
|
22
33
|
logger = logging.getLogger(__name__)
|
|
@@ -24,6 +35,7 @@ logger = logging.getLogger(__name__)
|
|
|
24
35
|
if TYPE_CHECKING:
|
|
25
36
|
from mypy_boto3_dynamodb import DynamoDBClient
|
|
26
37
|
from mypy_boto3_glue import GlueClient
|
|
38
|
+
from mypy_boto3_lakeformation import LakeFormationClient
|
|
27
39
|
from mypy_boto3_s3 import S3Client, S3ServiceResource
|
|
28
40
|
from mypy_boto3_sagemaker import SageMakerClient
|
|
29
41
|
from mypy_boto3_sts import STSClient
|
|
@@ -99,27 +111,25 @@ def detect_aws_environment() -> AwsEnvironment:
|
|
|
99
111
|
Order matters as some environments may have multiple indicators.
|
|
100
112
|
"""
|
|
101
113
|
# Check Lambda first as it's most specific
|
|
102
|
-
if
|
|
103
|
-
if
|
|
114
|
+
if get_aws_lambda_function_name():
|
|
115
|
+
if (get_aws_execution_env() or "").startswith("CloudFormation"):
|
|
104
116
|
return AwsEnvironment.CLOUD_FORMATION
|
|
105
117
|
return AwsEnvironment.LAMBDA
|
|
106
118
|
|
|
107
119
|
# Check EKS (IRSA)
|
|
108
|
-
if
|
|
120
|
+
if get_aws_web_identity_token_file() and get_aws_role_arn():
|
|
109
121
|
return AwsEnvironment.EKS
|
|
110
122
|
|
|
111
123
|
# Check App Runner
|
|
112
|
-
if
|
|
124
|
+
if get_aws_app_runner_service_id():
|
|
113
125
|
return AwsEnvironment.APP_RUNNER
|
|
114
126
|
|
|
115
127
|
# Check ECS
|
|
116
|
-
if
|
|
117
|
-
"ECS_CONTAINER_METADATA_URI"
|
|
118
|
-
):
|
|
128
|
+
if get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri():
|
|
119
129
|
return AwsEnvironment.ECS
|
|
120
130
|
|
|
121
131
|
# Check Elastic Beanstalk
|
|
122
|
-
if
|
|
132
|
+
if get_elastic_beanstalk_environment_name():
|
|
123
133
|
return AwsEnvironment.BEANSTALK
|
|
124
134
|
|
|
125
135
|
if is_running_on_ec2():
|
|
@@ -154,7 +164,7 @@ def get_instance_role_arn() -> Optional[str]:
|
|
|
154
164
|
def get_lambda_role_arn() -> Optional[str]:
|
|
155
165
|
"""Get the Lambda function's role ARN"""
|
|
156
166
|
try:
|
|
157
|
-
function_name =
|
|
167
|
+
function_name = get_aws_lambda_function_name()
|
|
158
168
|
if not function_name:
|
|
159
169
|
return None
|
|
160
170
|
|
|
@@ -180,7 +190,7 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
|
|
|
180
190
|
return role_arn, AwsServicePrincipal.LAMBDA.value
|
|
181
191
|
|
|
182
192
|
elif env == AwsEnvironment.EKS:
|
|
183
|
-
role_arn =
|
|
193
|
+
role_arn = get_aws_role_arn()
|
|
184
194
|
return role_arn, AwsServicePrincipal.EKS.value
|
|
185
195
|
|
|
186
196
|
elif env == AwsEnvironment.APP_RUNNER:
|
|
@@ -193,8 +203,8 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
|
|
|
193
203
|
|
|
194
204
|
elif env == AwsEnvironment.ECS:
|
|
195
205
|
try:
|
|
196
|
-
metadata_uri =
|
|
197
|
-
|
|
206
|
+
metadata_uri = (
|
|
207
|
+
get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri()
|
|
198
208
|
)
|
|
199
209
|
if metadata_uri:
|
|
200
210
|
response = requests.get(f"{metadata_uri}/task", timeout=1)
|
|
@@ -454,6 +464,168 @@ class AwsConnectionConfig(ConfigModel):
|
|
|
454
464
|
def get_sagemaker_client(self) -> "SageMakerClient":
|
|
455
465
|
return self.get_session().client("sagemaker", config=self._aws_config())
|
|
456
466
|
|
|
467
|
+
def get_lakeformation_client(self) -> "LakeFormationClient":
|
|
468
|
+
return self.get_session().client("lakeformation", config=self._aws_config())
|
|
469
|
+
|
|
470
|
+
def get_rds_client(self):
|
|
471
|
+
"""Get an RDS client for generating IAM auth tokens."""
|
|
472
|
+
return self.get_session().client("rds", config=self._aws_config())
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def generate_rds_iam_token(
|
|
476
|
+
endpoint: str,
|
|
477
|
+
username: str,
|
|
478
|
+
port: int,
|
|
479
|
+
aws_config: AwsConnectionConfig,
|
|
480
|
+
) -> str:
|
|
481
|
+
"""
|
|
482
|
+
Generate an AWS RDS IAM authentication token.
|
|
483
|
+
|
|
484
|
+
boto3's generate_db_auth_token() returns a presigned URL in the format:
|
|
485
|
+
"hostname:port/?Action=connect&DBUser=username&X-Amz-Date=...&X-Amz-Expires=..."
|
|
486
|
+
|
|
487
|
+
This token should be used as-is by pymysql/psycopg2 drivers.
|
|
488
|
+
|
|
489
|
+
Args:
|
|
490
|
+
endpoint: RDS endpoint hostname
|
|
491
|
+
username: Database username for IAM authentication
|
|
492
|
+
port: Database port (5432 for PostgreSQL, 3306 for MySQL)
|
|
493
|
+
aws_config: AwsConnectionConfig for session management and credentials
|
|
494
|
+
|
|
495
|
+
Returns:
|
|
496
|
+
Authentication token (presigned URL format)
|
|
497
|
+
|
|
498
|
+
Raises:
|
|
499
|
+
ValueError: If AWS credentials are not found or token generation fails
|
|
500
|
+
|
|
501
|
+
"""
|
|
502
|
+
try:
|
|
503
|
+
client = aws_config.get_rds_client()
|
|
504
|
+
token = client.generate_db_auth_token(
|
|
505
|
+
DBHostname=endpoint, Port=port, DBUsername=username
|
|
506
|
+
)
|
|
507
|
+
logger.debug(f"Generated RDS IAM token for {username}@{endpoint}:{port}")
|
|
508
|
+
return token
|
|
509
|
+
except NoCredentialsError as e:
|
|
510
|
+
raise ValueError("AWS credentials not found") from e
|
|
511
|
+
except ClientError as e:
|
|
512
|
+
raise ValueError(f"Failed to generate RDS IAM token: {e}") from e
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
class RDSIAMTokenManager:
|
|
516
|
+
"""
|
|
517
|
+
Manages RDS IAM token lifecycle with automatic refresh.
|
|
518
|
+
|
|
519
|
+
RDS IAM tokens include expiration information in the URL parameters.
|
|
520
|
+
This manager parses the token expiry and refreshes before expiration
|
|
521
|
+
to ensure uninterrupted database access.
|
|
522
|
+
"""
|
|
523
|
+
|
|
524
|
+
def __init__(
|
|
525
|
+
self,
|
|
526
|
+
endpoint: str,
|
|
527
|
+
username: str,
|
|
528
|
+
port: int,
|
|
529
|
+
aws_config: AwsConnectionConfig,
|
|
530
|
+
refresh_threshold_minutes: int = 5,
|
|
531
|
+
):
|
|
532
|
+
"""
|
|
533
|
+
Initialize the token manager.
|
|
534
|
+
|
|
535
|
+
Args:
|
|
536
|
+
endpoint: RDS endpoint hostname
|
|
537
|
+
username: Database username for IAM authentication
|
|
538
|
+
port: Database port
|
|
539
|
+
aws_config: AwsConnectionConfig for session management and credentials
|
|
540
|
+
refresh_threshold_minutes: Refresh token when this many minutes remain before expiry
|
|
541
|
+
"""
|
|
542
|
+
self.endpoint = endpoint
|
|
543
|
+
self.username = username
|
|
544
|
+
self.port = port
|
|
545
|
+
self.aws_config = aws_config
|
|
546
|
+
self.refresh_threshold = timedelta(minutes=refresh_threshold_minutes)
|
|
547
|
+
|
|
548
|
+
self._current_token: Optional[str] = None
|
|
549
|
+
self._token_expires_at: Optional[datetime] = None
|
|
550
|
+
|
|
551
|
+
def get_token(self) -> str:
|
|
552
|
+
"""
|
|
553
|
+
Get current token, refreshing if necessary.
|
|
554
|
+
|
|
555
|
+
Returns:
|
|
556
|
+
Valid authentication token
|
|
557
|
+
|
|
558
|
+
Raises:
|
|
559
|
+
RuntimeError: If token generation or refresh fails
|
|
560
|
+
"""
|
|
561
|
+
if self._needs_refresh():
|
|
562
|
+
self._refresh_token()
|
|
563
|
+
|
|
564
|
+
assert self._current_token is not None
|
|
565
|
+
return self._current_token
|
|
566
|
+
|
|
567
|
+
def _needs_refresh(self) -> bool:
|
|
568
|
+
"""Check if token needs to be refreshed."""
|
|
569
|
+
if self._current_token is None or self._token_expires_at is None:
|
|
570
|
+
return True
|
|
571
|
+
|
|
572
|
+
time_until_expiry = self._token_expires_at - datetime.now(timezone.utc)
|
|
573
|
+
return time_until_expiry <= self.refresh_threshold
|
|
574
|
+
|
|
575
|
+
def _parse_token_expiry(self, token: str) -> datetime:
|
|
576
|
+
"""
|
|
577
|
+
Parse token expiry from X-Amz-Date and X-Amz-Expires URL parameters.
|
|
578
|
+
|
|
579
|
+
Args:
|
|
580
|
+
token: RDS IAM authentication token (presigned URL)
|
|
581
|
+
|
|
582
|
+
Returns:
|
|
583
|
+
Expiration datetime in UTC
|
|
584
|
+
|
|
585
|
+
Raises:
|
|
586
|
+
ValueError: If token URL format is invalid or missing required parameters
|
|
587
|
+
"""
|
|
588
|
+
try:
|
|
589
|
+
parsed_url = urlparse(token)
|
|
590
|
+
query_params = parse_qs(parsed_url.query)
|
|
591
|
+
|
|
592
|
+
# Extract X-Amz-Date (ISO 8601 format: YYYYMMDDTHHMMSSZ)
|
|
593
|
+
amz_date_list = query_params.get("X-Amz-Date")
|
|
594
|
+
if not amz_date_list:
|
|
595
|
+
raise ValueError("Missing X-Amz-Date parameter in RDS IAM token")
|
|
596
|
+
amz_date_str = amz_date_list[0]
|
|
597
|
+
|
|
598
|
+
# Extract X-Amz-Expires (duration in seconds)
|
|
599
|
+
amz_expires_list = query_params.get("X-Amz-Expires")
|
|
600
|
+
if not amz_expires_list:
|
|
601
|
+
raise ValueError("Missing X-Amz-Expires parameter in RDS IAM token")
|
|
602
|
+
amz_expires_seconds = int(amz_expires_list[0])
|
|
603
|
+
|
|
604
|
+
# Parse X-Amz-Date to datetime
|
|
605
|
+
token_issued_at = datetime.strptime(amz_date_str, "%Y%m%dT%H%M%SZ").replace(
|
|
606
|
+
tzinfo=timezone.utc
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
# Calculate expiration
|
|
610
|
+
return token_issued_at + timedelta(seconds=amz_expires_seconds)
|
|
611
|
+
|
|
612
|
+
except (ValueError, KeyError, IndexError) as e:
|
|
613
|
+
raise ValueError(
|
|
614
|
+
f"Failed to parse RDS IAM token expiry: {e}. Token format may be invalid."
|
|
615
|
+
) from e
|
|
616
|
+
|
|
617
|
+
def _refresh_token(self) -> None:
|
|
618
|
+
"""Generate and store a new token with parsed expiry."""
|
|
619
|
+
logger.info("Refreshing RDS IAM authentication token")
|
|
620
|
+
self._current_token = generate_rds_iam_token(
|
|
621
|
+
endpoint=self.endpoint,
|
|
622
|
+
username=self.username,
|
|
623
|
+
port=self.port,
|
|
624
|
+
aws_config=self.aws_config,
|
|
625
|
+
)
|
|
626
|
+
self._token_expires_at = self._parse_token_expiry(self._current_token)
|
|
627
|
+
logger.debug(f"Token will expire at {self._token_expires_at}")
|
|
628
|
+
|
|
457
629
|
|
|
458
630
|
class AwsSourceConfig(EnvConfigMixin, AwsConnectionConfig):
|
|
459
631
|
"""
|