acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import functools
 import json
 import logging
-import os
+import re
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -20,18 +20,18 @@ from typing import (
     Sequence,
     Tuple,
     Union,
+    overload,
 )
 
 import pydantic
 import requests
-from deprecated import deprecated
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import HTTPError, RequestException
+from typing_extensions import deprecated
 
 from datahub._version import nice_version_name
 from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
-from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
     ConfigEnum,
     ConfigModel,
@@ -40,10 +40,17 @@ from datahub.configuration.common import (
     TraceTimeoutError,
     TraceValidationError,
 )
-from datahub.…
+from datahub.configuration.env_vars import (
+    get_emit_mode,
+    get_emitter_trace,
+    get_rest_emitter_batch_max_payload_bytes,
+    get_rest_emitter_batch_max_payload_length,
+    get_rest_emitter_default_endpoint,
+    get_rest_emitter_default_retry_max_times,
+)
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
 from datahub.emitter.response_helper import (
     TraceData,
     extract_trace_data,
@@ -51,11 +58,20 @@ from datahub.emitter.response_helper import (
 )
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
+from datahub.ingestion.graph.config import (
+    DATAHUB_COMPONENT_ENV,
+    ClientMode,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
+from datahub.metadata.schema_classes import (
+    KEY_ASPECT_NAMES,
+    ChangeTypeClass,
+)
+from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature
 
 if TYPE_CHECKING:
     from datahub.ingestion.graph.client import DataHubGraph
@@ -72,51 +88,77 @@ _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on
     504,
 ]
 _DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
-_DEFAULT_RETRY_MAX_TIMES = int(
-    os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
-)
+_DEFAULT_RETRY_MAX_TIMES = int(get_rest_emitter_default_retry_max_times())
 
-_DATAHUB_EMITTER_TRACE = …
+_DATAHUB_EMITTER_TRACE = get_emitter_trace()
+
+_DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
 
 TRACE_PENDING_STATUS = "PENDING"
 TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
 TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
 TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
 
-# The limit is …
+# The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
-INGEST_MAX_PAYLOAD_BYTES = …
+INGEST_MAX_PAYLOAD_BYTES = get_rest_emitter_batch_max_payload_bytes()
 
 # This limit is somewhat arbitrary. All GMS endpoints will timeout
 # and return a 500 if processing takes too long. To avoid sending
 # too much to the backend and hitting a timeout, we try to limit
 # the number of MCPs we send in a batch.
-BATCH_INGEST_MAX_PAYLOAD_LENGTH = …
-…
+BATCH_INGEST_MAX_PAYLOAD_LENGTH = get_rest_emitter_batch_max_payload_length()
+
+
+def preserve_unicode_escapes(obj: Any) -> Any:
+    """Recursively convert unicode characters back to escape sequences"""
+    if isinstance(obj, dict):
+        return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [preserve_unicode_escapes(item) for item in obj]
+    elif isinstance(obj, str):
+        # Convert non-ASCII characters back to \u escapes
+        def escape_unicode(match: Any) -> Any:
+            return f"\\u{ord(match.group(0)):04x}"
+
+        return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+    else:
+        return obj
+
+
+class EmitMode(ConfigEnum):
+    # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
+    # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
+    # searchability and consistent reads are required.
+    SYNC_WAIT = auto()
+    # Synchronously updates the primary storage (SQL) but asynchronously updates search storage (Elasticsearch). Provides
+    # a balance between consistency and performance. Suitable for updates that need to be immediately reflected in direct
+    # entity retrievals but where search index consistency can be slightly delayed.
+    SYNC_PRIMARY = auto()
+    # Queues the metadata change for asynchronous processing and returns immediately. The client continues execution without
+    # waiting for the change to be fully processed. Best for high-throughput scenarios where eventual consistency is acceptable.
+    ASYNC = auto()
+    # Queues the metadata change asynchronously but blocks until confirmation that the write has been fully persisted.
+    # More efficient than fully synchronous operations due to backend parallelization and batching while still providing
+    # strong consistency guarantees. Useful when you need confirmation of successful persistence without sacrificing performance.
+    ASYNC_WAIT = auto()
+
+
+_DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
+    EmitMode,
+    get_emit_mode() or EmitMode.SYNC_PRIMARY,
 )
 
 
-class RestTraceMode(ConfigEnum):
-    ENABLED = auto()
-    DISABLED = auto()
-
-
 class RestSinkEndpoint(ConfigEnum):
     RESTLI = auto()
     OPENAPI = auto()
 
 
-…
+DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
     RestSinkEndpoint,
-    …
-)
-
-
-# Supported with v1.0
-DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
-    RestTraceMode,
-    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+    get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
 )
 
 
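Note on the two additions above: preserve_unicode_escapes rewrites non-ASCII characters as literal \uXXXX sequences before payloads are serialized, and EmitMode replaces the old RestTraceMode knob. A minimal sketch of the observable behavior, assuming these module-level names are importable from datahub.emitter.rest_emitter exactly as the diff shows:

    # Sketch under the assumptions above; not part of the diff itself.
    from datahub.emitter.rest_emitter import EmitMode, preserve_unicode_escapes

    # Non-ASCII characters become \uXXXX escapes, recursively through dicts
    # and lists; pure-ASCII strings pass through unchanged.
    assert preserve_unicode_escapes({"name": "café"}) == {"name": "caf\\u00e9"}
    assert preserve_unicode_escapes(["ok", "naïve"]) == ["ok", "na\\u00efve"]

    # Four modes trade consistency against throughput; SYNC_PRIMARY is the
    # default unless get_emit_mode() supplies an override from the environment.
    print([mode for mode in EmitMode])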
@@ -132,12 +174,24 @@ class RequestsSessionConfig(ConfigModel):
     ca_certificate_path: Optional[str] = None
     client_certificate_path: Optional[str] = None
     disable_ssl_verification: bool = False
+    client_mode: Optional[ClientMode] = _DEFAULT_CLIENT_MODE
+    datahub_component: Optional[str] = None
 
     def build_session(self) -> requests.Session:
         session = requests.Session()
 
-        …
+        user_agent = self._get_user_agent_string(session)
+
+        base_headers = {
+            "User-Agent": user_agent,
+            "X-DataHub-Client-Mode": self.client_mode.name
+            if self.client_mode
+            else _DEFAULT_CLIENT_MODE.name,
+            "X-DataHub-Py-Cli-Version": nice_version_name(),
+        }
+
+        headers = {**base_headers, **self.extra_headers}
+        session.headers.update(headers)
 
         if self.client_certificate_path:
             session.cert = self.client_certificate_path
@@ -185,6 +239,59 @@
 
         return session
 
+    @classmethod
+    def get_client_mode_from_session(
+        cls, session: requests.Session
+    ) -> Optional[ClientMode]:
+        """
+        Extract the ClientMode enum from a requests Session by checking the headers.
+
+        Args:
+            session: The requests.Session object to check
+
+        Returns:
+            The corresponding ClientMode enum value if found, None otherwise
+        """
+        # Check if the session has the X-DataHub-Client-Mode header
+        mode_str = session.headers.get("X-DataHub-Client-Mode")
+
+        if not mode_str:
+            return None
+
+        # Try to convert the string value to enum
+        try:
+            # First ensure we're working with a str value
+            if isinstance(mode_str, bytes):
+                mode_str = mode_str.decode("utf-8")
+
+            # Then find the matching enum value
+            for mode in ClientMode:
+                if mode.name == mode_str:
+                    return mode
+
+            # If we got here, no matching enum was found
+            return None
+        except Exception:
+            # Handle any other errors
+            return None
+
+    def _get_user_agent_string(self, session: requests.Session) -> str:
+        """Generate appropriate user agent string based on client mode"""
+        version = nice_version_name()
+        client_mode = self.client_mode if self.client_mode else _DEFAULT_CLIENT_MODE
+
+        if "User-Agent" in session.headers:
+            user_agent = session.headers["User-Agent"]
+            if isinstance(user_agent, bytes):
+                requests_user_agent = " " + user_agent.decode("utf-8")
+            else:
+                requests_user_agent = " " + user_agent
+        else:
+            requests_user_agent = ""
+
+        # 1.0 refers to the user agent string version
+        return f"DataHub-Client/1.0 ({client_mode.name.lower()}; {self.datahub_component if self.datahub_component else DATAHUB_COMPONENT_ENV}; {version}){requests_user_agent}"
+
 
 @dataclass
 class _Chunk:
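The session now advertises who is calling: build_session stamps X-DataHub-Client-Mode, X-DataHub-Py-Cli-Version, and a structured User-Agent, and get_client_mode_from_session recovers the mode from an existing session. A sketch, assuming RequestsSessionConfig's remaining fields all carry defaults:

    # Sketch under the assumptions above; not part of the diff itself.
    from datahub.emitter.rest_emitter import RequestsSessionConfig
    from datahub.ingestion.graph.config import ClientMode

    session = RequestsSessionConfig(client_mode=ClientMode.SDK).build_session()

    # The user agent is shaped like:
    #   DataHub-Client/1.0 (sdk; <component or DATAHUB_COMPONENT_ENV>; <version>) <requests UA>
    print(session.headers["User-Agent"])
    assert session.headers["X-DataHub-Client-Mode"] == "SDK"
    assert RequestsSessionConfig.get_client_mode_from_session(session) is ClientMode.SDK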
@@ -210,8 +317,8 @@ class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
-    _openapi_ingestion: bool
-    …
+    _openapi_ingestion: Optional[bool]
+    _server_config: RestServiceConfig
 
     def __init__(
         self,
@@ -227,8 +334,10 @@
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
-        openapi_ingestion: bool = …
-        …
+        openapi_ingestion: Optional[bool] = None,
+        client_mode: Optional[ClientMode] = None,
+        datahub_component: Optional[str] = None,
+        server_config_refresh_interval: Optional[int] = None,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -240,21 +349,15 @@
 
         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
-        self.server_config: Dict[str, Any] = {}
-        self._openapi_ingestion = openapi_ingestion
-        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
-        …
-        …
-            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        self._openapi_ingestion = (
+            openapi_ingestion  # Re-evaluated after test connection
         )
-        …
-        …
-        logger.debug("Using API Tracing for ingestion.")
+        self._server_config_refresh_interval = server_config_refresh_interval
+        self._config_fetch_time: Optional[float] = None
 
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
-            "X-DataHub-Py-Cli-Version": nice_version_name(),
             "Content-Type": "application/json",
         }
         if token:
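With openapi_ingestion now Optional and defaulting to None, the constructor defers the Restli-vs-OpenAPI decision until the server config has been fetched (see _post_fetch_server_config below). A construction sketch under those assumptions:

    # Sketch under the assumptions above; not part of the diff itself.
    from datahub.emitter.rest_emitter import DataHubRestEmitter
    from datahub.ingestion.graph.config import ClientMode

    emitter = DataHubRestEmitter(
        "http://localhost:8080",
        openapi_ingestion=None,              # None: resolve from env var / client mode / server feature
        client_mode=ClientMode.SDK,          # stamped into headers and the user agent
        datahub_component="my-ingestion",    # hypothetical component label
        server_config_refresh_interval=300,  # optional TTL for the cached /config
    )
    emitter.test_connection()  # fetches /config and logs the chosen endpoint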
@@ -300,39 +403,116 @@
             ca_certificate_path=ca_certificate_path,
             client_certificate_path=client_certificate_path,
             disable_ssl_verification=disable_ssl_verification,
+            client_mode=client_mode,
+            datahub_component=datahub_component,
         )
 
         self._session = self._session_config.build_session()
 
-        …
-        if response.status_code == 200:
-            config: dict = response.json()
-            if config.get("noCode") == "true":
-                self.server_config = config
-                return
+    @property
+    def server_config(self) -> RestServiceConfig:
+        return self.fetch_server_config()
 
-        …
+    # TODO: This should move to DataHubGraph once it no longer inherits from DataHubRestEmitter
+    def fetch_server_config(self) -> RestServiceConfig:
+        """
+        Fetch configuration from the server if not already loaded.
+
+        Returns:
+            The configuration dictionary
+
+        Raises:
+            ConfigurationError: If there's an error fetching or validating the configuration
+        """
+
+        if (
+            not hasattr(self, "_server_config")
+            or self._server_config is None
+            or (
+                self._server_config_refresh_interval is not None
+                and self._config_fetch_time is not None
+                and (time.time() - self._config_fetch_time)
+                > self._server_config_refresh_interval
+            )
+        ):
+            if self._session is None or self._gms_server is None:
                 raise ConfigurationError(
-                    "…
-                    "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
-                    "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
+                    "Session and URL are required to load configuration"
                 )
-        …
-        if response.status_code == …
-        …
+
+            url = f"{self._gms_server}/config"
+            response = self._session.get(url)
+
+            if response.status_code == 200:
+                raw_config = response.json()
+
+                # Validate that we're connected to the correct service
+                if not raw_config.get("noCode") == "true":
+                    raise ConfigurationError(
+                        "You seem to have connected to the frontend service instead of the GMS endpoint. "
+                        "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
+                        "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
+                    )
+
+                self._server_config = RestServiceConfig(raw_config=raw_config)
+                self._config_fetch_time = time.time()
+                self._post_fetch_server_config()
+
+            else:
+                logger.debug(
+                    f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
+                )
+
+                if response.status_code == 401:
+                    message = f"Unable to connect to {url} - got an authentication error: {response.text}."
+                else:
+                    message = f"Unable to connect to {url} with status_code: {response.status_code}."
+
+                message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
+                raise ConfigurationError(message)
+
+        return self._server_config
+
+    def _post_fetch_server_config(self) -> None:
+        # Determine OpenAPI mode
+        if self._openapi_ingestion is None:
+            # No constructor parameter
+            if (
+                not get_rest_emitter_default_endpoint()
+                and self._session_config.client_mode == ClientMode.SDK
+                and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
+            ):
+                # Enable if SDK client and no environment variable specified
+                self._openapi_ingestion = True
             else:
-                …
+                # The system env is specifying the value
+                self._openapi_ingestion = (
+                    DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+                )
+
+    def test_connection(self) -> None:
+        self.fetch_server_config()
+        logger.debug(
+            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        )
+        logger.debug(
+            f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
+        )
 
     def get_server_config(self) -> dict:
-        self.…
-        …
+        return self.server_config.raw_config
+
+    def invalidate_config_cache(self) -> None:
+        """Manually invalidate the configuration cache."""
+        if (
+            hasattr(self, "_server_config")
+            and self._server_config is not None
+            and self._server_config_refresh_interval is not None
+        ):
+            # Set fetch time to beyond TTL in the past to force refresh on next access
+            self._config_fetch_time = (
+                time.time() - self._server_config_refresh_interval - 1
+            )
 
     def to_graph(self) -> "DataHubGraph":
         from datahub.ingestion.graph.client import DataHubGraph
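fetch_server_config caches the GET /config response in a RestServiceConfig and honors the optional TTL set at construction time; invalidate_config_cache backdates the fetch timestamp so the next access re-fetches. A usage sketch, assuming a reachable GMS:

    # Sketch under the assumptions above; not part of the diff itself.
    from datahub.emitter.rest_emitter import DataHubRestEmitter

    emitter = DataHubRestEmitter(
        "http://localhost:8080",
        server_config_refresh_interval=300,  # seconds before the cached config goes stale
    )

    cfg = emitter.server_config        # property: first access performs GET /config
    raw = emitter.get_server_config()  # plain dict view via cfg.raw_config

    emitter.invalidate_config_cache()  # force a re-fetch on the next access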
@@ -342,39 +522,24 @@
     def _to_openapi_request(
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
-        …
-        resolved_async_flag = (
-            async_flag if async_flag is not None else async_default
-        )
-        url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+        emit_mode: EmitMode,
+    ) -> Optional[OpenApiRequest]:
+        """
+        Convert a MetadataChangeProposal to an OpenAPI request format.
 
-        …
-                        "urn": mcp.entityUrn,
-                        mcp.aspectName: {
-                            "value": aspect_value,
-                            "systemMetadata": mcp.systemMetadata.to_obj()
-                            if mcp.systemMetadata
-                            else None,
-                        },
-                    }
-                ],
-            )
-        return None
+        Args:
+            mcp: The metadata change proposal
+            emit_mode: Client emit mode
+
+        Returns:
+            An OpenApiRequest object or None if the MCP doesn't have required fields
+        """
+        return OpenApiRequest.from_mcp(
+            mcp=mcp,
+            gms_server=self._gms_server,
+            async_flag=emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT),
+            search_sync_flag=emit_mode == EmitMode.SYNC_WAIT,
+        )
 
     def emit(
         self,
@@ -385,7 +550,7 @@
             UsageAggregation,
         ],
         callback: Optional[Callable[[Exception, str], None]] = None,
-        …
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
     ) -> None:
         try:
             if isinstance(item, UsageAggregation):
@@ -393,7 +558,7 @@
             elif isinstance(
                 item, (MetadataChangeProposal, MetadataChangeProposalWrapper)
             ):
-                self.emit_mcp(item, …
+                self.emit_mcp(item, emit_mode=emit_mode)
             else:
                 self.emit_mce(item)
         except Exception as e:
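_to_openapi_request now derives the OpenAPI flags from the emit mode instead of taking a raw async_flag. The mapping, restated as a self-contained sketch:

    # Sketch restating the flag mapping used above; not part of the diff itself.
    from datahub.emitter.rest_emitter import EmitMode

    def openapi_flags(emit_mode: EmitMode) -> tuple:
        # (async_flag, search_sync_flag) exactly as the emitter computes them
        return (
            emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT),
            emit_mode == EmitMode.SYNC_WAIT,
        )

    assert openapi_flags(EmitMode.SYNC_WAIT) == (False, True)
    assert openapi_flags(EmitMode.SYNC_PRIMARY) == (False, False)
    assert openapi_flags(EmitMode.ASYNC) == (True, False)
    assert openapi_flags(EmitMode.ASYNC_WAIT) == (True, False)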
@@ -421,42 +586,84 @@
                 "systemMetadata": system_metadata_obj,
             }
         payload = json.dumps(snapshot)
+        if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
+            logger.warning(
+                f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
+                "so this metadata will likely fail to be emitted."
+            )
 
         self._emit_generic(url, payload)
 
+    @overload
+    @deprecated("Use emit_mode instead of async_flag")
+    def emit_mcp(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        *,
+        async_flag: Optional[bool] = None,
+    ) -> None: ...
+
+    @overload
+    def emit_mcp(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        *,
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None: ...
+
     def emit_mcp(
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
-        …
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
+        if async_flag is True:
+            emit_mode = EmitMode.ASYNC
+
         ensure_has_system_metadata(mcp)
 
         trace_data = None
 
         if self._openapi_ingestion:
-            request = self._to_openapi_request(mcp, …
+            request = self._to_openapi_request(mcp, emit_mode)
             if request:
-                response = self._emit_generic(…
+                response = self._emit_generic(
+                    request.url, payload=request.payload, method=request.method
+                )
 
-                if self._should_trace(…
+                if self._should_trace(emit_mode):
                     trace_data = extract_trace_data(response) if response else None
 
         else:
-            …
+            if mcp.changeType == ChangeTypeClass.DELETE:
+                if mcp.aspectName not in KEY_ASPECT_NAMES:
+                    raise OperationalError(
+                        f"Delete not supported for non key aspect: {mcp.aspectName} for urn: "
+                        f"{mcp.entityUrn}"
+                    )
 
-            …
+                url = f"{self._gms_server}/entities?action=delete"
+                payload_dict = {
+                    "urn": mcp.entityUrn,
+                }
+            else:
+                url = f"{self._gms_server}/aspects?action=ingestProposal"
 
-            …
-            payload_dict…
+                mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
+                payload_dict = {
+                    "proposal": mcp_obj,
+                    "async": "true"
+                    if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
+                    else "false",
+                }
 
             payload = json.dumps(payload_dict)
 
             response = self._emit_generic(url, payload)
 
-            if self._should_trace(…
+            if self._should_trace(emit_mode):
                 trace_data = (
                     extract_trace_data_from_mcps(response, [mcp]) if response else None
                 )
@@ -464,15 +671,14 @@
         if trace_data:
             self._await_status(
                 [trace_data],
-                …
+                wait_timeout,
             )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
-        …
-        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
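emit_mcp keeps the old async_flag keyword behind a deprecated overload and coerces async_flag=True to EmitMode.ASYNC; the replacement surface is emit_mode plus wait_timeout. A hedged usage sketch:

    # Sketch under the assumptions above; not part of the diff itself.
    from datetime import timedelta

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import DataHubRestEmitter, EmitMode
    from datahub.metadata.schema_classes import StatusClass

    emitter = DataHubRestEmitter("http://localhost:8080")
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)",
        aspect=StatusClass(removed=False),
    )

    # Preferred: explicit mode; ASYNC_WAIT queues the write but blocks on the
    # trace machinery until persistence is confirmed (bounded by wait_timeout).
    emitter.emit_mcp(mcp, emit_mode=EmitMode.ASYNC_WAIT, wait_timeout=timedelta(minutes=5))

    # Deprecated but still accepted: coerced to EmitMode.ASYNC internally.
    emitter.emit_mcp(mcp, async_flag=True)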
@@ -481,43 +687,46 @@ class DataHubRestEmitter(Closeable, Emitter):
             ensure_has_system_metadata(mcp)
 
         if self._openapi_ingestion:
-            return self._emit_openapi_mcps(mcps,
+            return self._emit_openapi_mcps(mcps, emit_mode, wait_timeout)
         else:
-            return self._emit_restli_mcps(mcps,
+            return self._emit_restli_mcps(mcps, emit_mode)
 
     def _emit_openapi_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
-
-
-        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+        emit_mode: EmitMode,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
-        1. Grouping MCPs by their entity URL
+        1. Grouping MCPs by their entity URL and HTTP method
         2. Breaking down large batches into smaller chunks based on both:
            * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
            * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
 
         The Chunk class encapsulates both the items and their byte size tracking
-        Serializing the items only once with json.dumps(request
+        Serializing the items only once with json.dumps(request.payload) and reusing that
         The chunking logic handles edge cases (always accepting at least one item per chunk)
         The joining logic is efficient with a simple string concatenation
 
         :param mcps: metadata change proposals to transmit
-        :param
+        :param emit_mode: the mode to emit the MCPs
+        :param wait_timeout: timeout for blocking queue
         :return: number of requests
         """
-        #
-        batches: Dict[str, List[_Chunk]] = defaultdict(
+        # Group by entity URL and HTTP method
+        batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
             lambda: [_Chunk(items=[])]
         )  # Initialize with one empty Chunk
 
         for mcp in mcps:
-            request = self._to_openapi_request(mcp,
+            request = self._to_openapi_request(mcp, emit_mode)
             if request:
-
-
-
+                # Create a composite key with both method and URL
+                key = (request.method, request.url)
+                current_chunk = batches[key][-1]  # Get the last chunk
+
+                # Only serialize once - we're serializing a single payload item
+                serialized_item = json.dumps(request.payload[0])
                 item_bytes = len(serialized_item.encode())
 
                 # If adding this item would exceed max_bytes, create a new chunk
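The docstring above describes chunking by byte budget and item count with single-pass serialization. A self-contained sketch of that pattern, with stand-in names and limits (the real `_Chunk` and the `INGEST_MAX_PAYLOAD_BYTES` / `BATCH_INGEST_MAX_PAYLOAD_LENGTH` constants live in this module):

```python
# Illustrative sketch only; limits below are assumed stand-ins.
import json
from dataclasses import dataclass, field
from typing import List

MAX_BYTES = 15 * 1024 * 1024  # stand-in for INGEST_MAX_PAYLOAD_BYTES
MAX_ITEMS = 200               # stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH


@dataclass
class Chunk:
    items: List[str] = field(default_factory=list)
    total_bytes: int = 0

    def add_item(self, serialized: str) -> None:
        # Track byte size as items arrive so nothing is re-serialized later.
        self.items.append(serialized)
        self.total_bytes += len(serialized.encode())

    @staticmethod
    def join(chunk: "Chunk") -> str:
        # Join pre-serialized JSON objects into one JSON array body.
        return "[" + ",".join(chunk.items) + "]"


def chunk_payloads(payloads: List[dict]) -> List[Chunk]:
    chunks = [Chunk()]
    for payload in payloads:
        serialized = json.dumps(payload)
        size = len(serialized.encode())
        current = chunks[-1]
        # Start a new chunk on overflow, but always accept at least one item.
        if current.items and (
            current.total_bytes + size > MAX_BYTES
            or len(current.items) >= MAX_ITEMS
        ):
            current = Chunk()
            chunks.append(current)
        current.add_item(serialized)
    return chunks
```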
@@ -527,18 +736,20 @@ class DataHubRestEmitter(Closeable, Emitter):
                     or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
                 ):
                     new_chunk = _Chunk(items=[])
-                    batches[url].append(new_chunk)
+                    batches[key].append(new_chunk)
                     current_chunk = new_chunk
 
                 current_chunk.add_item(serialized_item)
 
         responses = []
-        for url, chunks in batches.items():
+        for (method, url), chunks in batches.items():
             for chunk in chunks:
-                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                response = self._emit_generic(
+                    url, payload=_Chunk.join(chunk), method=method
+                )
                 responses.append(response)
 
-        if self._should_trace(
+        if self._should_trace(emit_mode):
             trace_data = []
             for response in responses:
                 data = extract_trace_data(response) if response else None
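Grouping by the composite `(method, url)` key means a batch never mixes HTTP verbs. A standalone illustration of that grouping, with a hypothetical `Request` standing in for the emitter's internal OpenAPI request object:

```python
# Standalone illustration; Request is a hypothetical stand-in.
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass
class Request:
    method: str
    url: str
    payload: dict


def group_requests(requests: List[Request]) -> Dict[Tuple[str, str], List[Request]]:
    grouped: Dict[Tuple[str, str], List[Request]] = defaultdict(list)
    for request in requests:
        # POSTs and DELETEs to the same URL land in different batches.
        grouped[(request.method, request.url)].append(request)
    return grouped


reqs = [
    Request("POST", "/openapi/v3/entity/dataset", {"a": 1}),
    Request("DELETE", "/openapi/v3/entity/dataset", {"b": 2}),
]
assert len(group_requests(reqs)) == 2
```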
@@ -546,28 +757,36 @@ class DataHubRestEmitter(Closeable, Emitter):
                     trace_data.append(data)
 
             if trace_data:
-                self._await_status(trace_data, trace_timeout)
+                self._await_status(trace_data, wait_timeout)
 
         return len(responses)
 
     def _emit_restli_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
-
+        emit_mode: EmitMode,
     ) -> int:
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
 
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
+        if len(mcp_objs) == 0:
+            return 0
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
         # If we will exceed the limit, we need to break it up into chunks.
-        mcp_obj_chunks: List[List[str]] = []
-        current_chunk_size =
+        mcp_obj_chunks: List[List[str]] = [[]]
+        current_chunk_size = 0
         for mcp_obj in mcp_objs:
+            mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
             mcp_obj_size = len(json.dumps(mcp_obj))
             if _DATAHUB_EMITTER_TRACE:
                 logger.debug(
-                    f"Iterating through object with size {mcp_obj_size}"
+                    f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
+                )
+            if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
+                logger.warning(
+                    f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
+                    "so this metadata will likely fail to be emitted."
                 )
 
             if (
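The new per-object guard flags any single MCP that can never fit into one request, a case where chunking cannot help. A sketch of the same check, with an assumed value for the payload limit:

```python
# Sketch of the per-object size guard; 15 MB is an assumed stand-in for
# the server's INGEST_MAX_PAYLOAD_BYTES limit.
import json
import logging

logger = logging.getLogger(__name__)
INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024


def check_mcp_size(mcp_obj: dict) -> int:
    size = len(json.dumps(mcp_obj))
    identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
    if size > INGEST_MAX_PAYLOAD_BYTES:
        # Chunking cannot help here: one object already exceeds the limit.
        logger.warning(
            f"MCP object {identifier} has size {size}, which exceeds "
            f"{INGEST_MAX_PAYLOAD_BYTES}; the emit will likely fail."
        )
    return size
```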
@@ -580,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
-        if len(mcp_obj_chunks) >
+        if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
             logger.debug(
                 f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
             )
@@ -588,16 +807,19 @@ class DataHubRestEmitter(Closeable, Emitter):
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
             # the size when chunking, and again for the actual request.
-            payload_dict: dict = {
-                "proposals": mcp_obj_chunk,
-            }
+            payload_dict: dict = {
+                "proposals": mcp_obj_chunk,
+                "async": "true"
+                if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
+                else "false",
+            }
 
             payload = json.dumps(payload_dict)
             self._emit_generic(url, payload)
 
         return len(mcp_obj_chunks)
 
-    @deprecated
+    @deprecated("Use emit with a datasetUsageStatistics aspect instead")
     def emit_usage(self, usageStats: UsageAggregation) -> None:
         url = f"{self._gms_server}/usageStats?action=batchIngest"
 
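On the Rest.li path, the emit mode collapses into the legacy `async` request flag. A tiny sketch of that mapping; only `ASYNC` and `ASYNC_WAIT` are confirmed by this diff, and the other enum member here is assumed:

```python
# Sketch of the emit-mode -> Rest.li "async" flag mapping; SYNC is assumed.
from enum import Enum


class EmitMode(Enum):
    SYNC = "sync"            # assumed synchronous member
    ASYNC = "async"
    ASYNC_WAIT = "async_wait"


def restli_async_flag(emit_mode: EmitMode) -> str:
    # Both async variants enqueue server-side; ASYNC_WAIT additionally
    # polls the trace API afterwards (see _await_status above).
    return "true" if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT) else "false"


payload = {"proposals": [], "async": restli_async_flag(EmitMode.ASYNC_WAIT)}
```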
@@ -608,11 +830,13 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+    def _emit_generic(
+        self, url: str, payload: Union[str, Any], method: str = "POST"
+    ) -> requests.Response:
         if not isinstance(payload, str):
             payload = json.dumps(payload)
 
-        curl_command = make_curl_command(self._session, "POST", url, payload)
+        curl_command = make_curl_command(self._session, method, url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
             # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
@@ -625,7 +849,8 @@ class DataHubRestEmitter(Closeable, Emitter):
                 curl_command,
             )
         try:
-
+            method_func = getattr(self._session, method.lower())
+            response = method_func(url, data=payload) if payload else method_func(url)
             response.raise_for_status()
             return response
         except HTTPError as e:
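`requests.Session` exposes one method per HTTP verb with a compatible shape, so the verb string can be dispatched via `getattr`, as the hunk above does. A standalone sketch of the same pattern (httpbin.org is just a convenient echo service for the usage line):

```python
from typing import Optional

import requests


def send(
    session: requests.Session, method: str, url: str, payload: Optional[str] = None
) -> requests.Response:
    # session.get / session.post / session.delete share a signature shape,
    # so getattr on the lowercased verb picks the right callable.
    method_func = getattr(session, method.lower())
    response = method_func(url, data=payload) if payload else method_func(url)
    response.raise_for_status()
    return response


with requests.Session() as session:
    send(session, "POST", "https://httpbin.org/post", payload='{"ok": true}')
```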
@@ -661,7 +886,7 @@ class DataHubRestEmitter(Closeable, Emitter):
     def _await_status(
         self,
         trace_data: List[TraceData],
-        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
         """Verify the status of asynchronous write operations.
         Args:
@@ -671,8 +896,8 @@ class DataHubRestEmitter(Closeable, Emitter):
             TraceTimeoutError: If verification fails or times out
             TraceValidationError: Expected write was not completed successfully
         """
-        if trace_timeout is None:
-            raise ValueError("trace_timeout cannot be None")
+        if wait_timeout is None:
+            raise ValueError("wait_timeout cannot be None")
 
         try:
             if not trace_data:
@@ -685,9 +910,9 @@ class DataHubRestEmitter(Closeable, Emitter):
             current_backoff = TRACE_INITIAL_BACKOFF
 
             while trace.data:
-                if datetime.now() - start_time > trace_timeout:
+                if datetime.now() - start_time > wait_timeout:
                     raise TraceTimeoutError(
-                        f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        f"Timeout waiting for async write completion after {wait_timeout.total_seconds()} seconds"
                     )
 
                 base_url = f"{self._gms_server}/openapi/v1/trace/write"
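The surrounding verification loop is a deadline check layered on exponential backoff. A self-contained sketch of that control flow, with assumed backoff constants and a hypothetical `fetch_status` callable standing in for the trace API call:

```python
# Self-contained sketch of the deadline + exponential backoff poll used for
# trace verification. Constants and fetch_status are illustrative stand-ins.
import time
from datetime import datetime, timedelta
from typing import Callable

INITIAL_BACKOFF = 1.0   # seconds; assumed analogue of TRACE_INITIAL_BACKOFF
MAX_BACKOFF = 30.0      # assumed analogue of the trace max backoff
BACKOFF_FACTOR = 2.0


def await_completion(
    fetch_status: Callable[[], bool],
    wait_timeout: timedelta = timedelta(seconds=3600),
) -> None:
    start_time = datetime.now()
    backoff = INITIAL_BACKOFF
    while not fetch_status():  # True once the async write is persisted
        if datetime.now() - start_time > wait_timeout:
            raise TimeoutError(
                f"Timeout waiting for async write completion after "
                f"{wait_timeout.total_seconds()} seconds"
            )
        time.sleep(backoff)
        backoff = min(backoff * BACKOFF_FACTOR, MAX_BACKOFF)
```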
@@ -700,7 +925,7 @@ class DataHubRestEmitter(Closeable, Emitter):
                 for aspect_name, aspect_status in aspects.items():
                     if not aspect_status["success"]:
                         error_msg = (
-                            f"Unable to validate async write to DataHub GMS: "
+                            f"Unable to validate async write {trace.trace_id} ({trace.extract_timestamp()}) to DataHub GMS: "
                             f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
                             f"Status: {aspect_status}"
                         )
@@ -739,17 +964,28 @@ class DataHubRestEmitter(Closeable, Emitter):
             logger.error(f"Error during status verification: {str(e)}")
             raise
 
-    def _should_trace(
-
-
-
-
-
-
-
-
-
-
+    def _should_trace(self, emit_mode: EmitMode, warn: bool = True) -> bool:
+        if emit_mode == EmitMode.ASYNC_WAIT:
+            if not bool(self._openapi_ingestion):
+                if warn:
+                    logger.warning(
+                        f"{emit_mode} requested but is only available when using OpenAPI."
+                    )
+                return False
+            elif getattr(
+                self, "server_config", None
+            ) is None or not self.server_config.supports_feature(
+                ServiceFeature.API_TRACING
+            ):
+                if warn:
+                    logger.warning(
+                        f"{emit_mode} requested but is only available with a newer GMS version."
+                    )
+                return False
+            else:
+                return True
+        else:
+            return False
 
     def __repr__(self) -> str:
         token_str = (