acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/configuration/kafka.py
CHANGED
|
@@ -1,16 +1,36 @@
|
|
|
1
1
|
from pydantic import Field, validator
|
|
2
2
|
|
|
3
3
|
from datahub.configuration.common import ConfigModel, ConfigurationError
|
|
4
|
+
from datahub.configuration.env_vars import (
|
|
5
|
+
get_gms_base_path,
|
|
6
|
+
get_kafka_schema_registry_url,
|
|
7
|
+
)
|
|
4
8
|
from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
|
|
5
9
|
from datahub.configuration.validate_host_port import validate_host_port
|
|
6
10
|
|
|
7
11
|
|
|
12
|
+
def _get_schema_registry_url() -> str:
|
|
13
|
+
"""Get schema registry URL with proper base path handling."""
|
|
14
|
+
explicit_url = get_kafka_schema_registry_url()
|
|
15
|
+
if explicit_url:
|
|
16
|
+
return explicit_url
|
|
17
|
+
|
|
18
|
+
base_path = get_gms_base_path()
|
|
19
|
+
if base_path in ("/", ""):
|
|
20
|
+
base_path = ""
|
|
21
|
+
|
|
22
|
+
return f"http://localhost:8080{base_path}/schema-registry/api/"
|
|
23
|
+
|
|
24
|
+
|
|
8
25
|
class _KafkaConnectionConfig(ConfigModel):
|
|
9
26
|
# bootstrap servers
|
|
10
27
|
bootstrap: str = "localhost:9092"
|
|
11
28
|
|
|
12
29
|
# schema registry location
|
|
13
|
-
schema_registry_url: str = "http://localhost:8080/schema-registry/api/"
|
|
30
|
+
schema_registry_url: str = Field(
|
|
31
|
+
default_factory=_get_schema_registry_url,
|
|
32
|
+
description="Schema registry URL. Can be overridden with KAFKA_SCHEMAREGISTRY_URL environment variable, or will use DATAHUB_GMS_BASE_PATH if not set.",
|
|
33
|
+
)
|
|
14
34
|
|
|
15
35
|
schema_registry_config: dict = Field(
|
|
16
36
|
default_factory=dict,
|
|
@@ -1,20 +1,13 @@
|
|
|
1
1
|
import pydantic.version
|
|
2
2
|
from packaging.version import Version
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
if Version(pydantic.version.VERSION) >= Version("2.0"):
|
|
6
|
-
PYDANTIC_VERSION_2 = True
|
|
7
|
-
else:
|
|
8
|
-
PYDANTIC_VERSION_2 = False
|
|
9
|
-
|
|
4
|
+
_pydantic_version = Version(pydantic.version.VERSION)
|
|
10
5
|
|
|
11
|
-
|
|
12
|
-
if PYDANTIC_VERSION_2:
|
|
13
|
-
from pydantic import PydanticDeprecatedSince20 # type: ignore
|
|
14
|
-
else:
|
|
6
|
+
PYDANTIC_VERSION_2 = _pydantic_version >= Version("2.0")
|
|
15
7
|
|
|
16
|
-
|
|
17
|
-
|
|
8
|
+
# The pydantic.Discriminator type was added in v2.5.0.
|
|
9
|
+
# https://docs.pydantic.dev/latest/changelog/#v250-2023-11-13
|
|
10
|
+
PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR = _pydantic_version >= Version("2.5.0")
|
|
18
11
|
|
|
19
12
|
|
|
20
13
|
if PYDANTIC_VERSION_2:
|
|
@@ -50,7 +43,7 @@ class v1_ConfigModel(v1_BaseModel):
|
|
|
50
43
|
|
|
51
44
|
__all__ = [
|
|
52
45
|
"PYDANTIC_VERSION_2",
|
|
53
|
-
"PydanticDeprecatedSince20",
|
|
46
|
+
"PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR",
|
|
54
47
|
"GenericModel",
|
|
55
48
|
"v1_ConfigModel",
|
|
56
49
|
"v1_Field",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import Dict, Optional
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
import pydantic
|
|
4
4
|
from pydantic.fields import Field
|
|
5
5
|
|
|
6
6
|
from datahub.configuration.common import ConfigModel
|
|
@@ -30,7 +30,8 @@ class EnvConfigMixin(ConfigModel):
|
|
|
30
30
|
description="The environment that all assets produced by this connector belong to",
|
|
31
31
|
)
|
|
32
32
|
|
|
33
|
-
@validator("env")
|
|
33
|
+
@pydantic.field_validator("env", mode="after")
|
|
34
|
+
@classmethod
|
|
34
35
|
def env_must_be_one_of(cls, v: str) -> str:
|
|
35
36
|
if v.upper() not in ALL_ENV_TYPES:
|
|
36
37
|
raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
import warnings
|
|
2
|
-
from typing import Any, Optional, Type
|
|
2
|
+
from typing import TYPE_CHECKING, Any, Optional, Type
|
|
3
3
|
|
|
4
4
|
import pydantic
|
|
5
5
|
|
|
6
6
|
from datahub.configuration.common import ConfigurationWarning
|
|
7
7
|
from datahub.utilities.global_warning_util import add_global_warning
|
|
8
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from pydantic.deprecated.class_validators import V1RootValidator
|
|
11
|
+
|
|
9
12
|
_unset = object()
|
|
10
13
|
|
|
11
14
|
|
|
@@ -13,7 +16,7 @@ def pydantic_field_deprecated(
|
|
|
13
16
|
field: str,
|
|
14
17
|
warn_if_value_is_not: Any = _unset,
|
|
15
18
|
message: Optional[str] = None,
|
|
16
|
-
) ->
|
|
19
|
+
) -> "V1RootValidator":
|
|
17
20
|
if message:
|
|
18
21
|
output = message
|
|
19
22
|
else:
|
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
import warnings
|
|
2
|
-
from typing import Type
|
|
2
|
+
from typing import TYPE_CHECKING, Type
|
|
3
3
|
|
|
4
4
|
import pydantic
|
|
5
5
|
|
|
6
6
|
from datahub.configuration.common import ConfigurationWarning
|
|
7
7
|
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from pydantic.deprecated.class_validators import V1RootValidator
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
def pydantic_removed_field(
|
|
10
13
|
field: str,
|
|
11
14
|
print_warning: bool = True,
|
|
12
|
-
) ->
|
|
15
|
+
) -> "V1RootValidator":
|
|
13
16
|
def _validate_field_removal(cls: Type, values: dict) -> dict:
|
|
14
17
|
if field in values:
|
|
15
18
|
if print_warning:
|
|
@@ -21,6 +24,9 @@ def pydantic_removed_field(
|
|
|
21
24
|
values.pop(field)
|
|
22
25
|
return values
|
|
23
26
|
|
|
27
|
+
# Mark the function as handling a removed field for doc generation
|
|
28
|
+
_validate_field_removal._doc_removed_field = field # type: ignore[attr-defined]
|
|
29
|
+
|
|
24
30
|
# Hack: Pydantic maintains unique list of validators by referring its __name__.
|
|
25
31
|
# https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
|
|
26
32
|
# This hack ensures that multiple field removals do not overwrite each other.
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
import warnings
|
|
2
|
-
from typing import Callable, Type, TypeVar
|
|
2
|
+
from typing import TYPE_CHECKING, Callable, Type, TypeVar
|
|
3
3
|
|
|
4
4
|
import pydantic
|
|
5
5
|
|
|
6
6
|
from datahub.configuration.common import ConfigurationWarning
|
|
7
7
|
from datahub.utilities.global_warning_util import add_global_warning
|
|
8
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from pydantic.deprecated.class_validators import V1RootValidator
|
|
11
|
+
|
|
9
12
|
_T = TypeVar("_T")
|
|
10
13
|
|
|
11
14
|
|
|
@@ -18,7 +21,7 @@ def pydantic_renamed_field(
|
|
|
18
21
|
new_name: str,
|
|
19
22
|
transform: Callable = _default_rename_transform,
|
|
20
23
|
print_warning: bool = True,
|
|
21
|
-
) ->
|
|
24
|
+
) -> "V1RootValidator":
|
|
22
25
|
def _validate_field_rename(cls: Type, values: dict) -> dict:
|
|
23
26
|
if old_name in values:
|
|
24
27
|
if new_name in values:
|
|
@@ -49,6 +52,4 @@ def pydantic_renamed_field(
|
|
|
49
52
|
# validator with pre=True gets all the values that were passed in.
|
|
50
53
|
# Given that a renamed field doesn't show up in the fields list, we can't use
|
|
51
54
|
# the field-level validator, even with a different field name.
|
|
52
|
-
return pydantic.root_validator(pre=True,
|
|
53
|
-
_validate_field_rename
|
|
54
|
-
)
|
|
55
|
+
return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
|
|
@@ -1,9 +1,12 @@
|
|
|
1
|
-
from typing import Optional, Type, Union
|
|
1
|
+
from typing import TYPE_CHECKING, Optional, Type, Union
|
|
2
2
|
|
|
3
3
|
import pydantic
|
|
4
4
|
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from pydantic.deprecated.class_validators import V1Validator
|
|
5
7
|
|
|
6
|
-
|
|
8
|
+
|
|
9
|
+
def pydantic_multiline_string(field: str) -> "V1Validator":
|
|
7
10
|
"""If the field is present and contains an escaped newline, replace it with a real newline.
|
|
8
11
|
|
|
9
12
|
This makes the assumption that the field value is never supposed to have a
|
datahub/emitter/mce_builder.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
import hashlib
|
|
4
4
|
import json
|
|
5
5
|
import logging
|
|
6
|
-
import os
|
|
7
6
|
import re
|
|
8
7
|
import time
|
|
9
8
|
from datetime import datetime, timezone
|
|
@@ -26,6 +25,7 @@ import typing_inspect
|
|
|
26
25
|
from avrogen.dict_wrapper import DictWrapper
|
|
27
26
|
from typing_extensions import assert_never
|
|
28
27
|
|
|
28
|
+
from datahub.configuration.env_vars import get_dataset_urn_to_lower
|
|
29
29
|
from datahub.emitter.enum_helpers import get_enum_options
|
|
30
30
|
from datahub.metadata.schema_classes import (
|
|
31
31
|
AssertionKeyClass,
|
|
@@ -72,9 +72,7 @@ ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass))
|
|
|
72
72
|
|
|
73
73
|
DEFAULT_FLOW_CLUSTER = "prod"
|
|
74
74
|
UNKNOWN_USER = "urn:li:corpuser:unknown"
|
|
75
|
-
DATASET_URN_TO_LOWER: bool = (
|
|
76
|
-
os.getenv("DATAHUB_DATASET_URN_TO_LOWER", "false") == "true"
|
|
77
|
-
)
|
|
75
|
+
DATASET_URN_TO_LOWER: bool = get_dataset_urn_to_lower() == "true"
|
|
78
76
|
|
|
79
77
|
if TYPE_CHECKING:
|
|
80
78
|
from datahub.emitter.mcp_builder import DatahubKey
|
|
@@ -376,6 +374,12 @@ def make_domain_urn(domain: str) -> str:
|
|
|
376
374
|
return f"urn:li:domain:{domain}"
|
|
377
375
|
|
|
378
376
|
|
|
377
|
+
def make_data_product_urn(data_product_id: str) -> str:
    """Build the URN string for a data product.

    Args:
        data_product_id: Either a bare data product id or a string that
            already starts with ``urn:li:dataProduct:``.

    Returns:
        ``data_product_id`` unchanged when it already carries the data
        product URN prefix; otherwise the id with that prefix prepended.
    """
    # Only bare ids need the prefix; an existing URN is passed through so the
    # function can be applied repeatedly without double-prefixing.
    if not data_product_id.startswith("urn:li:dataProduct:"):
        return f"urn:li:dataProduct:{data_product_id}"
    return data_product_id
|
|
381
|
+
|
|
382
|
+
|
|
379
383
|
def make_ml_primary_key_urn(feature_table_name: str, primary_key_name: str) -> str:
|
|
380
384
|
return f"urn:li:mlPrimaryKey:({feature_table_name},{primary_key_name})"
|
|
381
385
|
|
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import functools
|
|
4
4
|
import json
|
|
5
5
|
import logging
|
|
6
|
-
import os
|
|
6
|
+
import re
|
|
7
7
|
import time
|
|
8
8
|
from collections import defaultdict
|
|
9
9
|
from dataclasses import dataclass
|
|
@@ -32,7 +32,6 @@ from typing_extensions import deprecated
|
|
|
32
32
|
from datahub._version import nice_version_name
|
|
33
33
|
from datahub.cli import config_utils
|
|
34
34
|
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
|
|
35
|
-
from datahub.cli.env_utils import get_boolean_env_variable
|
|
36
35
|
from datahub.configuration.common import (
|
|
37
36
|
ConfigEnum,
|
|
38
37
|
ConfigModel,
|
|
@@ -41,6 +40,14 @@ from datahub.configuration.common import (
|
|
|
41
40
|
TraceTimeoutError,
|
|
42
41
|
TraceValidationError,
|
|
43
42
|
)
|
|
43
|
+
from datahub.configuration.env_vars import (
|
|
44
|
+
get_emit_mode,
|
|
45
|
+
get_emitter_trace,
|
|
46
|
+
get_rest_emitter_batch_max_payload_bytes,
|
|
47
|
+
get_rest_emitter_batch_max_payload_length,
|
|
48
|
+
get_rest_emitter_default_endpoint,
|
|
49
|
+
get_rest_emitter_default_retry_max_times,
|
|
50
|
+
)
|
|
44
51
|
from datahub.emitter.generic_emitter import Emitter
|
|
45
52
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
46
53
|
from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
|
|
@@ -60,6 +67,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
|
|
|
60
67
|
MetadataChangeProposal,
|
|
61
68
|
)
|
|
62
69
|
from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
|
|
70
|
+
from datahub.metadata.schema_classes import (
|
|
71
|
+
KEY_ASPECT_NAMES,
|
|
72
|
+
ChangeTypeClass,
|
|
73
|
+
)
|
|
63
74
|
from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature
|
|
64
75
|
|
|
65
76
|
if TYPE_CHECKING:
|
|
@@ -77,11 +88,9 @@ _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on
|
|
|
77
88
|
504,
|
|
78
89
|
]
|
|
79
90
|
_DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
|
|
80
|
-
_DEFAULT_RETRY_MAX_TIMES = int(
|
|
81
|
-
os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
|
|
82
|
-
)
|
|
91
|
+
_DEFAULT_RETRY_MAX_TIMES = int(get_rest_emitter_default_retry_max_times())
|
|
83
92
|
|
|
84
|
-
_DATAHUB_EMITTER_TRACE =
|
|
93
|
+
_DATAHUB_EMITTER_TRACE = get_emitter_trace()
|
|
85
94
|
|
|
86
95
|
_DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
|
|
87
96
|
|
|
@@ -90,18 +99,32 @@ TRACE_INITIAL_BACKOFF = 1.0 # Start with 1 second
|
|
|
90
99
|
TRACE_MAX_BACKOFF = 300.0 # Cap at 5 minutes
|
|
91
100
|
TRACE_BACKOFF_FACTOR = 2.0 # Double the wait time each attempt
|
|
92
101
|
|
|
93
|
-
# The limit is
|
|
102
|
+
# The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
|
|
94
103
|
# for overhead like request headers.
|
|
95
104
|
# This applies to pretty much all calls to GMS.
|
|
96
|
-
INGEST_MAX_PAYLOAD_BYTES =
|
|
105
|
+
INGEST_MAX_PAYLOAD_BYTES = get_rest_emitter_batch_max_payload_bytes()
|
|
97
106
|
|
|
98
107
|
# This limit is somewhat arbitrary. All GMS endpoints will timeout
|
|
99
108
|
# and return a 500 if processing takes too long. To avoid sending
|
|
100
109
|
# too much to the backend and hitting a timeout, we try to limit
|
|
101
110
|
# the number of MCPs we send in a batch.
|
|
102
|
-
BATCH_INGEST_MAX_PAYLOAD_LENGTH =
|
|
103
|
-
|
|
104
|
-
|
|
111
|
+
BATCH_INGEST_MAX_PAYLOAD_LENGTH = get_rest_emitter_batch_max_payload_length()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def preserve_unicode_escapes(obj: Any) -> Any:
    """Recursively convert non-ASCII characters back to ``\\uXXXX`` escape sequences.

    Dict values and list items are processed recursively (dict keys are left
    untouched). In every string, each character outside the ASCII range is
    replaced by its JSON-style escape. Characters above U+FFFF are written as
    a UTF-16 surrogate pair (two ``\\uXXXX`` escapes), because a ``\\u`` escape
    carries exactly four hex digits; emitting five or six digits would be
    decoded as a different character followed by stray digits.

    Args:
        obj: A dict, list, str, or any other value.

    Returns:
        A new dict/list/str with non-ASCII characters escaped, or ``obj``
        itself when it is of any other type.
    """
    if isinstance(obj, dict):
        return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [preserve_unicode_escapes(item) for item in obj]
    if isinstance(obj, str):

        def escape_unicode(match: Any) -> str:
            code_point = ord(match.group(0))
            if code_point > 0xFFFF:
                # Outside the Basic Multilingual Plane: split into a
                # high/low surrogate pair so the escape stays 4 hex digits.
                offset = code_point - 0x10000
                high = 0xD800 + (offset >> 10)
                low = 0xDC00 + (offset & 0x3FF)
                return f"\\u{high:04x}\\u{low:04x}"
            return f"\\u{code_point:04x}"

        return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
    return obj
|
|
105
128
|
|
|
106
129
|
|
|
107
130
|
class EmitMode(ConfigEnum):
|
|
@@ -124,7 +147,7 @@ class EmitMode(ConfigEnum):
|
|
|
124
147
|
|
|
125
148
|
_DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
|
|
126
149
|
EmitMode,
|
|
127
|
-
|
|
150
|
+
get_emit_mode() or EmitMode.SYNC_PRIMARY,
|
|
128
151
|
)
|
|
129
152
|
|
|
130
153
|
|
|
@@ -135,7 +158,7 @@ class RestSinkEndpoint(ConfigEnum):
|
|
|
135
158
|
|
|
136
159
|
DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
|
|
137
160
|
RestSinkEndpoint,
|
|
138
|
-
|
|
161
|
+
get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
|
|
139
162
|
)
|
|
140
163
|
|
|
141
164
|
|
|
@@ -314,6 +337,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
314
337
|
openapi_ingestion: Optional[bool] = None,
|
|
315
338
|
client_mode: Optional[ClientMode] = None,
|
|
316
339
|
datahub_component: Optional[str] = None,
|
|
340
|
+
server_config_refresh_interval: Optional[int] = None,
|
|
317
341
|
):
|
|
318
342
|
if not gms_server:
|
|
319
343
|
raise ConfigurationError("gms server is required")
|
|
@@ -329,6 +353,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
329
353
|
self._openapi_ingestion = (
|
|
330
354
|
openapi_ingestion # Re-evaluated after test connection
|
|
331
355
|
)
|
|
356
|
+
self._server_config_refresh_interval = server_config_refresh_interval
|
|
357
|
+
self._config_fetch_time: Optional[float] = None
|
|
332
358
|
|
|
333
359
|
headers = {
|
|
334
360
|
"X-RestLi-Protocol-Version": "2.0.0",
|
|
@@ -398,7 +424,17 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
398
424
|
Raises:
|
|
399
425
|
ConfigurationError: If there's an error fetching or validating the configuration
|
|
400
426
|
"""
|
|
401
|
-
|
|
427
|
+
|
|
428
|
+
if (
|
|
429
|
+
not hasattr(self, "_server_config")
|
|
430
|
+
or self._server_config is None
|
|
431
|
+
or (
|
|
432
|
+
self._server_config_refresh_interval is not None
|
|
433
|
+
and self._config_fetch_time is not None
|
|
434
|
+
and (time.time() - self._config_fetch_time)
|
|
435
|
+
> self._server_config_refresh_interval
|
|
436
|
+
)
|
|
437
|
+
):
|
|
402
438
|
if self._session is None or self._gms_server is None:
|
|
403
439
|
raise ConfigurationError(
|
|
404
440
|
"Session and URL are required to load configuration"
|
|
@@ -419,6 +455,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
419
455
|
)
|
|
420
456
|
|
|
421
457
|
self._server_config = RestServiceConfig(raw_config=raw_config)
|
|
458
|
+
self._config_fetch_time = time.time()
|
|
422
459
|
self._post_fetch_server_config()
|
|
423
460
|
|
|
424
461
|
else:
|
|
@@ -441,7 +478,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
441
478
|
if self._openapi_ingestion is None:
|
|
442
479
|
# No constructor parameter
|
|
443
480
|
if (
|
|
444
|
-
not
|
|
481
|
+
not get_rest_emitter_default_endpoint()
|
|
445
482
|
and self._session_config.client_mode == ClientMode.SDK
|
|
446
483
|
and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
|
|
447
484
|
):
|
|
@@ -453,6 +490,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
453
490
|
DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
|
|
454
491
|
)
|
|
455
492
|
|
|
493
|
+
def test_connection(self) -> None:
|
|
494
|
+
self.fetch_server_config()
|
|
456
495
|
logger.debug(
|
|
457
496
|
f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
|
|
458
497
|
)
|
|
@@ -460,12 +499,21 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
460
499
|
f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
|
|
461
500
|
)
|
|
462
501
|
|
|
463
|
-
def test_connection(self) -> None:
|
|
464
|
-
self.fetch_server_config()
|
|
465
|
-
|
|
466
502
|
def get_server_config(self) -> dict:
|
|
467
503
|
return self.server_config.raw_config
|
|
468
504
|
|
|
505
|
+
def invalidate_config_cache(self) -> None:
    """Mark the cached server configuration as stale.

    Nothing happens unless a server config has already been stored on the
    instance and a refresh interval is configured. In that case the recorded
    fetch time is moved far enough into the past that the age of the cache
    exceeds the refresh interval.
    """
    # No config stored yet (attribute missing or None): nothing to invalidate.
    if getattr(self, "_server_config", None) is None:
        return

    refresh_interval = self._server_config_refresh_interval
    # Without a refresh interval the fetch time is left untouched.
    if refresh_interval is None:
        return

    # Back-date the fetch time to one second beyond the interval.
    self._config_fetch_time = time.time() - refresh_interval - 1
|
|
516
|
+
|
|
469
517
|
def to_graph(self) -> "DataHubGraph":
|
|
470
518
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
471
519
|
|
|
@@ -538,6 +586,11 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
538
586
|
"systemMetadata": system_metadata_obj,
|
|
539
587
|
}
|
|
540
588
|
payload = json.dumps(snapshot)
|
|
589
|
+
if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
|
|
590
|
+
logger.warning(
|
|
591
|
+
f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
|
|
592
|
+
"so this metadata will likely fail to be emitted."
|
|
593
|
+
)
|
|
541
594
|
|
|
542
595
|
self._emit_generic(url, payload)
|
|
543
596
|
|
|
@@ -584,15 +637,27 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
584
637
|
trace_data = extract_trace_data(response) if response else None
|
|
585
638
|
|
|
586
639
|
else:
|
|
587
|
-
|
|
640
|
+
if mcp.changeType == ChangeTypeClass.DELETE:
|
|
641
|
+
if mcp.aspectName not in KEY_ASPECT_NAMES:
|
|
642
|
+
raise OperationalError(
|
|
643
|
+
f"Delete not supported for non key aspect: {mcp.aspectName} for urn: "
|
|
644
|
+
f"{mcp.entityUrn}"
|
|
645
|
+
)
|
|
588
646
|
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
647
|
+
url = f"{self._gms_server}/entities?action=delete"
|
|
648
|
+
payload_dict = {
|
|
649
|
+
"urn": mcp.entityUrn,
|
|
650
|
+
}
|
|
651
|
+
else:
|
|
652
|
+
url = f"{self._gms_server}/aspects?action=ingestProposal"
|
|
653
|
+
|
|
654
|
+
mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
|
|
655
|
+
payload_dict = {
|
|
656
|
+
"proposal": mcp_obj,
|
|
657
|
+
"async": "true"
|
|
658
|
+
if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
|
|
659
|
+
else "false",
|
|
660
|
+
}
|
|
596
661
|
|
|
597
662
|
payload = json.dumps(payload_dict)
|
|
598
663
|
|
|
@@ -704,16 +769,24 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
704
769
|
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
|
|
705
770
|
|
|
706
771
|
mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
|
|
772
|
+
if len(mcp_objs) == 0:
|
|
773
|
+
return 0
|
|
707
774
|
|
|
708
775
|
# As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
|
|
709
776
|
# If we will exceed the limit, we need to break it up into chunks.
|
|
710
|
-
mcp_obj_chunks: List[List[str]] = []
|
|
711
|
-
current_chunk_size =
|
|
777
|
+
mcp_obj_chunks: List[List[str]] = [[]]
|
|
778
|
+
current_chunk_size = 0
|
|
712
779
|
for mcp_obj in mcp_objs:
|
|
780
|
+
mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
|
|
713
781
|
mcp_obj_size = len(json.dumps(mcp_obj))
|
|
714
782
|
if _DATAHUB_EMITTER_TRACE:
|
|
715
783
|
logger.debug(
|
|
716
|
-
f"Iterating through object with size {mcp_obj_size}
|
|
784
|
+
f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
|
|
785
|
+
)
|
|
786
|
+
if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
|
|
787
|
+
logger.warning(
|
|
788
|
+
f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
|
|
789
|
+
"so this metadata will likely fail to be emitted."
|
|
717
790
|
)
|
|
718
791
|
|
|
719
792
|
if (
|
|
@@ -726,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
726
799
|
current_chunk_size = 0
|
|
727
800
|
mcp_obj_chunks[-1].append(mcp_obj)
|
|
728
801
|
current_chunk_size += mcp_obj_size
|
|
729
|
-
if len(mcp_obj_chunks) >
|
|
802
|
+
if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
|
|
730
803
|
logger.debug(
|
|
731
804
|
f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
|
|
732
805
|
)
|
datahub/entrypoints.py
CHANGED
|
@@ -10,6 +10,7 @@ import click
|
|
|
10
10
|
import datahub._version as datahub_version
|
|
11
11
|
from datahub.cli.check_cli import check
|
|
12
12
|
from datahub.cli.cli_utils import (
|
|
13
|
+
enable_auto_decorators,
|
|
13
14
|
fixup_gms_url,
|
|
14
15
|
generate_access_token,
|
|
15
16
|
make_shim_command,
|
|
@@ -21,6 +22,7 @@ from datahub.cli.docker_cli import docker
|
|
|
21
22
|
from datahub.cli.env_utils import get_boolean_env_variable
|
|
22
23
|
from datahub.cli.exists_cli import exists
|
|
23
24
|
from datahub.cli.get_cli import get
|
|
25
|
+
from datahub.cli.graphql_cli import graphql
|
|
24
26
|
from datahub.cli.ingest_cli import ingest
|
|
25
27
|
from datahub.cli.migrate import migrate
|
|
26
28
|
from datahub.cli.put_cli import put
|
|
@@ -38,7 +40,6 @@ from datahub.cli.timeline_cli import timeline
|
|
|
38
40
|
from datahub.configuration.common import should_show_stack_trace
|
|
39
41
|
from datahub.ingestion.graph.client import get_default_graph
|
|
40
42
|
from datahub.ingestion.graph.config import ClientMode
|
|
41
|
-
from datahub.telemetry import telemetry
|
|
42
43
|
from datahub.utilities._custom_package_loader import model_version_name
|
|
43
44
|
from datahub.utilities.logging_manager import configure_logging
|
|
44
45
|
from datahub.utilities.server_config_util import get_gms_config
|
|
@@ -111,7 +112,6 @@ def datahub(
|
|
|
111
112
|
default=False,
|
|
112
113
|
help="If passed will show server config. Assumes datahub init has happened.",
|
|
113
114
|
)
|
|
114
|
-
@telemetry.with_telemetry()
|
|
115
115
|
def version(include_server: bool = False) -> None:
|
|
116
116
|
"""Print version number and exit."""
|
|
117
117
|
|
|
@@ -131,7 +131,6 @@ def version(include_server: bool = False) -> None:
|
|
|
131
131
|
default=False,
|
|
132
132
|
help="If passed then uses password to initialise token.",
|
|
133
133
|
)
|
|
134
|
-
@telemetry.with_telemetry()
|
|
135
134
|
def init(use_password: bool = False) -> None:
|
|
136
135
|
"""Configure which datahub instance to connect to"""
|
|
137
136
|
|
|
@@ -171,6 +170,7 @@ datahub.add_command(ingest)
|
|
|
171
170
|
datahub.add_command(delete)
|
|
172
171
|
datahub.add_command(exists)
|
|
173
172
|
datahub.add_command(get)
|
|
173
|
+
datahub.add_command(graphql)
|
|
174
174
|
datahub.add_command(put)
|
|
175
175
|
datahub.add_command(state)
|
|
176
176
|
datahub.add_command(telemetry_cli)
|
|
@@ -218,6 +218,9 @@ except ImportError as e:
|
|
|
218
218
|
make_shim_command("actions", "run `pip install acryl-datahub-actions`")
|
|
219
219
|
)
|
|
220
220
|
|
|
221
|
+
# Adding telemetry and upgrade decorators to all commands
|
|
222
|
+
enable_auto_decorators(datahub)
|
|
223
|
+
|
|
221
224
|
|
|
222
225
|
def main(**kwargs):
|
|
223
226
|
# We use threads in a variety of places within our CLI. The multiprocessing
|