acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/upgrade/upgrade.py
CHANGED
@@ -7,13 +7,15 @@ from typing import Any, Callable, Optional, Tuple, TypeVar
 
 import click
 import humanfriendly
-from packaging.version import Version
+from packaging.version import InvalidVersion, Version
 from pydantic import BaseModel
 
 from datahub._version import __version__
 from datahub.cli.config_utils import load_client_config
 from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.server_config_util import RestServiceConfig
 
 log = logging.getLogger(__name__)
 
@@ -26,10 +28,24 @@ class VersionStats(BaseModel, arbitrary_types_allowed=True):
     release_date: Optional[datetime] = None
 
 
+def _safe_version_stats(version_string: str) -> Optional[VersionStats]:
+    """
+    Safely create a VersionStats object from a version string.
+    Returns None if the version string is invalid.
+    """
+    try:
+        return VersionStats(version=Version(version_string), release_date=None)
+    except InvalidVersion:
+        log.warning(f"Invalid version format received: {version_string!r}")
+        return None
+
+
 class ServerVersionStats(BaseModel):
     current: VersionStats
     latest: Optional[VersionStats] = None
     current_server_type: Optional[str] = None
+    current_server_default_cli_version: Optional[VersionStats] = None
+    is_cloud_server: Optional[bool] = None
 
 
 class ClientVersionStats(BaseModel):
@@ -42,7 +58,7 @@ class DataHubVersionStats(BaseModel):
     client: ClientVersionStats
 
 
-async def get_client_version_stats():
+async def get_client_version_stats() -> ClientVersionStats:
     import aiohttp
 
     current_version_string = __version__
@@ -50,6 +66,7 @@ async def get_client_version_stats():
     client_version_stats: ClientVersionStats = ClientVersionStats(
         current=VersionStats(version=current_version, release_date=None), latest=None
     )
+
     async with aiohttp.ClientSession() as session:
         pypi_url = "https://pypi.org/pypi/acryl_datahub/json"
         async with session.get(pypi_url) as resp:
@@ -109,7 +126,7 @@ async def get_github_stats():
     return (latest_server_version, latest_server_date)
 
 
-async def get_server_config(gms_url: str, token: Optional[str]) -> dict:
+async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceConfig:
     import aiohttp
 
     headers = {
@@ -124,19 +141,22 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> dict:
         config_endpoint = f"{gms_url}/config"
         async with session.get(config_endpoint, headers=headers) as dh_response:
             dh_response_json = await dh_response.json()
-            return dh_response_json
+            return RestServiceConfig(raw_config=dh_response_json)
 
 
 async def get_server_version_stats(
     server: Optional[DataHubGraph] = None,
-) -> Tuple[
+) -> Tuple[
+    Optional[str], Optional[Version], Optional[str], Optional[datetime], Optional[bool]
+]:
     import aiohttp
 
-    server_config = None
+    server_config: Optional[RestServiceConfig] = None
    if not server:
        try:
            # let's get the server from the cli config
            client_config = load_client_config()
+            client_config.client_mode = ClientMode.CLI
            host = client_config.server
            token = client_config.token
            server_config = await get_server_config(host, token)
@@ -148,17 +168,15 @@ async def get_server_version_stats(
 
     server_type = None
     server_version: Optional[Version] = None
+    current_server_default_cli_version = None
     current_server_release_date = None
+    is_cloud_server: Optional[bool] = None
     if server_config:
-        server_version_string =
-
-
-
-
-        commit_hash = (
-            server_config.get("versions", {}).get("acryldata/datahub", {}).get("commit")
-        )
-        server_type = server_config.get("datahub", {}).get("serverType", "unknown")
+        server_version_string = server_config.service_version
+        commit_hash = server_config.commit_hash
+        server_type = server_config.server_type
+        current_server_default_cli_version = server_config.default_cli_version
+        is_cloud_server = server_config.is_datahub_cloud
         if server_type == "quickstart" and commit_hash:
             async with aiohttp.ClientSession(
                 headers={"Accept": "application/vnd.github.v3+json"}
@@ -173,7 +191,13 @@
     if server_version_string and server_version_string.startswith("v"):
         server_version = Version(server_version_string[1:])
 
-    return (
+    return (
+        server_type,
+        server_version,
+        current_server_default_cli_version,
+        current_server_release_date,
+        is_cloud_server,
+    )
 
 
 def retrieve_version_stats(
@@ -216,7 +240,9 @@ async def _retrieve_version_stats(
     (
         current_server_type,
         current_server_version,
+        current_server_default_cli_version,
         current_server_release_date,
+        is_cloud_server,
     ) = results[2]
 
     server_version_stats = None
@@ -225,12 +251,18 @@
         current=VersionStats(
             version=current_server_version, release_date=current_server_release_date
         ),
+        current_server_default_cli_version=(
+            _safe_version_stats(current_server_default_cli_version)
+            if current_server_default_cli_version
+            else None
+        ),
         latest=(
             VersionStats(version=last_server_version, release_date=last_server_date)
             if last_server_version
             else None
         ),
         current_server_type=current_server_type,
+        is_cloud_server=is_cloud_server,
     )
 
     if client_version_stats and server_version_stats:
@@ -257,21 +289,14 @@ def valid_client_version(version: Version) -> bool:
     """Only version strings like 0.4.5 and 0.6.7.8 are valid. 0.8.6.7rc1 is not"""
     if version.is_prerelease or version.is_postrelease or version.is_devrelease:
         return False
-
-    return True
-
-    return False
+    return True
 
 
 def valid_server_version(version: Version) -> bool:
     """Only version strings like 0.8.x, 0.9.x or 0.10.x are valid. 0.1.x is not"""
     if version.is_prerelease or version.is_postrelease or version.is_devrelease:
         return False
-
-    if version.major == 0 and version.minor in [8, 9, 10]:
-        return True
-
-    return False
+    return True
 
 
 def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
@@ -293,6 +318,27 @@ def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
     return server.version.micro - client.version.micro
 
 
+def is_server_default_cli_ahead(version_stats: DataHubVersionStats) -> bool:
+    """
+    Check if the server default CLI version is ahead of the current CLI version.
+    Returns True if server default CLI is newer and both versions are valid.
+    """
+    if not version_stats.server.current_server_default_cli_version:
+        return False
+
+    current_cli = version_stats.client.current
+    server_default_cli = version_stats.server.current_server_default_cli_version
+
+    is_valid_client_version = valid_client_version(current_cli.version)
+    is_valid_server_version = valid_client_version(server_default_cli.version)
+
+    if not (is_valid_client_version and is_valid_server_version):
+        return False
+
+    compatibility_result = is_client_server_compatible(current_cli, server_default_cli)
+    return compatibility_result > 0
+
+
 def _maybe_print_upgrade_message(
     version_stats: Optional[DataHubVersionStats],
 ) -> None:
@@ -314,9 +360,15 @@ def _maybe_print_upgrade_message(
         if version_stats.client.latest
         else None
     )
-    client_server_compat =
-
-
+    client_server_compat = 0
+    # Skip version compatibility checks for cloud servers (serverEnv="cloud")
+    # Cloud servers use different versioning schemes between server and CLI
+    is_cloud = version_stats.server.is_cloud_server
+
+    if not is_cloud:
+        client_server_compat = is_client_server_compatible(
+            version_stats.client.current, version_stats.server.current
+        )
 
     if latest_release_date and current_release_date:
         assert version_stats.client.latest
@@ -379,7 +431,8 @@ def _maybe_print_upgrade_message(
             + click.style(
                 f"➡️ Upgrade via \"pip install 'acryl-datahub=={version_stats.server.current.version}'\"",
                 fg="cyan",
-            )
+            ),
+            err=True,
         )
     elif client_server_compat == 0 and encourage_cli_upgrade:
         with contextlib.suppress(Exception):
@@ -389,7 +442,8 @@ def _maybe_print_upgrade_message(
             + click.style(
                 f"You seem to be running an old version of datahub cli: {current_version} {get_days(current_release_date)}. Latest version is {latest_version} {get_days(latest_release_date)}.\nUpgrade via \"pip install -U 'acryl-datahub'\"",
                 fg="cyan",
-            )
+            ),
+            err=True,
         )
     elif encourage_quickstart_upgrade:
         try:
@@ -429,6 +483,8 @@ def check_upgrade_post(
 
 
 def check_upgrade(func: Callable[..., T]) -> Callable[..., T]:
+    log.debug(f"Checking upgrade for {func.__module__}.{func.__name__}")
+
     @wraps(func)
     def async_wrapper(*args: Any, **kwargs: Any) -> Any:
         with PerfTimer() as timer:
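The new `_safe_version_stats` helper keeps the upgrade check from failing on malformed version strings reported by the server. A minimal sketch of the parsing guard it is built on (the invalid input below is made up for illustration):

```python
# Sketch of the guard behind _safe_version_stats: packaging raises InvalidVersion
# for non-PEP-440 strings, which the helper turns into None instead of an error.
from packaging.version import InvalidVersion, Version

def parse_or_none(raw: str):
    try:
        return Version(raw)
    except InvalidVersion:
        return None

print(parse_or_none("1.3.0.1rc9"))     # -> 1.3.0.1rc9
print(parse_or_none("not-a-version"))  # -> None
```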
datahub/utilities/file_backed_collections.py
CHANGED

@@ -1,7 +1,6 @@
 import collections
 import gzip
 import logging
-import os
 import pathlib
 import pickle
 import shutil
@@ -28,18 +27,18 @@ from typing import (
     Union,
 )
 
+from datahub.configuration.env_vars import get_override_sqlite_version_req
 from datahub.ingestion.api.closeable import Closeable
 from datahub.utilities.sentinels import Unset, unset
 
 logger: logging.Logger = logging.getLogger(__name__)
 
-
-
-
-
-
-
-)
+
+def _get_sqlite_version_override() -> bool:
+    """Check if SQLite version requirement should be overridden at runtime."""
+    override_str = get_override_sqlite_version_req()
+    return bool(override_str and override_str.lower() != "false")
+
 
 _DEFAULT_FILE_NAME = "sqlite.db"
 _DEFAULT_TABLE_NAME = "data"
@@ -231,7 +230,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
             # This was added in 3.24.0 from 2018-06-04.
             # See https://www.sqlite.org/lang_conflict.html
-            if
+            if _get_sqlite_version_override():
                 self._use_sqlite_on_conflict = False
             else:
                 raise RuntimeError("SQLite version 3.24.0 or later is required")
@@ -250,7 +249,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                 key TEXT UNIQUE,
                 value BLOB
-                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns
+                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns)}
             )"""
         )
 
@@ -267,7 +266,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         if self.indexes_created:
             return
         # The key column will automatically be indexed, but we need indexes for the extra columns.
-        for column_name in self.extra_columns
+        for column_name in self.extra_columns:
             self._conn.execute(
                 f"CREATE INDEX {self.tablename}_{column_name} ON {self.tablename} ({column_name})"
             )
@@ -305,12 +304,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {"".join(f", {column_name}" for column_name in self.extra_columns
+                    {"".join(f", {column_name}" for column_name in self.extra_columns)}
                 )
                 VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
                 ON CONFLICT (key) DO UPDATE SET
                     value = excluded.value
-                    {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns
+                    {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns)}
                 """,
                 items_to_write,
             )
@@ -321,7 +320,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {"".join(f", {column_name}" for column_name in self.extra_columns
+                    {"".join(f", {column_name}" for column_name in self.extra_columns)}
                 )
                 VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
                 item,
@@ -330,7 +329,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             self._conn.execute(
                 f"""UPDATE {self.tablename} SET
                     value = ?
-                    {"".join(f", {column_name} = ?" for column_name in self.extra_columns
+                    {"".join(f", {column_name} = ?" for column_name in self.extra_columns)}
                 WHERE key = ?""",
                 (*item[1:], item[0]),
             )
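A small sketch of the runtime override check introduced above: `_get_sqlite_version_override()` treats any non-empty value other than "false" (case-insensitive) as opting out of the SQLite 3.24.0 ON CONFLICT requirement. The standalone function below mirrors that rule with illustrative inputs:

```python
# Mirrors the truthiness rule in _get_sqlite_version_override(); the sample values
# stand in for whatever get_override_sqlite_version_req() returns at runtime.
def interpret_override(override_str):
    return bool(override_str and override_str.lower() != "false")

assert interpret_override(None) is False     # unset: keep requiring SQLite >= 3.24.0
assert interpret_override("false") is False
assert interpret_override("False") is False
assert interpret_override("1") is True       # any other non-empty value enables the override
```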
datahub/utilities/hive_schema_to_avro.py
CHANGED

@@ -155,7 +155,7 @@ class HiveColumnToAvroConverter:
 
     @staticmethod
     def _parse_basic_datatype_string(s: str) -> Dict[str, object]:
-        if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE
+        if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE:
             return {
                 "type": HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE[s],
                 "native_data_type": s,
@@ -218,7 +218,7 @@ class HiveColumnToAvroConverter:
         buf = ""
         level = 0
         for c in s:
-            if c in HiveColumnToAvroConverter._BRACKETS
+            if c in HiveColumnToAvroConverter._BRACKETS:
                 level += 1
                 buf += c
             elif c in HiveColumnToAvroConverter._BRACKETS.values():
datahub/utilities/ingest_utils.py
CHANGED

@@ -32,10 +32,10 @@ def deploy_source_vars(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> dict:
datahub/utilities/is_pytest.py
CHANGED
datahub/utilities/logging_manager.py
CHANGED

@@ -15,13 +15,13 @@ import collections
 import contextlib
 import itertools
 import logging
-import os
 import pathlib
 import sys
 from typing import Deque, Iterator, Optional
 
 import click
 
+from datahub.configuration.env_vars import get_no_color, get_suppress_logging_manager
 from datahub.utilities.tee_io import TeeIO
 
 
 BASE_LOGGING_FORMAT = (
@@ -38,7 +38,7 @@ IN_MEMORY_LOG_BUFFER_SIZE = 2000  # lines
 IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH = 2000  # characters
 
 
-NO_COLOR =
+NO_COLOR = get_no_color()
 
 
 def extract_name_from_filename(filename: str, fallback_name: str) -> str:
@@ -161,6 +161,7 @@ class _LogBuffer:
         self._buffer: Deque[str] = collections.deque(maxlen=maxlen)
 
     def write(self, line: str) -> None:
+        # We do not expect `line` to have a trailing newline.
         if len(line) > IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH:
             line = line[:IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH] + "[truncated]"
 
@@ -178,6 +179,18 @@ class _LogBuffer:
         return text
 
 
+class _ResilientStreamHandler(logging.StreamHandler):
+    """StreamHandler that gracefully handles closed streams."""
+
+    def emit(self, record: logging.LogRecord) -> None:
+        try:
+            super().emit(record)
+        except (ValueError, OSError):
+            # Stream was closed (e.g., during pytest teardown)
+            # Silently ignore to prevent test failures
+            pass
+
+
 class _BufferLogHandler(logging.Handler):
     def __init__(self, storage: _LogBuffer) -> None:
         super().__init__()
@@ -188,13 +201,23 @@ class _BufferLogHandler(logging.Handler):
             message = self.format(record)
         except TypeError as e:
             message = f"Error formatting log message: {e}\nMessage: {record.msg}, Args: {record.args}"
-
+
+        # For exception stack traces, the message is split over multiple lines,
+        # but we store it as a single string. Because we truncate based on line
+        # length, it's better for us to split it into multiple lines so that we
+        # don't lose any information on deeper stack traces.
+        for line in message.split("\n"):
+            self._storage.write(line)
 
 
 def _remove_all_handlers(logger: logging.Logger) -> None:
     for handler in logger.handlers[:]:
         logger.removeHandler(handler)
-
+        try:
+            handler.close()
+        except (ValueError, OSError):
+            # Handler stream may already be closed (e.g., during pytest teardown)
+            pass
 
 
 _log_buffer = _LogBuffer(maxlen=IN_MEMORY_LOG_BUFFER_SIZE)
@@ -212,14 +235,14 @@ _default_formatter = logging.Formatter(BASE_LOGGING_FORMAT)
 def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
     _log_buffer.clear()
 
-    if
+    if get_suppress_logging_manager() == "1":
         # If we're running in pytest, we don't want to configure logging.
         yield
         return
 
     with contextlib.ExitStack() as stack:
         # Create stdout handler.
-        stream_handler =
+        stream_handler = _ResilientStreamHandler()
         stream_handler.addFilter(_DatahubLogFilter(debug=debug))
         stream_handler.setFormatter(_stream_formatter)
 
@@ -230,7 +253,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
             tee = TeeIO(sys.stdout, file)
             stack.enter_context(contextlib.redirect_stdout(tee))  # type: ignore
 
-            file_handler =
+            file_handler = _ResilientStreamHandler(file)
             file_handler.addFilter(_DatahubLogFilter(debug=True))
             file_handler.setFormatter(_default_formatter)
         else:
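A short sketch of why `_BufferLogHandler` now splits messages before buffering: the in-memory buffer truncates each line independently, so storing a multi-line stack trace as one string would lose everything past the per-line limit. The constant and trace below are illustrative stand-ins:

```python
# Stand-in for the buffer's per-line truncation; the real limit
# (IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH) is 2000 characters.
MAX_LINE = 40

def buffer_message(buffer, message):
    # Split multi-line messages so each line is truncated on its own.
    for line in message.split("\n"):
        if len(line) > MAX_LINE:
            line = line[:MAX_LINE] + "[truncated]"
        buffer.append(line)

buf = []
buffer_message(buf, "Traceback (most recent call last):\n  File 'x.py', line 1\nValueError: boom")
print(buf)
# ['Traceback (most recent call last):', "  File 'x.py', line 1", 'ValueError: boom']
```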
datahub/utilities/mapping.py
CHANGED
@@ -83,7 +83,7 @@ class Constants:
     MATCH = "match"
     USER_OWNER = "user"
     GROUP_OWNER = "group"
-    OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float]
+    OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float, list]
     TAG_PARTITION_KEY = "PARTITION_KEY"
     TAG_DIST_KEY = "DIST_KEY"
     TAG_SORT_KEY = "SORT_KEY"
@@ -455,7 +455,34 @@ class OperationProcessor:
         # function to check if a match clause is satisfied to a value.
         if not any(
             isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED
-        )
+        ):
+            return None
+
+        # Handle list values by checking if any item in the list matches
+        if isinstance(raw_props_value, list):
+            # For lists, we need to find at least one matching item
+            # Return a match with the concatenated values of all matching items
+            matching_items = []
+            for item in raw_props_value:
+                if isinstance(item, str):
+                    match = re.match(match_clause, item)
+                    if match:
+                        matching_items.append(item)
+                elif isinstance(match_clause, type(item)):
+                    match = re.match(str(match_clause), str(item))
+                    if match:
+                        matching_items.append(str(item))
+
+            if matching_items:
+                # Create a synthetic match object with all matching items joined
+                combined_value = ",".join(matching_items)
+                return re.match(
+                    ".*", combined_value
+                )  # Always matches, returns combined value
+            return None
+
+        # Handle scalar values (existing logic)
+        elif not isinstance(raw_props_value, type(match_clause)):
             return None
         elif isinstance(raw_props_value, str):
             return re.match(match_clause, raw_props_value)
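A hedged sketch of the new list handling in isolation: string items that match the clause are collected, joined with commas, and wrapped in a synthetic always-matching `re.Match` so downstream code keeps receiving a match object. The property values below are illustrative:

```python
import re

# Simplified version of the list branch added to OperationProcessor above.
def match_list(match_clause: str, values: list):
    matching = [v for v in values if isinstance(v, str) and re.match(match_clause, v)]
    if matching:
        # Always matches; carries the joined value for downstream consumers.
        return re.match(".*", ",".join(matching))
    return None

m = match_list(r"pii.*", ["pii_email", "owner_team", "pii_phone"])
print(m.group(0) if m else None)  # -> pii_email,pii_phone
```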
datahub/utilities/sample_data.py
CHANGED
@@ -1,12 +1,13 @@
-import os
 import pathlib
 import tempfile
 
 import requests
 
-
-
-
+from datahub.configuration.env_vars import get_docker_compose_base
+
+DOCKER_COMPOSE_BASE = (
+    get_docker_compose_base()
+    or "https://raw.githubusercontent.com/datahub-project/datahub/master"
 )
 BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
 BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"
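The sample-data URL base now comes from a helper in the new `datahub.configuration.env_vars` module, with the upstream GitHub raw URL as the fallback. A minimal sketch of the `or`-fallback shape (the local override URL is illustrative, and the helper is assumed to return None or an empty string when the corresponding environment variable is unset):

```python
# Same fallback shape as DOCKER_COMPOSE_BASE above.
DEFAULT_BASE = "https://raw.githubusercontent.com/datahub-project/datahub/master"

def resolve_base(override):
    return override or DEFAULT_BASE

assert resolve_base(None) == DEFAULT_BASE
assert resolve_base("") == DEFAULT_BASE
assert resolve_base("http://localhost:8000/datahub") == "http://localhost:8000/datahub"
```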