acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -36,9 +36,10 @@ from datahub.ingestion.api.source_helpers import (
|
|
|
36
36
|
)
|
|
37
37
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
38
38
|
from datahub.ingestion.graph.client import get_default_graph
|
|
39
|
-
from datahub.
|
|
40
|
-
|
|
41
|
-
|
|
39
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
40
|
+
from datahub.metadata.schema_classes import (
|
|
41
|
+
FineGrainedLineageDownstreamTypeClass,
|
|
42
|
+
FineGrainedLineageUpstreamTypeClass,
|
|
42
43
|
)
|
|
43
44
|
|
|
44
45
|
logger = logging.getLogger(__name__)
|
|
@@ -48,7 +49,7 @@ class EntityConfig(EnvConfigMixin):
|
|
|
48
49
|
name: str
|
|
49
50
|
type: str
|
|
50
51
|
platform: str
|
|
51
|
-
platform_instance: Optional[str]
|
|
52
|
+
platform_instance: Optional[str] = None
|
|
52
53
|
|
|
53
54
|
@validator("type")
|
|
54
55
|
def type_must_be_supported(cls, v: str) -> str:
|
|
@@ -79,9 +80,9 @@ class FineGrainedLineageConfig(ConfigModel):
|
|
|
79
80
|
@validator("upstreamType")
|
|
80
81
|
def upstream_type_must_be_supported(cls, v: str) -> str:
|
|
81
82
|
allowed_types = [
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
83
|
+
FineGrainedLineageUpstreamTypeClass.FIELD_SET,
|
|
84
|
+
FineGrainedLineageUpstreamTypeClass.DATASET,
|
|
85
|
+
FineGrainedLineageUpstreamTypeClass.NONE,
|
|
85
86
|
]
|
|
86
87
|
if v not in allowed_types:
|
|
87
88
|
raise ValueError(
|
|
@@ -92,8 +93,8 @@ class FineGrainedLineageConfig(ConfigModel):
|
|
|
92
93
|
@validator("downstreamType")
|
|
93
94
|
def downstream_type_must_be_supported(cls, v: str) -> str:
|
|
94
95
|
allowed_types = [
|
|
95
|
-
|
|
96
|
-
|
|
96
|
+
FineGrainedLineageDownstreamTypeClass.FIELD_SET,
|
|
97
|
+
FineGrainedLineageDownstreamTypeClass.FIELD,
|
|
97
98
|
]
|
|
98
99
|
if v not in allowed_types:
|
|
99
100
|
raise ValueError(
|
|
@@ -210,7 +211,7 @@ def _get_lineage_mcp(
|
|
|
210
211
|
|
|
211
212
|
# extract the old lineage and save it for the new mcp
|
|
212
213
|
if preserve_upstream:
|
|
213
|
-
client = get_default_graph()
|
|
214
|
+
client = get_default_graph(ClientMode.INGESTION)
|
|
214
215
|
|
|
215
216
|
old_upstream_lineage = get_aspects_for_entity(
|
|
216
217
|
client._session,
|
|
@@ -1,10 +1,13 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
1
3
|
import time
|
|
2
4
|
from dataclasses import dataclass
|
|
3
|
-
from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
|
|
5
|
+
from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union
|
|
4
6
|
|
|
5
7
|
from mlflow import MlflowClient
|
|
6
|
-
from mlflow.entities import Experiment, Run
|
|
8
|
+
from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
|
|
7
9
|
from mlflow.entities.model_registry import ModelVersion, RegisteredModel
|
|
10
|
+
from mlflow.exceptions import MlflowException
|
|
8
11
|
from mlflow.store.entities import PagedList
|
|
9
12
|
from pydantic.fields import Field
|
|
10
13
|
|
|
@@ -14,7 +17,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
|
|
|
14
17
|
)
|
|
15
18
|
from datahub.configuration.source_common import EnvConfigMixin
|
|
16
19
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
17
|
-
from datahub.emitter.mcp_builder import
|
|
20
|
+
from datahub.emitter.mcp_builder import ExperimentKey
|
|
18
21
|
from datahub.ingestion.api.common import PipelineContext
|
|
19
22
|
from datahub.ingestion.api.decorators import (
|
|
20
23
|
SupportStatus,
|
|
@@ -29,10 +32,15 @@ from datahub.ingestion.api.source import (
|
|
|
29
32
|
SourceReport,
|
|
30
33
|
)
|
|
31
34
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
32
|
-
from datahub.ingestion.source.common.
|
|
35
|
+
from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
|
|
36
|
+
from datahub.ingestion.source.common.subtypes import (
|
|
37
|
+
MLAssetSubTypes,
|
|
38
|
+
SourceCapabilityModifier,
|
|
39
|
+
)
|
|
33
40
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
34
41
|
StaleEntityRemovalHandler,
|
|
35
42
|
StaleEntityRemovalSourceReport,
|
|
43
|
+
StatefulStaleMetadataRemovalConfig,
|
|
36
44
|
)
|
|
37
45
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
38
46
|
StatefulIngestionConfigBase,
|
|
@@ -42,6 +50,7 @@ from datahub.metadata.schema_classes import (
|
|
|
42
50
|
AuditStampClass,
|
|
43
51
|
ContainerClass,
|
|
44
52
|
DataPlatformInstanceClass,
|
|
53
|
+
DataProcessInstanceInputClass,
|
|
45
54
|
DataProcessInstanceOutputClass,
|
|
46
55
|
DataProcessInstancePropertiesClass,
|
|
47
56
|
DataProcessInstanceRunEventClass,
|
|
@@ -60,24 +69,19 @@ from datahub.metadata.schema_classes import (
|
|
|
60
69
|
TagAssociationClass,
|
|
61
70
|
TagPropertiesClass,
|
|
62
71
|
TimeStampClass,
|
|
72
|
+
UpstreamClass,
|
|
73
|
+
UpstreamLineageClass,
|
|
63
74
|
VersionPropertiesClass,
|
|
64
75
|
VersionTagClass,
|
|
65
76
|
_Aspect,
|
|
66
77
|
)
|
|
67
|
-
from datahub.metadata.urns import
|
|
68
|
-
DataPlatformUrn,
|
|
69
|
-
MlModelUrn,
|
|
70
|
-
VersionSetUrn,
|
|
71
|
-
)
|
|
78
|
+
from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
|
|
72
79
|
from datahub.sdk.container import Container
|
|
80
|
+
from datahub.sdk.dataset import Dataset
|
|
73
81
|
|
|
74
82
|
T = TypeVar("T")
|
|
75
83
|
|
|
76
84
|
|
|
77
|
-
class ContainerKeyWithId(ContainerKey):
|
|
78
|
-
id: str
|
|
79
|
-
|
|
80
|
-
|
|
81
85
|
class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
|
|
82
86
|
tracking_uri: Optional[str] = Field(
|
|
83
87
|
default=None,
|
|
@@ -105,6 +109,22 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
|
|
|
105
109
|
" If neither is set, external URLs are not generated."
|
|
106
110
|
),
|
|
107
111
|
)
|
|
112
|
+
materialize_dataset_inputs: Optional[bool] = Field(
|
|
113
|
+
default=False,
|
|
114
|
+
description="Whether to materialize dataset inputs for each run",
|
|
115
|
+
)
|
|
116
|
+
source_mapping_to_platform: Optional[dict] = Field(
|
|
117
|
+
default=None, description="Mapping of source type to datahub platform"
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
username: Optional[str] = Field(
|
|
121
|
+
default=None, description="Username for MLflow authentication"
|
|
122
|
+
)
|
|
123
|
+
password: Optional[str] = Field(
|
|
124
|
+
default=None, description="Password for MLflow authentication"
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
|
|
108
128
|
|
|
109
129
|
|
|
110
130
|
@dataclass
|
|
@@ -116,11 +136,18 @@ class MLflowRegisteredModelStageInfo:
|
|
|
116
136
|
|
|
117
137
|
@platform_name("MLflow")
|
|
118
138
|
@config_class(MLflowConfig)
|
|
119
|
-
@support_status(SupportStatus.
|
|
139
|
+
@support_status(SupportStatus.INCUBATING)
|
|
120
140
|
@capability(
|
|
121
141
|
SourceCapability.DESCRIPTIONS,
|
|
122
142
|
"Extract descriptions for MLflow Registered Models and Model Versions",
|
|
123
143
|
)
|
|
144
|
+
@capability(
|
|
145
|
+
SourceCapability.CONTAINERS,
|
|
146
|
+
"Extract ML experiments",
|
|
147
|
+
subtype_modifier=[
|
|
148
|
+
SourceCapabilityModifier.MLFLOW_EXPERIMENT,
|
|
149
|
+
],
|
|
150
|
+
)
|
|
124
151
|
@capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
|
|
125
152
|
class MLflowSource(StatefulIngestionSourceBase):
|
|
126
153
|
platform = "mlflow"
|
|
@@ -152,7 +179,17 @@ class MLflowSource(StatefulIngestionSourceBase):
|
|
|
152
179
|
self.ctx = ctx
|
|
153
180
|
self.config = config
|
|
154
181
|
self.report = StaleEntityRemovalSourceReport()
|
|
155
|
-
self.client =
|
|
182
|
+
self.client = self._configure_client()
|
|
183
|
+
|
|
184
|
+
def _configure_client(self) -> MlflowClient:
    """
    Build the MLflow client used by this source.

    When both ``username`` and ``password`` are configured, they are written
    to the ``MLFLOW_TRACKING_USERNAME`` and ``MLFLOW_TRACKING_PASSWORD``
    environment variables before the client is created.

    Returns:
        MlflowClient: client bound to the configured tracking/registry URIs.

    Raises:
        ValueError: if only one of ``username`` / ``password`` is set.
    """
    username = self.config.username
    password = self.config.password
    has_username, has_password = bool(username), bool(password)

    # Credentials are all-or-nothing: reject a half-configured pair early.
    if has_username != has_password:
        raise ValueError("Both username and password must be set together")

    if has_username and has_password:
        # NOTE: this mutates os.environ, i.e. it affects the whole process.
        os.environ["MLFLOW_TRACKING_USERNAME"] = username
        os.environ["MLFLOW_TRACKING_PASSWORD"] = password

    return MlflowClient(
        tracking_uri=self.config.tracking_uri,
        registry_uri=self.config.registry_uri,
    )
|
|
@@ -213,6 +250,7 @@ class MLflowSource(StatefulIngestionSourceBase):
|
|
|
213
250
|
if runs:
|
|
214
251
|
for run in runs:
|
|
215
252
|
yield from self._get_run_workunits(experiment, run)
|
|
253
|
+
yield from self._get_dataset_input_workunits(run)
|
|
216
254
|
|
|
217
255
|
def _get_experiment_custom_properties(self, experiment):
|
|
218
256
|
experiment_custom_props = getattr(experiment, "tags", {}) or {}
|
|
@@ -224,7 +262,7 @@ class MLflowSource(StatefulIngestionSourceBase):
|
|
|
224
262
|
self, experiment: Experiment
|
|
225
263
|
) -> Iterable[MetadataWorkUnit]:
|
|
226
264
|
experiment_container = Container(
|
|
227
|
-
container_key=
|
|
265
|
+
container_key=ExperimentKey(
|
|
228
266
|
platform=str(DataPlatformUrn(platform_name=self.platform)),
|
|
229
267
|
id=experiment.name,
|
|
230
268
|
),
|
|
@@ -262,10 +300,187 @@ class MLflowSource(StatefulIngestionSourceBase):
|
|
|
262
300
|
type="SKIPPED", nativeResultType=self.platform
|
|
263
301
|
)
|
|
264
302
|
|
|
303
|
+
def _get_dataset_schema(
    self, dataset: MlflowDataset
) -> Optional[List[Tuple[str, str]]]:
    """
    Extract ``(column name, column type)`` pairs from an MLflow dataset schema.

    The schema is expected to be a JSON document whose ``mlflow_colspec``
    entry is a list of objects carrying ``name`` and ``type`` keys.

    Args:
        dataset: The MLflow dataset whose ``schema`` string is parsed.

    Returns:
        The list of ``(name, type)`` tuples, or ``None`` when the dataset has
        no schema, the schema cannot be parsed, or it is not in the
        ``mlflow_colspec`` layout.
    """
    raw_schema = dataset.schema
    if raw_schema is None:
        # A dataset may be logged without any schema; nothing to parse and
        # nothing worth warning about.
        return None

    try:
        schema_dict = json.loads(raw_schema)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a schema that is not a str/bytes payload.
        self.report.warning(
            title="Failed to load dataset schema",
            message="Schema metadata will be missing due to a JSON parsing error.",
            context=f"Dataset: {dataset.name}, Schema: {dataset.schema}",
        )
        return None

    # Valid JSON is not necessarily an object (e.g. "5" or "[1, 2]").
    if not isinstance(schema_dict, dict) or "mlflow_colspec" not in schema_dict:
        return None

    try:
        return [
            (field["name"], field["type"])
            for field in schema_dict["mlflow_colspec"]
        ]
    except (KeyError, TypeError):
        # Malformed colspec entry (missing key, non-mapping, non-iterable).
        return None
|
|
326
|
+
|
|
327
|
+
def _get_external_dataset_urn(self, platform: str, dataset_name: str) -> str:
    """
    Build the URN string of a dataset hosted on an external platform.

    Args:
        platform: The platform of the external dataset (e.g., 's3', 'bigquery')
        dataset_name: The name of the dataset on that platform
    Returns:
        str: The URN of the external dataset
    """
    external_urn = DatasetUrn(platform=platform, name=dataset_name)
    return str(external_urn)
|
|
337
|
+
|
|
338
|
+
def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
    """
    Generate workunits for dataset inputs in a run.

    For each dataset input:
    1. If source type is 'local' or 'code':
       - Create a local dataset reference
    2. Otherwise:
       - If materialization is enabled:
         - Create a hosted dataset and a dataset reference with upstream
           (the input is skipped with a reported failure when no platform
           can be resolved for its source type)
       - If materialization is not enabled:
         - Create a dataset reference and add an upstream when a platform
           can be resolved for the source type
    3. Add all dataset references as upstreams for the run

    Runs that carry no dataset inputs produce no workunits.
    """
    # `inputs` may be missing or None on a run; treat both as "no inputs".
    run_inputs = getattr(run, "inputs", None)
    dataset_inputs = getattr(run_inputs, "dataset_inputs", None)
    if not dataset_inputs:
        return

    run_urn = DataProcessInstance(
        id=run.info.run_id,
        orchestrator=self.platform,
    ).urn

    dataset_reference_urns = []

    for dataset_input in dataset_inputs:
        dataset = dataset_input.dataset
        source_type = dataset.source_type

        # Prepare dataset properties. The same dict is shared by every
        # Dataset built for this input, so the raw-schema fallback below is
        # visible on all of them.
        custom_properties = {
            tag.key: tag.value for tag in (dataset_input.tags or [])
        }
        formatted_schema = self._get_dataset_schema(dataset)
        if formatted_schema is None and dataset.schema is not None:
            # Keep the unparsed schema around as a plain property; skip it
            # when absent so no None value ends up in the properties.
            custom_properties["schema"] = dataset.schema

        # Handle local/code datasets
        if source_type in ("local", "code"):
            local_dataset = Dataset(
                platform=self.platform,
                name=dataset.name,
                schema=formatted_schema,
                custom_properties=custom_properties,
            )
            yield from local_dataset.as_workunits()
            dataset_reference_urns.append(local_dataset.urn)
            continue

        # Handle hosted datasets
        formatted_platform = self._get_dataset_platform_from_source_type(
            source_type
        )

        # Validate platform if materialization is enabled
        if self.config.materialize_dataset_inputs:
            if not formatted_platform:
                self.report.failure(
                    title="Unable to materialize dataset inputs",
                    message=f"No mapping dataPlatform found for dataset input source type '{source_type}'",
                    context=f"please add `source_mapping_to_platform` in config "
                    f"(e.g. '{source_type}': 'snowflake')",
                )
                continue
            # Create hosted dataset
            hosted_dataset = Dataset(
                platform=formatted_platform,
                name=dataset.name,
                schema=formatted_schema,
                custom_properties=custom_properties,
            )
            yield from hosted_dataset.as_workunits()

        # Create dataset reference, with upstream when the platform is known
        upstream_lineage = None
        if formatted_platform:
            upstream_lineage = UpstreamLineageClass(
                upstreams=[
                    UpstreamClass(
                        self._get_external_dataset_urn(
                            formatted_platform, dataset.name
                        ),
                        type="COPY",
                    )
                ]
            )
        hosted_dataset_reference = Dataset(
            platform=self.platform,
            name=dataset.name,
            schema=formatted_schema,
            custom_properties=custom_properties,
            upstreams=upstream_lineage,
        )
        dataset_reference_urns.append(hosted_dataset_reference.urn)
        yield from hosted_dataset_reference.as_workunits()

    # Add dataset references as upstreams for the run
    if dataset_reference_urns:
        input_edges = [
            EdgeClass(destinationUrn=str(dataset_ref_urn))
            for dataset_ref_urn in dataset_reference_urns
        ]
        yield MetadataChangeProposalWrapper(
            entityUrn=str(run_urn),
            aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
        ).as_workunit()
|
|
438
|
+
|
|
439
|
+
def _get_dataset_platform_from_source_type(self, source_type: str) -> Optional[str]:
    """
    Resolve the DataHub platform name for an MLflow dataset source type.

    The source type is lower-cased, then resolved in this order:
    1. User-provided mapping in config
    2. Internal mapping ("gs" -> "gcs")
    3. The source type itself, if it is a valid platform name

    Returns:
        The platform name, or ``None`` when nothing matches.
    """
    normalized = source_type.lower()

    # 1. User-provided mapping wins over everything else.
    user_platform = self._get_platform_from_user_mapping(normalized)
    if user_platform:
        return user_platform

    # 2. Internal mapping
    if normalized == "gs":
        return "gcs"

    # 3. Direct platform match
    return normalized if self._is_valid_platform(normalized) else None
|
|
464
|
+
|
|
465
|
+
def _get_platform_from_user_mapping(self, source_type: str) -> Optional[str]:
    """
    Look up ``source_type`` in the ``source_mapping_to_platform`` config option.

    Returns:
        The mapped platform, or ``None`` when no mapping is configured, the
        source type has no entry, or the entry is empty.
    """
    user_mapping = self.config.source_mapping_to_platform
    if not user_mapping:
        return None

    # Falsy entries (missing key, empty string) collapse to None.
    return user_mapping.get(source_type) or None
|
|
479
|
+
|
|
265
480
|
def _get_run_workunits(
|
|
266
481
|
self, experiment: Experiment, run: Run
|
|
267
482
|
) -> Iterable[MetadataWorkUnit]:
|
|
268
|
-
experiment_key =
|
|
483
|
+
experiment_key = ExperimentKey(
|
|
269
484
|
platform=str(DataPlatformUrn(self.platform)), id=experiment.name
|
|
270
485
|
)
|
|
271
486
|
|
|
@@ -385,8 +600,8 @@ class MLflowSource(StatefulIngestionSourceBase):
|
|
|
385
600
|
)
|
|
386
601
|
return runs
|
|
387
602
|
|
|
388
|
-
@staticmethod
|
|
389
603
|
def _traverse_mlflow_search_func(
    self,
    search_func: Callable[..., PagedList[T]],
    **kwargs: Any,
) -> Iterable[T]:
    """
    Iterate over every item returned by a paginated MLflow search_* function.

    ``search_func`` is called repeatedly with ``page_token`` (``None`` for the
    first page) plus ``kwargs``; items of each page are yielded until a page
    comes back without a token. An ``MlflowException`` whose error code is
    ``ENDPOINT_NOT_FOUND`` is reported as a warning and ends the iteration;
    any other ``MlflowException`` is re-raised.
    """
    page_token = None
    try:
        while True:
            page = search_func(page_token=page_token, **kwargs)
            yield from page.to_list()
            page_token = page.token
            if not page_token:
                return
    except MlflowException as e:
        if e.error_code != "ENDPOINT_NOT_FOUND":
            raise  # Only re-raise other exceptions
        self.report.warning(
            title="MLflow API Endpoint Not Found for Experiments.",
            message="Please upgrade to version 1.28.0 or higher to ensure compatibility. Skipping ingestion for experiments and runs.",
            context=None,
            exc=e,
        )
        return
|
|
403
630
|
|
|
404
631
|
def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
|
|
405
632
|
return (
|
|
@@ -659,6 +886,10 @@ class MLflowSource(StatefulIngestionSourceBase):
|
|
|
659
886
|
)
|
|
660
887
|
return wu
|
|
661
888
|
|
|
889
|
+
def _is_valid_platform(self, platform: Optional[str]) -> bool:
    """Report whether ``platform`` is one of ``KNOWN_VALID_PLATFORM_NAMES``."""
    known_platforms = KNOWN_VALID_PLATFORM_NAMES
    return platform in known_platforms
|
|
892
|
+
|
|
662
893
|
@classmethod
|
|
663
894
|
def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
|
|
664
895
|
config = MLflowConfig.parse_obj(config_dict)
|
|
File without changes
|