acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +7 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +24 -26
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +110 -32
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +174 -22
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +271 -91
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/config.py

@@ -1,10 +1,10 @@
 import logging
 import os
 from datetime import datetime, timedelta, timezone
-from typing import
+from typing import Dict, List, Optional, Union

 import pydantic
-from pydantic import Field
+from pydantic import Field, field_validator, model_validator
 from typing_extensions import Literal

 from datahub.configuration.common import (
@@ -397,13 +397,15 @@ class UnityCatalogSourceConfig(
         default=None, description="Unity Catalog Stateful Ingestion Config."
     )

-    @
+    @field_validator("start_time", mode="after")
+    @classmethod
     def within_thirty_days(cls, v: datetime) -> datetime:
         if (datetime.now(timezone.utc) - v).days > 30:
             raise ValueError("Query history is only maintained for 30 days.")
         return v

-    @
+    @field_validator("workspace_url", mode="after")
+    @classmethod
     def workspace_url_should_start_with_http_scheme(cls, workspace_url: str) -> str:
         if not workspace_url.lower().startswith(("http://", "https://")):
             raise ValueError(

@@ -411,7 +413,26 @@ class UnityCatalogSourceConfig(
             )
         return workspace_url

-    @
+    @model_validator(mode="before")
+    def either_token_or_azure_auth_provided(cls, values: dict) -> dict:
+        token = values.get("token")
+        azure_auth = values.get("azure_auth")
+
+        # Check if exactly one of the authentication methods is provided
+        if not token and not azure_auth:
+            raise ValueError(
+                "Either 'azure_auth' or 'token' (personal access token) must be provided in the configuration."
+            )
+
+        if token and azure_auth:
+            raise ValueError(
+                "Cannot specify both 'token' and 'azure_auth'. Please provide only one authentication method."
+            )
+
+        return values
+
+    @field_validator("include_metastore", mode="after")
+    @classmethod
     def include_metastore_warning(cls, v: bool) -> bool:
         if v:
             msg = (

@@ -424,60 +445,56 @@ class UnityCatalogSourceConfig(
             add_global_warning(msg)
         return v

-    @
-    def set_warehouse_id_from_profiling(
-        profiling
-
-
-        if not values.get("warehouse_id") and profiling and profiling.warehouse_id:
-            values["warehouse_id"] = profiling.warehouse_id
+    @model_validator(mode="after")
+    def set_warehouse_id_from_profiling(self):
+        profiling = self.profiling
+        if not self.warehouse_id and profiling and profiling.warehouse_id:
+            self.warehouse_id = profiling.warehouse_id
         if (
-
+            self.warehouse_id
             and profiling
             and profiling.warehouse_id
-            and
+            and self.warehouse_id != profiling.warehouse_id
         ):
             raise ValueError(
                 "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
             )

-        if
-            profiling.warehouse_id =
+        if self.warehouse_id and profiling and not profiling.warehouse_id:
+            profiling.warehouse_id = self.warehouse_id

         if profiling and profiling.enabled and not profiling.warehouse_id:
             raise ValueError("warehouse_id must be set when profiling is enabled.")

-        return
+        return self

-    @
-    def validate_lineage_data_source_with_warehouse(
-
-    ) -> Dict[str, Any]:
-        lineage_data_source = values.get("lineage_data_source", LineageDataSource.AUTO)
-        warehouse_id = values.get("warehouse_id")
+    @model_validator(mode="after")
+    def validate_lineage_data_source_with_warehouse(self):
+        lineage_data_source = self.lineage_data_source or LineageDataSource.AUTO

-        if
+        if (
+            lineage_data_source == LineageDataSource.SYSTEM_TABLES
+            and not self.warehouse_id
+        ):
             raise ValueError(
                 f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
             )

-        return
+        return self

-    @
-    def validate_usage_data_source_with_warehouse(
-
-    ) -> Dict[str, Any]:
-        usage_data_source = values.get("usage_data_source", UsageDataSource.AUTO)
-        warehouse_id = values.get("warehouse_id")
+    @model_validator(mode="after")
+    def validate_usage_data_source_with_warehouse(self):
+        usage_data_source = self.usage_data_source or UsageDataSource.AUTO

-        if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
+        if usage_data_source == UsageDataSource.SYSTEM_TABLES and not self.warehouse_id:
             raise ValueError(
                 f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
             )

-        return
+        return self

-    @
+    @field_validator("schema_pattern", mode="before")
+    @classmethod
     def schema_pattern_should__always_deny_information_schema(
         cls, v: AllowDenyPattern
     ) -> AllowDenyPattern:
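The config.py changes above read as a Pydantic v1-to-v2 validator migration (the removed decorators are truncated in this view): field validators gain an explicit `@classmethod`, and after-mode model validators operate on `self` instead of a `values` dict. A minimal sketch of the pattern, using a hypothetical `ExampleConfig` that is not part of DataHub:

    # Pydantic v2 validator style, mirroring the diff above.
    from pydantic import BaseModel, field_validator, model_validator

    class ExampleConfig(BaseModel):  # hypothetical
        workspace_url: str
        token: str = ""
        azure_auth: str = ""

        # v1 used @validator("workspace_url"); v2 pairs @field_validator with @classmethod.
        @field_validator("workspace_url", mode="after")
        @classmethod
        def must_have_scheme(cls, v: str) -> str:
            if not v.lower().startswith(("http://", "https://")):
                raise ValueError("workspace_url must start with http:// or https://")
            return v

        # v1 used @root_validator(pre=True); v2 uses @model_validator(mode="before").
        @model_validator(mode="before")
        @classmethod
        def exactly_one_auth(cls, values: dict) -> dict:
            if bool(values.get("token")) == bool(values.get("azure_auth")):
                raise ValueError("Provide exactly one of 'token' or 'azure_auth'.")
            return values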
datahub/ingestion/source/unity/connection.py

@@ -8,6 +8,7 @@ from pydantic import Field

 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
+from datahub.ingestion.source.unity.azure_auth_config import AzureAuthConfig

 DATABRICKS = "databricks"

@@ -19,7 +20,12 @@ class UnityCatalogConnectionConfig(ConfigModel):
     """

     scheme: str = DATABRICKS
-    token: str = pydantic.Field(
+    token: Optional[str] = pydantic.Field(
+        default=None, description="Databricks personal access token"
+    )
+    azure_auth: Optional[AzureAuthConfig] = Field(
+        default=None, description="Azure configuration"
+    )
     workspace_url: str = pydantic.Field(
         description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
     )
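With `token` now optional and `azure_auth` added, the connection config accepts exactly one of two credential shapes, enforced by `either_token_or_azure_auth_provided` in config.py above. A sketch of the two config dicts; field names follow the diff, values are placeholders:

    # Personal access token auth:
    pat_config = {
        "workspace_url": "https://my-workspace.cloud.databricks.com",
        "token": "dapiXXXXXXXXXXXXXXXX",  # Databricks PAT (fake)
    }

    # Azure service-principal auth:
    azure_config = {
        "workspace_url": "https://adb-1234567890123456.7.azuredatabricks.net",
        "azure_auth": {
            "tenant_id": "00000000-0000-0000-0000-000000000000",
            "client_id": "11111111-1111-1111-1111-111111111111",
            "client_secret": "fake-secret",
        },
    }
    # Supplying both, or neither, fails validation; either dict can then be
    # passed through UnityCatalogSourceConfig.model_validate as in create().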
datahub/ingestion/source/unity/connection_test.py

@@ -16,10 +16,10 @@ class UnityCatalogConnectionTest:
         self.report = UnityCatalogReport()
         self.proxy = UnityCatalogApiProxy(
             self.config.workspace_url,
-            self.config.token,
             self.config.profiling.warehouse_id,
             report=self.report,
             databricks_api_page_size=self.config.databricks_api_page_size,
+            personal_access_token=self.config.token,
         )

     def get_connection_test(self) -> TestConnectionReport:
|
@@ -3,6 +3,7 @@ Manage the communication with DataBricks Server and provide equivalent dataclass
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import dataclasses
|
|
6
|
+
import json
|
|
6
7
|
import logging
|
|
7
8
|
import os
|
|
8
9
|
from concurrent.futures import ThreadPoolExecutor
|
|
@@ -11,6 +12,7 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast
|
|
|
11
12
|
from unittest.mock import patch
|
|
12
13
|
|
|
13
14
|
import cachetools
|
|
15
|
+
import yaml
|
|
14
16
|
from cachetools import cached
|
|
15
17
|
from databricks.sdk import WorkspaceClient
|
|
16
18
|
from databricks.sdk.service.catalog import (
|
|
@@ -23,7 +25,11 @@ from databricks.sdk.service.catalog import (
|
|
|
23
25
|
SchemaInfo,
|
|
24
26
|
TableInfo,
|
|
25
27
|
)
|
|
28
|
+
from databricks.sdk.service.files import DownloadResponse, FilesAPI
|
|
26
29
|
from databricks.sdk.service.iam import ServicePrincipal as DatabricksServicePrincipal
|
|
30
|
+
from databricks.sdk.service.ml import (
|
|
31
|
+
ExperimentsAPI,
|
|
32
|
+
)
|
|
27
33
|
from databricks.sdk.service.sql import (
|
|
28
34
|
QueryFilter,
|
|
29
35
|
QueryInfo,
|
|
@@ -38,6 +44,7 @@ from typing_extensions import assert_never
|
|
|
38
44
|
from datahub._version import nice_version_name
|
|
39
45
|
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
40
46
|
from datahub.emitter.mce_builder import parse_ts_millis
|
|
47
|
+
from datahub.ingestion.source.unity.azure_auth_config import AzureAuthConfig
|
|
41
48
|
from datahub.ingestion.source.unity.config import (
|
|
42
49
|
LineageDataSource,
|
|
43
50
|
UsageDataSource,
|
|
@@ -54,6 +61,8 @@ from datahub.ingestion.source.unity.proxy_types import (
|
|
|
54
61
|
ExternalTableReference,
|
|
55
62
|
Metastore,
|
|
56
63
|
Model,
|
|
64
|
+
ModelRunDetails,
|
|
65
|
+
ModelSignature,
|
|
57
66
|
ModelVersion,
|
|
58
67
|
Notebook,
|
|
59
68
|
NotebookReference,
|
|
@@ -155,30 +164,44 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     _workspace_url: str
     report: UnityCatalogReport
     warehouse_id: str
+    _experiments_api: ExperimentsAPI
+    _files_api: FilesAPI

     def __init__(
         self,
         workspace_url: str,
-        personal_access_token: str,
         warehouse_id: Optional[str],
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
         lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
         usage_data_source: UsageDataSource = UsageDataSource.AUTO,
         databricks_api_page_size: int = 0,
+        personal_access_token: Optional[str] = None,
+        azure_auth: Optional[AzureAuthConfig] = None,
     ):
-
-
-
-
-
-
+        if azure_auth:
+            self._workspace_client = WorkspaceClient(
+                host=workspace_url,
+                azure_tenant_id=azure_auth.tenant_id,
+                azure_client_id=azure_auth.client_id,
+                azure_client_secret=azure_auth.client_secret.get_secret_value(),
+                product="datahub",
+                product_version=nice_version_name(),
+            )
+        else:
+            self._workspace_client = WorkspaceClient(
+                host=workspace_url,
+                token=personal_access_token,
+                product="datahub",
+                product_version=nice_version_name(),
+            )
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
         self.lineage_data_source = lineage_data_source
         self.usage_data_source = usage_data_source
         self.databricks_api_page_size = databricks_api_page_size
+        self._workspace_url = workspace_url
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(
                 "https://", ""
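Depending on which credentials are present, the proxy now builds its `WorkspaceClient` from either an Azure service principal or a personal access token. A hedged usage sketch, assuming `AzureAuthConfig` exposes `tenant_id`, `client_id`, and a `SecretStr` `client_secret` (as the constructor above implies) and that `UnityCatalogReport` lives in `datahub.ingestion.source.unity.report`:

    # Instantiating the proxy with Azure service-principal credentials.
    from datahub.ingestion.source.unity.azure_auth_config import AzureAuthConfig
    from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
    from datahub.ingestion.source.unity.report import UnityCatalogReport

    proxy = UnityCatalogApiProxy(
        workspace_url="https://adb-1234567890123456.7.azuredatabricks.net",
        warehouse_id="abcdef1234567890",  # placeholder
        report=UnityCatalogReport(),
        azure_auth=AzureAuthConfig(
            tenant_id="00000000-0000-0000-0000-000000000000",
            client_id="11111111-1111-1111-1111-111111111111",
            client_secret="fake-secret",  # coerced to SecretStr by pydantic
        ),
    )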
@@ -187,6 +210,179 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             "access_token": self._workspace_client.config.token,
             "user_agent_entry": "datahub",
         }
+        # Initialize MLflow APIs
+        self._experiments_api = ExperimentsAPI(self._workspace_client.api_client)
+        self._files_api = FilesAPI(self._workspace_client.api_client)
+
+    def get_run_details(self, run_id: str) -> Optional[ModelRunDetails]:
+        """
+        Get comprehensive details from an MLflow run.
+
+        Args:
+            run_id: The MLflow run ID
+
+        Returns:
+            ModelRunDetails object with comprehensive run information
+        """
+        try:
+            run_response = self._experiments_api.get_run(run_id)
+            run = run_response.run
+
+            if (
+                not run
+                or not run.info
+                or not run.info.run_id
+                or not run.info.experiment_id
+            ):
+                return None
+
+            # Extract metrics
+            metrics: Dict[str, Any] = {}
+            if run.data and run.data.metrics:
+                for metric in run.data.metrics:
+                    if metric.key is not None:
+                        metrics[metric.key] = metric.value
+
+            # Extract parameters
+            parameters: Dict[str, Any] = {}
+            if run.data and run.data.params:
+                for param in run.data.params:
+                    if param.key is not None:
+                        parameters[param.key] = param.value
+
+            # Extract tags
+            tags: Dict[str, str] = {}
+            if run.data and run.data.tags:
+                for tag in run.data.tags:
+                    if tag.key is not None and tag.value is not None:
+                        tags[tag.key] = tag.value
+
+            return ModelRunDetails(
+                run_id=run.info.run_id,
+                experiment_id=run.info.experiment_id,
+                status=run.info.status.value if run.info.status else None,
+                start_time=parse_ts_millis(run.info.start_time),
+                end_time=parse_ts_millis(run.info.end_time),
+                user_id=run.info.user_id,
+                metrics=metrics,
+                parameters=parameters,
+                tags=tags,
+            )
+        except Exception as e:
+            logger.warning(
+                f"Unable to get run details for MLflow experiment, run-id: {run_id}",
+                exc_info=True,
+            )
+            self.report.report_warning(
+                title="Unable to get run details for MLflow experiment",
+                message="Error while getting run details for MLflow experiment",
+                context=f"run-id: {run_id}",
+                exc=e,
+            )
+            return None
+
+    def _extract_signature_from_files_api(
+        self, model_version: ModelVersionInfo
+    ) -> Optional[ModelSignature]:
+        """
+        Extract signature from MLmodel file using Databricks FilesAPI.
+        Uses the API endpoint: /api/2.0/fs/files/Models/{catalog}/{schema}/{model}/{version}/MLmodel
+
+        Args:
+            model_version: Unity Catalog ModelVersionInfo object with catalog_name, schema_name, model_name, version
+
+        Returns:
+            ModelSignature if found, None otherwise
+        """
+        try:
+            # Construct file path for FilesAPI
+            # The correct path format is: /Models/{catalog}/{schema}/{model}/{version}/MLmodel
+            file_path = (
+                f"/Models/{model_version.catalog_name}/{model_version.schema_name}/"
+                f"{model_version.model_name}/{model_version.version}/MLmodel"
+            )
+
+            logger.debug(f"Downloading MLmodel from FilesAPI: {file_path}")
+
+            # Download the file using FilesAPI
+            download_response: DownloadResponse = self._files_api.download(
+                file_path=file_path
+            )
+
+            # Read the file content
+            # DownloadResponse.contents is a BinaryIO object
+            if download_response and download_response.contents:
+                content_stream = download_response.contents
+
+                # Read from the binary stream
+                if content_stream:
+                    mlmodel_content: str = content_stream.read().decode("utf-8")
+
+                    logger.debug(
+                        f"MLmodel file contents from FilesAPI ({file_path}):\n{mlmodel_content}"
+                    )
+
+                    # Parse YAML content
+                    mlmodel_data = yaml.safe_load(mlmodel_content)
+
+                    # Extract signature from MLmodel YAML
+                    if mlmodel_data and "signature" in mlmodel_data:
+                        signature_raw = mlmodel_data["signature"]
+
+                        # Signature inputs and outputs are stored as JSON strings in the YAML
+                        # Parse them into proper dict/list format
+                        signature_data = {}
+                        if "inputs" in signature_raw:
+                            try:
+                                signature_data["inputs"] = json.loads(
+                                    signature_raw["inputs"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse inputs JSON: {e}")
+
+                        if "outputs" in signature_raw:
+                            try:
+                                signature_data["outputs"] = json.loads(
+                                    signature_raw["outputs"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse outputs JSON: {e}")
+
+                        if "params" in signature_raw:
+                            try:
+                                signature_data["params"] = json.loads(
+                                    signature_raw["params"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse params JSON: {e}")
+
+                        return ModelSignature(
+                            inputs=signature_data.get("inputs"),
+                            outputs=signature_data.get("outputs"),
+                            parameters=signature_data.get("params"),
+                        )
+                    else:
+                        logger.debug(
+                            f"No signature found in MLmodel data from {file_path}"
+                        )
+                        return None
+
+            return None
+
+        except Exception as e:
+            model_name = getattr(model_version, "model_name", "unknown")
+            version_num = getattr(model_version, "version", "unknown")
+            self.report.report_warning(
+                title="Unable to extract signature from MLmodel file",
+                message="Error while extracting signature from MLmodel file",
+                context=f"model-name: {model_name}, model-version: {version_num}",
+                exc=e,
+            )
+            logger.warning(
+                f"Unable to extract signature from MLmodel file, model-name: {model_name}, model-version: {version_num}",
+                exc_info=True,
+            )
+            return None

     def check_basic_connectivity(self) -> bool:
         return bool(
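The `json.loads` calls above exist because an MLflow MLmodel file stores the signature's `inputs`/`outputs`/`params` as JSON strings embedded in YAML. A self-contained sketch with abridged, fake file content:

    # JSON-in-YAML: why the signature fields need a second parse.
    import json
    import yaml

    mlmodel_content = """
    artifact_path: model
    signature:
      inputs: '[{"name": "age", "type": "double"}, {"name": "income", "type": "double"}]'
      outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
    """

    data = yaml.safe_load(mlmodel_content)
    inputs = json.loads(data["signature"]["inputs"])    # list of column specs
    outputs = json.loads(data["signature"]["outputs"])  # list of tensor specs
    print(inputs[0]["name"])  # -> "age"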
@@ -1019,6 +1215,17 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         for alias in obj.aliases:
             if alias.alias_name:
                 aliases.append(alias.alias_name)
+
+        run_details: Optional[ModelRunDetails] = None
+        # Fetch run details if run_id exists
+        if obj.run_id:
+            run_details = self.get_run_details(obj.run_id)
+
+        # Extract signature separately from Files API
+        signature: Optional[ModelSignature] = self._extract_signature_from_files_api(
+            obj
+        )
+
         return ModelVersion(
             id=f"{model.id}_{obj.version}",
             name=f"{model.name}_{obj.version}",

@@ -1029,6 +1236,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             created_at=parse_ts_millis(obj.created_at),
             updated_at=parse_ts_millis(obj.updated_at),
             created_by=obj.created_by,
+            run_details=run_details,
+            signature=signature,
         )

     def _create_service_principal(
datahub/ingestion/source/unity/proxy_types.py

@@ -339,8 +339,75 @@ class Notebook:
         )


+@dataclass
+class ModelSignature:
+    """
+    Represents the model signature with input and output schemas extracted from MLflow.
+
+    In Unity Catalog, model signatures define the expected input/output formats for ML models.
+    Model signature is stored in the MLmodel YAML file.
+
+    Attributes:
+        inputs: List of input schema specifications, each containing name, type, dtype, shape
+        outputs: List of output schema specifications, each containing name, type, dtype, shape
+        parameters: List of model parameters
+    """
+
+    inputs: Optional[List[Dict[str, str]]]
+    outputs: Optional[List[Dict[str, str]]]
+    parameters: Optional[List[Dict[str, str]]]
+
+
+@dataclass
+class ModelRunDetails:
+    """
+    Represents comprehensive details from an MLflow run associated with a Unity Catalog model version.
+
+    In Unity Catalog, each model version is linked to an MLflow run via run_id. This dataclass
+    contains all the metadata extracted from that MLflow run, including metrics, parameters,
+    and tags.
+
+    Attributes:
+        run_id: MLflow run ID
+        experiment_id: MLflow experiment ID
+        status: Run status (e.g., "FINISHED", "RUNNING")
+        start_time: Run start timestamp (milliseconds since epoch)
+        end_time: Run end timestamp (milliseconds since epoch)
+        user_id: User who initiated the run
+        metrics: Training metrics (e.g., accuracy, loss)
+        parameters: Hyperparameters used for training
+        tags: Run tags/metadata
+    """
+
+    run_id: str
+    experiment_id: str
+    status: Optional[str]
+    start_time: Optional[datetime]
+    end_time: Optional[datetime]
+    user_id: Optional[str]
+    metrics: Optional[Dict[str, str]]
+    parameters: Optional[Dict[str, str]]
+    tags: Optional[Dict[str, str]]
+
+
 @dataclass
 class Model:
+    """
+    Represents a Unity Catalog registered ML model (model group).
+
+    In Unity Catalog, a registered model is a collection of model versions.
+    This dataclass corresponds to a Unity Catalog RegisteredModelInfo.
+
+    Attributes:
+        id: Full qualified name (e.g., "catalog.schema.model_name")
+        name: Model name without catalog/schema prefix
+        schema_name: Schema name containing the model
+        catalog_name: Catalog name containing the model
+        description: Model description/comment
+        created_at: Model creation timestamp
+        updated_at: Last update timestamp
+    """
+
     id: str
     name: str
     schema_name: str

@@ -352,6 +419,28 @@ class Model:

 @dataclass
 class ModelVersion:
+    """
+    Represents a specific version of a Unity Catalog registered ML model.
+
+    In Unity Catalog, each model version is linked to an MLflow run (via run_id).
+    This dataclass corresponds to a Unity Catalog ModelVersionInfo.
+
+    Attributes:
+        id: Unique identifier combining model ID and version (e.g., "catalog.schema.model_1")
+        name: Versioned model name
+        model: Reference to the parent Model (model group)
+        version: Version number as string
+        aliases: List of aliases (e.g., ["prod", "latest"])
+        description: Version description/comment
+        created_at: Version creation timestamp
+        updated_at: Last update timestamp
+        created_by: User who created this version
+        run_details: Comprehensive MLflow run details (metrics, parameters, tags)
+            extracted from the MLflow run linked to this model version.
+        signature: Model signature extracted from the MLmodel file via Files API.
+            Contains input/output schema specifications and parameters.
+    """
+
     id: str
     name: str
     model: Model

@@ -361,3 +450,5 @@ class ModelVersion:
     created_at: Optional[datetime]
     updated_at: Optional[datetime]
     created_by: Optional[str]
+    run_details: Optional["ModelRunDetails"]
+    signature: Optional["ModelSignature"]
datahub/ingestion/source/unity/source.py

@@ -1,7 +1,9 @@
+import dataclasses
+import json
 import logging
 import re
 import time
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union, cast
 from urllib.parse import urljoin

 from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag

@@ -209,13 +211,14 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):

         self.unity_catalog_api_proxy = UnityCatalogApiProxy(
             config.workspace_url,
-            config.token,
             config.warehouse_id,
             report=self.report,
             hive_metastore_proxy=self.hive_metastore_proxy,
             lineage_data_source=config.lineage_data_source,
             usage_data_source=config.usage_data_source,
             databricks_api_page_size=config.databricks_api_page_size,
+            personal_access_token=config.token if config.token else None,
+            azure_auth=config.azure_auth if config.azure_auth else None,
         )

         self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")

@@ -317,7 +320,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = UnityCatalogSourceConfig.
+        config = UnityCatalogSourceConfig.model_validate(config_dict)
         return cls(ctx=ctx, config=config)

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:

@@ -741,6 +744,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 created=TimeStampClass(time=created_time, actor=created_actor),
             )
         )
+        custom_properties = {}
+        if ml_model_version.signature:
+            for key, value in dataclasses.asdict(ml_model_version.signature).items():
+                if value:
+                    custom_properties[f"signature.{key}"] = json.dumps(value)
+
+        if ml_model_version.run_details:
+            if ml_model_version.run_details.tags:
+                for key, value in ml_model_version.run_details.tags.items():
+                    if value:
+                        custom_properties[key] = json.dumps(value)

         ml_model = MLModel(
             id=ml_model_version.id,

@@ -751,6 +765,18 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             model_group=ml_model_urn,
             platform=self.platform,
             last_modified=ml_model_version.updated_at,
+            training_metrics=cast(
+                Optional[Dict[str, Optional[str]]], ml_model_version.run_details.metrics
+            )
+            if ml_model_version.run_details and ml_model_version.run_details.metrics
+            else None,
+            hyper_params=cast(
+                Optional[Dict[str, Optional[str]]],
+                ml_model_version.run_details.parameters,
+            )
+            if ml_model_version.run_details and ml_model_version.run_details.parameters
+            else None,
+            custom_properties=custom_properties if custom_properties else None,
             extra_aspects=extra_aspects,
         )
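For a model version with a signature and run tags, the `custom_properties` built above would look roughly like this (values are illustrative). Note that `json.dumps` on a plain string tag value yields a quoted string:

    # Sketch of the resulting custom_properties payload on the MLModel:
    custom_properties = {
        "signature.inputs": '[{"name": "age", "type": "double"}]',
        "signature.outputs": '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]',
        "mlflow.source.type": '"NOTEBOOK"',  # str tag values get double-encoded
    }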
datahub/ingestion/source/usage/clickhouse_usage.py

@@ -115,7 +115,7 @@ class ClickHouseUsageSource(Source):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = ClickHouseUsageConfig.
+        config = ClickHouseUsageConfig.model_validate(config_dict)
         return cls(ctx, config)

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/usage/starburst_trino_usage.py

@@ -133,7 +133,7 @@ class TrinoUsageSource(Source):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = TrinoUsageConfig.
+        config = TrinoUsageConfig.model_validate(config_dict)
         return cls(ctx, config)

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
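The `.model_validate(...)` calls introduced in these `create` classmethods (and in `UnityCatalogSource.create` above) are Pydantic v2's replacement for v1's `.parse_obj(...)`: both build a validated config object from a plain dict. A minimal sketch with a hypothetical model:

    from pydantic import BaseModel

    class ExampleConfig(BaseModel):  # hypothetical, for illustration
        host: str
        port: int = 8080

    cfg = ExampleConfig.model_validate({"host": "localhost"})  # pydantic v2
    # v1 equivalent (removed in v2): ExampleConfig.parse_obj({"host": "localhost"})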