acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (203)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2582 -2582
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +203 -201
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dremio/dremio_source.py +15 -15
  66. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  67. datahub/ingestion/source/elastic_search.py +4 -3
  68. datahub/ingestion/source/excel/source.py +1 -1
  69. datahub/ingestion/source/feast.py +1 -1
  70. datahub/ingestion/source/file.py +5 -4
  71. datahub/ingestion/source/fivetran/config.py +17 -16
  72. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  73. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  74. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  75. datahub/ingestion/source/ge_profiling_config.py +8 -5
  76. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  77. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  78. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  79. datahub/ingestion/source/grafana/models.py +23 -5
  80. datahub/ingestion/source/hex/api.py +7 -5
  81. datahub/ingestion/source/hex/hex.py +4 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  83. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  84. datahub/ingestion/source/identity/azure_ad.py +1 -1
  85. datahub/ingestion/source/identity/okta.py +10 -10
  86. datahub/ingestion/source/kafka/kafka.py +1 -1
  87. datahub/ingestion/source/ldap.py +1 -1
  88. datahub/ingestion/source/looker/looker_common.py +7 -5
  89. datahub/ingestion/source/looker/looker_config.py +21 -20
  90. datahub/ingestion/source/looker/lookml_config.py +47 -47
  91. datahub/ingestion/source/metabase.py +8 -8
  92. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  93. datahub/ingestion/source/metadata/lineage.py +13 -8
  94. datahub/ingestion/source/mlflow.py +1 -1
  95. datahub/ingestion/source/mode.py +6 -4
  96. datahub/ingestion/source/mongodb.py +4 -3
  97. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  98. datahub/ingestion/source/nifi.py +17 -23
  99. datahub/ingestion/source/openapi.py +6 -8
  100. datahub/ingestion/source/powerbi/config.py +33 -32
  101. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  102. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  103. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  104. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  105. datahub/ingestion/source/preset.py +8 -8
  106. datahub/ingestion/source/pulsar.py +1 -1
  107. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  108. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  110. datahub/ingestion/source/redshift/config.py +18 -20
  111. datahub/ingestion/source/redshift/redshift.py +2 -2
  112. datahub/ingestion/source/redshift/usage.py +23 -3
  113. datahub/ingestion/source/s3/config.py +83 -62
  114. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  115. datahub/ingestion/source/s3/source.py +8 -5
  116. datahub/ingestion/source/sac/sac.py +5 -4
  117. datahub/ingestion/source/salesforce.py +3 -2
  118. datahub/ingestion/source/schema/json_schema.py +2 -2
  119. datahub/ingestion/source/sigma/data_classes.py +3 -2
  120. datahub/ingestion/source/sigma/sigma.py +1 -1
  121. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  122. datahub/ingestion/source/slack/slack.py +1 -1
  123. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  125. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  127. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  128. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  129. datahub/ingestion/source/sql/athena.py +1 -1
  130. datahub/ingestion/source/sql/clickhouse.py +4 -2
  131. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  132. datahub/ingestion/source/sql/druid.py +1 -1
  133. datahub/ingestion/source/sql/hana.py +1 -1
  134. datahub/ingestion/source/sql/hive.py +7 -5
  135. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  136. datahub/ingestion/source/sql/mssql/source.py +13 -6
  137. datahub/ingestion/source/sql/mysql.py +1 -1
  138. datahub/ingestion/source/sql/oracle.py +17 -10
  139. datahub/ingestion/source/sql/postgres.py +2 -2
  140. datahub/ingestion/source/sql/presto.py +1 -1
  141. datahub/ingestion/source/sql/sql_config.py +8 -9
  142. datahub/ingestion/source/sql/sql_generic.py +1 -1
  143. datahub/ingestion/source/sql/teradata.py +1 -1
  144. datahub/ingestion/source/sql/trino.py +1 -1
  145. datahub/ingestion/source/sql/vertica.py +5 -4
  146. datahub/ingestion/source/sql_queries.py +11 -8
  147. datahub/ingestion/source/state/checkpoint.py +2 -2
  148. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  149. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  150. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  152. datahub/ingestion/source/superset.py +9 -9
  153. datahub/ingestion/source/tableau/tableau.py +14 -16
  154. datahub/ingestion/source/unity/azure_auth_config.py +15 -0
  155. datahub/ingestion/source/unity/config.py +51 -34
  156. datahub/ingestion/source/unity/connection.py +7 -1
  157. datahub/ingestion/source/unity/connection_test.py +1 -1
  158. datahub/ingestion/source/unity/proxy.py +216 -7
  159. datahub/ingestion/source/unity/proxy_types.py +91 -0
  160. datahub/ingestion/source/unity/source.py +29 -3
  161. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  162. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  163. datahub/ingestion/source/usage/usage_common.py +5 -3
  164. datahub/ingestion/source_config/csv_enricher.py +7 -6
  165. datahub/ingestion/source_config/operation_config.py +7 -4
  166. datahub/ingestion/source_config/pulsar.py +11 -15
  167. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  168. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  169. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  171. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  172. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  173. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  174. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  175. datahub/ingestion/transformer/dataset_domain.py +3 -3
  176. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  177. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  178. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  179. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  180. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  181. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  182. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  183. datahub/ingestion/transformer/replace_external_url.py +2 -2
  184. datahub/ingestion/transformer/set_browse_path.py +1 -1
  185. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  186. datahub/lite/duckdb_lite.py +1 -1
  187. datahub/lite/lite_util.py +2 -2
  188. datahub/metadata/schema.avsc +7 -2
  189. datahub/metadata/schemas/QuerySubjects.avsc +1 -1
  190. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +6 -1
  191. datahub/sdk/__init__.py +1 -0
  192. datahub/sdk/_all_entities.py +2 -0
  193. datahub/sdk/search_filters.py +68 -40
  194. datahub/sdk/tag.py +112 -0
  195. datahub/secret/datahub_secret_store.py +7 -4
  196. datahub/secret/file_secret_store.py +1 -1
  197. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  198. datahub/testing/check_sql_parser_result.py +2 -2
  199. datahub/utilities/ingest_utils.py +1 -1
  200. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
  201. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
  202. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
  203. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/proxy.py
@@ -3,6 +3,7 @@ Manage the communication with DataBricks Server and provide equivalent dataclass
 """
 
 import dataclasses
+import json
 import logging
 import os
 from concurrent.futures import ThreadPoolExecutor
@@ -11,6 +12,7 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast
 from unittest.mock import patch
 
 import cachetools
+import yaml
 from cachetools import cached
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.catalog import (
@@ -23,7 +25,11 @@ from databricks.sdk.service.catalog import (
     SchemaInfo,
     TableInfo,
 )
+from databricks.sdk.service.files import DownloadResponse, FilesAPI
 from databricks.sdk.service.iam import ServicePrincipal as DatabricksServicePrincipal
+from databricks.sdk.service.ml import (
+    ExperimentsAPI,
+)
 from databricks.sdk.service.sql import (
     QueryFilter,
     QueryInfo,
@@ -38,6 +44,7 @@ from typing_extensions import assert_never
 from datahub._version import nice_version_name
 from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
+from datahub.ingestion.source.unity.azure_auth_config import AzureAuthConfig
 from datahub.ingestion.source.unity.config import (
     LineageDataSource,
     UsageDataSource,
@@ -54,6 +61,8 @@ from datahub.ingestion.source.unity.proxy_types import (
     ExternalTableReference,
     Metastore,
     Model,
+    ModelRunDetails,
+    ModelSignature,
     ModelVersion,
     Notebook,
     NotebookReference,
@@ -155,30 +164,44 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     _workspace_url: str
     report: UnityCatalogReport
     warehouse_id: str
+    _experiments_api: ExperimentsAPI
+    _files_api: FilesAPI
 
     def __init__(
         self,
         workspace_url: str,
-        personal_access_token: str,
         warehouse_id: Optional[str],
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
         lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
         usage_data_source: UsageDataSource = UsageDataSource.AUTO,
         databricks_api_page_size: int = 0,
+        personal_access_token: Optional[str] = None,
+        azure_auth: Optional[AzureAuthConfig] = None,
     ):
-        self._workspace_client = WorkspaceClient(
-            host=workspace_url,
-            token=personal_access_token,
-            product="datahub",
-            product_version=nice_version_name(),
-        )
+        if azure_auth:
+            self._workspace_client = WorkspaceClient(
+                host=workspace_url,
+                azure_tenant_id=azure_auth.tenant_id,
+                azure_client_id=azure_auth.client_id,
+                azure_client_secret=azure_auth.client_secret.get_secret_value(),
+                product="datahub",
+                product_version=nice_version_name(),
+            )
+        else:
+            self._workspace_client = WorkspaceClient(
+                host=workspace_url,
+                token=personal_access_token,
+                product="datahub",
+                product_version=nice_version_name(),
+            )
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
         self.lineage_data_source = lineage_data_source
         self.usage_data_source = usage_data_source
         self.databricks_api_page_size = databricks_api_page_size
+        self._workspace_url = workspace_url
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(
                 "https://", ""
@@ -187,6 +210,179 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             "access_token": self._workspace_client.config.token,
             "user_agent_entry": "datahub",
         }
+        # Initialize MLflow APIs
+        self._experiments_api = ExperimentsAPI(self._workspace_client.api_client)
+        self._files_api = FilesAPI(self._workspace_client.api_client)
+
+    def get_run_details(self, run_id: str) -> Optional[ModelRunDetails]:
+        """
+        Get comprehensive details from an MLflow run.
+
+        Args:
+            run_id: The MLflow run ID
+
+        Returns:
+            ModelRunDetails object with comprehensive run information
+        """
+        try:
+            run_response = self._experiments_api.get_run(run_id)
+            run = run_response.run
+
+            if (
+                not run
+                or not run.info
+                or not run.info.run_id
+                or not run.info.experiment_id
+            ):
+                return None
+
+            # Extract metrics
+            metrics: Dict[str, Any] = {}
+            if run.data and run.data.metrics:
+                for metric in run.data.metrics:
+                    if metric.key is not None:
+                        metrics[metric.key] = metric.value
+
+            # Extract parameters
+            parameters: Dict[str, Any] = {}
+            if run.data and run.data.params:
+                for param in run.data.params:
+                    if param.key is not None:
+                        parameters[param.key] = param.value
+
+            # Extract tags
+            tags: Dict[str, str] = {}
+            if run.data and run.data.tags:
+                for tag in run.data.tags:
+                    if tag.key is not None and tag.value is not None:
+                        tags[tag.key] = tag.value
+
+            return ModelRunDetails(
+                run_id=run.info.run_id,
+                experiment_id=run.info.experiment_id,
+                status=run.info.status.value if run.info.status else None,
+                start_time=parse_ts_millis(run.info.start_time),
+                end_time=parse_ts_millis(run.info.end_time),
+                user_id=run.info.user_id,
+                metrics=metrics,
+                parameters=parameters,
+                tags=tags,
+            )
+        except Exception as e:
+            logger.warning(
+                f"Unable to get run details for MLflow experiment, run-id: {run_id}",
+                exc_info=True,
+            )
+            self.report.report_warning(
+                title="Unable to get run details for MLflow experiment",
+                message="Error while getting run details for MLflow experiment",
+                context=f"run-id: {run_id}",
+                exc=e,
+            )
+            return None
+
+    def _extract_signature_from_files_api(
+        self, model_version: ModelVersionInfo
+    ) -> Optional[ModelSignature]:
+        """
+        Extract signature from MLmodel file using Databricks FilesAPI.
+        Uses the API endpoint: /api/2.0/fs/files/Models/{catalog}/{schema}/{model}/{version}/MLmodel
+
+        Args:
+            model_version: Unity Catalog ModelVersionInfo object with catalog_name, schema_name, model_name, version
+
+        Returns:
+            ModelSignature if found, None otherwise
+        """
+        try:
+            # Construct file path for FilesAPI
+            # The correct path format is: /Models/{catalog}/{schema}/{model}/{version}/MLmodel
+            file_path = (
+                f"/Models/{model_version.catalog_name}/{model_version.schema_name}/"
+                f"{model_version.model_name}/{model_version.version}/MLmodel"
+            )
+
+            logger.debug(f"Downloading MLmodel from FilesAPI: {file_path}")
+
+            # Download the file using FilesAPI
+            download_response: DownloadResponse = self._files_api.download(
+                file_path=file_path
+            )
+
+            # Read the file content
+            # DownloadResponse.contents is a BinaryIO object
+            if download_response and download_response.contents:
+                content_stream = download_response.contents
+
+                # Read from the binary stream
+                if content_stream:
+                    mlmodel_content: str = content_stream.read().decode("utf-8")
+
+                    logger.debug(
+                        f"MLmodel file contents from FilesAPI ({file_path}):\n{mlmodel_content}"
+                    )
+
+                    # Parse YAML content
+                    mlmodel_data = yaml.safe_load(mlmodel_content)
+
+                    # Extract signature from MLmodel YAML
+                    if mlmodel_data and "signature" in mlmodel_data:
+                        signature_raw = mlmodel_data["signature"]
+
+                        # Signature inputs and outputs are stored as JSON strings in the YAML
+                        # Parse them into proper dict/list format
+                        signature_data = {}
+                        if "inputs" in signature_raw:
+                            try:
+                                signature_data["inputs"] = json.loads(
+                                    signature_raw["inputs"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse inputs JSON: {e}")
+
+                        if "outputs" in signature_raw:
+                            try:
+                                signature_data["outputs"] = json.loads(
+                                    signature_raw["outputs"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse outputs JSON: {e}")
+
+                        if "params" in signature_raw:
+                            try:
+                                signature_data["params"] = json.loads(
+                                    signature_raw["params"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse params JSON: {e}")
+
+                        return ModelSignature(
+                            inputs=signature_data.get("inputs"),
+                            outputs=signature_data.get("outputs"),
+                            parameters=signature_data.get("params"),
+                        )
+                    else:
+                        logger.debug(
+                            f"No signature found in MLmodel data from {file_path}"
+                        )
+                        return None
+
+            return None
+
+        except Exception as e:
+            model_name = getattr(model_version, "model_name", "unknown")
+            version_num = getattr(model_version, "version", "unknown")
+            self.report.report_warning(
+                title="Unable to extract signature from MLmodel file",
+                message="Error while extracting signature from MLmodel file",
+                context=f"model-name: {model_name}, model-version: {version_num}",
+                exc=e,
+            )
+            logger.warning(
+                f"Unable to extract signature from MLmodel file, model-name: {model_name}, model-version: {version_num}",
+                exc_info=True,
+            )
+            return None
 
     def check_basic_connectivity(self) -> bool:
         return bool(
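A note on the parsing logic above: MLflow's MLmodel file is YAML, but the signature's inputs/outputs/params values are themselves JSON-encoded strings, which is why the method applies yaml.safe_load first and json.loads second. A small illustration, with invented column names, of that two-stage decode:

# Illustration only; the column names and shapes are made up.
import json

import yaml

mlmodel_content = """
signature:
  inputs: '[{"name": "age", "type": "long"}, {"name": "fare", "type": "double"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
"""

mlmodel_data = yaml.safe_load(mlmodel_content)      # YAML layer
signature_raw = mlmodel_data["signature"]
inputs = json.loads(signature_raw["inputs"])        # JSON layer: list of input column specs
outputs = json.loads(signature_raw["outputs"])      # JSON layer: list of output specs
print(inputs[0]["name"])  # -> age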
@@ -1019,6 +1215,17 @@
             for alias in obj.aliases:
                 if alias.alias_name:
                     aliases.append(alias.alias_name)
+
+        run_details: Optional[ModelRunDetails] = None
+        # Fetch run details if run_id exists
+        if obj.run_id:
+            run_details = self.get_run_details(obj.run_id)
+
+        # Extract signature separately from Files API
+        signature: Optional[ModelSignature] = self._extract_signature_from_files_api(
+            obj
+        )
+
         return ModelVersion(
             id=f"{model.id}_{obj.version}",
             name=f"{model.name}_{obj.version}",
@@ -1029,6 +1236,8 @@
             created_at=parse_ts_millis(obj.created_at),
             updated_at=parse_ts_millis(obj.updated_at),
             created_by=obj.created_by,
+            run_details=run_details,
+            signature=signature,
         )
 
     def _create_service_principal(
datahub/ingestion/source/unity/proxy_types.py
@@ -339,8 +339,75 @@ class Notebook:
         )
 
 
+@dataclass
+class ModelSignature:
+    """
+    Represents the model signature with input and output schemas extracted from MLflow.
+
+    In Unity Catalog, model signatures define the expected input/output formats for ML models.
+    Model signature is stored in the MLmodel YAML file.
+
+    Attributes:
+        inputs: List of input schema specifications, each containing name, type, dtype, shape
+        outputs: List of output schema specifications, each containing name, type, dtype, shape
+        parameters: List of model parameters
+    """
+
+    inputs: Optional[List[Dict[str, str]]]
+    outputs: Optional[List[Dict[str, str]]]
+    parameters: Optional[List[Dict[str, str]]]
+
+
+@dataclass
+class ModelRunDetails:
+    """
+    Represents comprehensive details from an MLflow run associated with a Unity Catalog model version.
+
+    In Unity Catalog, each model version is linked to an MLflow run via run_id. This dataclass
+    contains all the metadata extracted from that MLflow run, including metrics, parameters,
+    and tags.
+
+    Attributes:
+        run_id: MLflow run ID
+        experiment_id: MLflow experiment ID
+        status: Run status (e.g., "FINISHED", "RUNNING")
+        start_time: Run start timestamp (milliseconds since epoch)
+        end_time: Run end timestamp (milliseconds since epoch)
+        user_id: User who initiated the run
+        metrics: Training metrics (e.g., accuracy, loss)
+        parameters: Hyperparameters used for training
+        tags: Run tags/metadata
+    """
+
+    run_id: str
+    experiment_id: str
+    status: Optional[str]
+    start_time: Optional[datetime]
+    end_time: Optional[datetime]
+    user_id: Optional[str]
+    metrics: Optional[Dict[str, str]]
+    parameters: Optional[Dict[str, str]]
+    tags: Optional[Dict[str, str]]
+
+
 @dataclass
 class Model:
+    """
+    Represents a Unity Catalog registered ML model (model group).
+
+    In Unity Catalog, a registered model is a collection of model versions.
+    This dataclass corresponds to a Unity Catalog RegisteredModelInfo.
+
+    Attributes:
+        id: Full qualified name (e.g., "catalog.schema.model_name")
+        name: Model name without catalog/schema prefix
+        schema_name: Schema name containing the model
+        catalog_name: Catalog name containing the model
+        description: Model description/comment
+        created_at: Model creation timestamp
+        updated_at: Last update timestamp
+    """
+
     id: str
     name: str
     schema_name: str
@@ -352,6 +419,28 @@ class Model:
 
 @dataclass
 class ModelVersion:
+    """
+    Represents a specific version of a Unity Catalog registered ML model.
+
+    In Unity Catalog, each model version is linked to an MLflow run (via run_id).
+    This dataclass corresponds to a Unity Catalog ModelVersionInfo.
+
+    Attributes:
+        id: Unique identifier combining model ID and version (e.g., "catalog.schema.model_1")
+        name: Versioned model name
+        model: Reference to the parent Model (model group)
+        version: Version number as string
+        aliases: List of aliases (e.g., ["prod", "latest"])
+        description: Version description/comment
+        created_at: Version creation timestamp
+        updated_at: Last update timestamp
+        created_by: User who created this version
+        run_details: Comprehensive MLflow run details (metrics, parameters, tags)
+            extracted from the MLflow run linked to this model version.
+        signature: Model signature extracted from the MLmodel file via Files API.
+            Contains input/output schema specifications and parameters.
+    """
+
     id: str
     name: str
     model: Model
@@ -361,3 +450,5 @@ class ModelVersion:
     created_at: Optional[datetime]
    updated_at: Optional[datetime]
     created_by: Optional[str]
+    run_details: Optional["ModelRunDetails"]
+    signature: Optional["ModelSignature"]
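A short sketch (illustrative values, not from the diff) of the two new dataclasses that now hang off ModelVersion via its run_details and signature fields:

from datetime import datetime, timezone

from datahub.ingestion.source.unity.proxy_types import ModelRunDetails, ModelSignature

# Invented example values; field names follow the dataclass definitions above.
run_details = ModelRunDetails(
    run_id="a1b2c3d4",
    experiment_id="42",
    status="FINISHED",
    start_time=datetime(2024, 1, 2, tzinfo=timezone.utc),
    end_time=datetime(2024, 1, 2, 1, tzinfo=timezone.utc),
    user_id="alice@example.com",
    metrics={"accuracy": "0.93"},
    parameters={"max_depth": "6"},
    tags={"mlflow.source.type": "NOTEBOOK"},
)

signature = ModelSignature(
    inputs=[{"name": "age", "type": "long"}],
    outputs=[{"type": "double"}],
    parameters=None,
)
# Both objects are attached to a ModelVersion as run_details=... and signature=...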
datahub/ingestion/source/unity/source.py
@@ -1,7 +1,9 @@
+import dataclasses
+import json
 import logging
 import re
 import time
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union, cast
 from urllib.parse import urljoin
 
 from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
@@ -209,13 +211,14 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
 
         self.unity_catalog_api_proxy = UnityCatalogApiProxy(
             config.workspace_url,
-            config.token,
             config.warehouse_id,
             report=self.report,
             hive_metastore_proxy=self.hive_metastore_proxy,
             lineage_data_source=config.lineage_data_source,
             usage_data_source=config.usage_data_source,
             databricks_api_page_size=config.databricks_api_page_size,
+            personal_access_token=config.token if config.token else None,
+            azure_auth=config.azure_auth if config.azure_auth else None,
         )
 
         self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
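For orientation (not part of the diff): a minimal sketch of how the proxy constructor shown earlier might be exercised with the two credential paths the source now forwards. The AzureAuthConfig keyword arguments follow the attribute accesses in the diff; the UnityCatalogReport import path and its no-argument constructor are assumptions.

# Hypothetical usage sketch, assuming the constructor signature shown in the diff above.
from datahub.ingestion.source.unity.azure_auth_config import AzureAuthConfig
from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
from datahub.ingestion.source.unity.report import UnityCatalogReport  # assumed import path

workspace_url = "https://adb-1234567890123456.7.azuredatabricks.net"

# Path 1: Azure service principal (new in this release).
azure_proxy = UnityCatalogApiProxy(
    workspace_url,
    warehouse_id=None,
    report=UnityCatalogReport(),
    azure_auth=AzureAuthConfig(
        tenant_id="00000000-0000-0000-0000-000000000000",
        client_id="11111111-1111-1111-1111-111111111111",
        client_secret="<client-secret>",  # assumed to be coerced to SecretStr by pydantic
    ),
)

# Path 2: classic personal access token, now an optional keyword argument.
pat_proxy = UnityCatalogApiProxy(
    workspace_url,
    warehouse_id=None,
    report=UnityCatalogReport(),
    personal_access_token="<databricks-pat>",
)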
@@ -317,7 +320,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = UnityCatalogSourceConfig.parse_obj(config_dict)
+        config = UnityCatalogSourceConfig.model_validate(config_dict)
         return cls(ctx=ctx, config=config)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
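The parse_obj to model_validate rename above recurs across many sources and transformers in this release; it is simply the Pydantic v2 name for the same dict-to-model validation. A tiny standalone sketch with a throwaway model (not from the diff):

from typing import Optional

from pydantic import BaseModel


class ExampleConfig(BaseModel):  # stand-in model, not from the diff
    workspace_url: str
    warehouse_id: Optional[str] = None


# Pydantic v1 spelling (deprecated in v2): ExampleConfig.parse_obj({...})
config = ExampleConfig.model_validate({"workspace_url": "https://example.cloud.databricks.com"})
print(config.warehouse_id)  # -> None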
@@ -741,6 +744,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                     created=TimeStampClass(time=created_time, actor=created_actor),
                 )
             )
+        custom_properties = {}
+        if ml_model_version.signature:
+            for key, value in dataclasses.asdict(ml_model_version.signature).items():
+                if value:
+                    custom_properties[f"signature.{key}"] = json.dumps(value)
+
+        if ml_model_version.run_details:
+            if ml_model_version.run_details.tags:
+                for key, value in ml_model_version.run_details.tags.items():
+                    if value:
+                        custom_properties[key] = json.dumps(value)
 
         ml_model = MLModel(
             id=ml_model_version.id,
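To make the resulting property shape concrete, a small worked example (invented values) of what the loop above produces for a signature with a single input column:

import dataclasses
import json

from datahub.ingestion.source.unity.proxy_types import ModelSignature

signature = ModelSignature(
    inputs=[{"name": "age", "type": "long"}],  # invented example column
    outputs=None,
    parameters=None,
)

custom_properties = {}
for key, value in dataclasses.asdict(signature).items():
    if value:
        custom_properties[f"signature.{key}"] = json.dumps(value)

print(custom_properties)
# -> {'signature.inputs': '[{"name": "age", "type": "long"}]'}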
@@ -751,6 +765,18 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             model_group=ml_model_urn,
             platform=self.platform,
             last_modified=ml_model_version.updated_at,
+            training_metrics=cast(
+                Optional[Dict[str, Optional[str]]], ml_model_version.run_details.metrics
+            )
+            if ml_model_version.run_details and ml_model_version.run_details.metrics
+            else None,
+            hyper_params=cast(
+                Optional[Dict[str, Optional[str]]],
+                ml_model_version.run_details.parameters,
+            )
+            if ml_model_version.run_details and ml_model_version.run_details.parameters
+            else None,
+            custom_properties=custom_properties if custom_properties else None,
             extra_aspects=extra_aspects,
         )
 
datahub/ingestion/source/usage/clickhouse_usage.py
@@ -115,7 +115,7 @@ class ClickHouseUsageSource(Source):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = ClickHouseUsageConfig.parse_obj(config_dict)
+        config = ClickHouseUsageConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:

datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -133,7 +133,7 @@ class TrinoUsageSource(Source):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = TrinoUsageConfig.parse_obj(config_dict)
+        config = TrinoUsageConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/usage/usage_common.py
@@ -15,6 +15,7 @@ from typing import (
 )
 
 import pydantic
+from pydantic import ValidationInfo, field_validator
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
@@ -226,10 +227,11 @@ class BaseUsageConfig(BaseTimeWindowConfig):
         default=True, description="Whether to ingest the top_n_queries."
     )
 
-    @pydantic.validator("top_n_queries")
-    def ensure_top_n_queries_is_not_too_big(cls, v: int, values: dict) -> int:
+    @field_validator("top_n_queries", mode="after")
+    @classmethod
+    def ensure_top_n_queries_is_not_too_big(cls, v: int, info: ValidationInfo) -> int:
         minimum_query_size = 20
-
+        values = info.data
         max_queries = int(values["queries_character_limit"] / minimum_query_size)
         if v > max_queries:
             raise ValueError(
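This is the Pydantic v2 migration pattern applied throughout the release: @pydantic.validator becomes @field_validator(..., mode="after") plus @classmethod, and cross-field access moves from the values dict to ValidationInfo.data. A self-contained sketch with a made-up model (not the real BaseUsageConfig):

from pydantic import BaseModel, ValidationInfo, field_validator


class UsageLikeConfig(BaseModel):  # illustrative stand-in
    queries_character_limit: int = 24000
    top_n_queries: int = 10

    @field_validator("top_n_queries", mode="after")
    @classmethod
    def ensure_top_n_queries_is_not_too_big(cls, v: int, info: ValidationInfo) -> int:
        # info.data only contains fields declared (and validated) before this one.
        max_queries = info.data["queries_character_limit"] // 20
        if v > max_queries:
            raise ValueError(f"top_n_queries is too big; maximum allowed is {max_queries}")
        return v


UsageLikeConfig(top_n_queries=100)      # ok: 100 <= 24000 // 20
# UsageLikeConfig(top_n_queries=5000)   # raises a ValidationError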
datahub/ingestion/source_config/csv_enricher.py
@@ -1,6 +1,5 @@
-from typing import Any, Dict
-
 import pydantic
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 
@@ -21,7 +20,8 @@ class CSVEnricherConfig(ConfigModel):
         description="Delimiter to use when parsing array fields (tags, terms and owners)",
     )
 
-    @pydantic.validator("write_semantics")
+    @field_validator("write_semantics", mode="after")
+    @classmethod
     def validate_write_semantics(cls, write_semantics: str) -> str:
         if write_semantics.lower() not in {"patch", "override"}:
             raise ValueError(
@@ -31,9 +31,10 @@ class CSVEnricherConfig(ConfigModel):
             )
         return write_semantics
 
-    @pydantic.validator("array_delimiter")
-    def validator_diff(cls, array_delimiter: str, values: Dict[str, Any]) -> str:
-        if array_delimiter == values["delimiter"]:
+    @field_validator("array_delimiter", mode="after")
+    @classmethod
+    def validator_diff(cls, array_delimiter: str, info: pydantic.ValidationInfo) -> str:
+        if array_delimiter == info.data["delimiter"]:
             raise ValueError(
                 "array_delimiter and delimiter are the same. Please choose different delimiters."
             )
datahub/ingestion/source_config/operation_config.py
@@ -3,7 +3,7 @@ import logging
 from typing import Any, Dict, Optional
 
 import cachetools
-import pydantic
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import ConfigModel
@@ -26,7 +26,8 @@ class OperationConfig(ConfigModel):
         description="Number between 1 to 31 for date of month (both inclusive). If not specified, defaults to Nothing and this field does not take affect.",
     )
 
-    @pydantic.root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def lower_freq_configs_are_set(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         lower_freq_profile_enabled = values.get("lower_freq_profile_enabled")
         profile_day_of_week = values.get("profile_day_of_week")
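The root_validator(pre=True) case is migrated differently: it becomes @model_validator(mode="before") plus @classmethod and still receives the raw input dict, so the original body can carry over unchanged. A minimal standalone sketch (a simplified stand-in, not the real OperationConfig):

from typing import Any, Dict, Optional

from pydantic import BaseModel, model_validator


class ProfilingLikeConfig(BaseModel):  # illustrative stand-in
    lower_freq_profile_enabled: bool = False
    profile_day_of_week: Optional[int] = None

    @model_validator(mode="before")
    @classmethod
    def lower_freq_configs_are_set(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # mode="before" runs on the raw input, mirroring root_validator(pre=True).
        if values.get("lower_freq_profile_enabled") and values.get("profile_day_of_week") is None:
            raise ValueError("profile_day_of_week must be set when lower_freq_profile_enabled is true")
        return values


ProfilingLikeConfig(lower_freq_profile_enabled=True, profile_day_of_week=1)  # ok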
@@ -41,7 +42,8 @@ class OperationConfig(ConfigModel):
             )
         return values
 
-    @pydantic.validator("profile_day_of_week")
+    @field_validator("profile_day_of_week", mode="after")
+    @classmethod
     def validate_profile_day_of_week(cls, v: Optional[int]) -> Optional[int]:
         profile_day_of_week = v
         if profile_day_of_week is None:
@@ -52,7 +54,8 @@ class OperationConfig(ConfigModel):
             )
         return profile_day_of_week
 
-    @pydantic.validator("profile_date_of_month")
+    @field_validator("profile_date_of_month", mode="after")
+    @classmethod
     def validate_profile_date_of_month(cls, v: Optional[int]) -> Optional[int]:
         profile_date_of_month = v
         if profile_date_of_month is None:
datahub/ingestion/source_config/pulsar.py
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
 from urllib.parse import urlparse
 
 import pydantic
-from pydantic import Field, validator
+from pydantic import Field, model_validator
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
@@ -100,27 +100,23 @@ class PulsarSourceConfig(
         default_factory=dict, description="Placeholder for OpenId discovery document"
     )
 
-    @validator("token")
-    def ensure_only_issuer_or_token(
-        cls, token: Optional[str], values: Dict[str, Optional[str]]
-    ) -> Optional[str]:
-        if token is not None and values.get("issuer_url") is not None:
+    @model_validator(mode="after")
+    def ensure_only_issuer_or_token(self) -> "PulsarSourceConfig":
+        if self.token is not None and self.issuer_url is not None:
             raise ValueError(
                 "Expected only one authentication method, either issuer_url or token."
             )
-        return token
-
-    @validator("client_secret", always=True)
-    def ensure_client_id_and_secret_for_issuer_url(
-        cls, client_secret: Optional[str], values: Dict[str, Optional[str]]
-    ) -> Optional[str]:
-        if values.get("issuer_url") is not None and (
-            client_secret is None or values.get("client_id") is None
+        return self
+
+    @model_validator(mode="after")
+    def ensure_client_id_and_secret_for_issuer_url(self) -> "PulsarSourceConfig":
+        if self.issuer_url is not None and (
+            self.client_secret is None or self.client_id is None
         ):
             raise ValueError(
                 "Missing configuration: client_id and client_secret are mandatory when issuer_url is set."
             )
-        return client_secret
+        return self
 
     @pydantic.field_validator("web_service_url", mode="after")
     @classmethod
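Where the old validator compared several fields at once (token vs issuer_url, client_id/client_secret), the release switches to a whole-model @model_validator(mode="after"), which runs on the fully constructed instance. A minimal standalone sketch (not the real PulsarSourceConfig):

from typing import Optional

from pydantic import BaseModel, model_validator


class AuthLikeConfig(BaseModel):  # illustrative stand-in
    token: Optional[str] = None
    issuer_url: Optional[str] = None
    client_id: Optional[str] = None
    client_secret: Optional[str] = None

    @model_validator(mode="after")
    def ensure_only_issuer_or_token(self) -> "AuthLikeConfig":
        # mode="after" receives the validated instance, so every field is available on self.
        if self.token is not None and self.issuer_url is not None:
            raise ValueError("Expected only one authentication method, either issuer_url or token.")
        return self


AuthLikeConfig(token="abc")  # ok
# AuthLikeConfig(token="abc", issuer_url="https://login.example.com")  # raises a ValidationError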
datahub/ingestion/transformer/add_dataset_browse_path.py
@@ -32,7 +32,7 @@ class AddDatasetBrowsePathTransformer(DatasetBrowsePathsTransformer):
     def create(
         cls, config_dict: dict, ctx: PipelineContext
     ) -> "AddDatasetBrowsePathTransformer":
-        config = AddDatasetBrowsePathConfig.parse_obj(config_dict)
+        config = AddDatasetBrowsePathConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod