acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (193)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  66. datahub/ingestion/source/elastic_search.py +4 -3
  67. datahub/ingestion/source/excel/source.py +1 -1
  68. datahub/ingestion/source/feast.py +1 -1
  69. datahub/ingestion/source/file.py +5 -4
  70. datahub/ingestion/source/fivetran/config.py +17 -16
  71. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  72. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  73. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  74. datahub/ingestion/source/ge_profiling_config.py +8 -5
  75. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  76. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  77. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  78. datahub/ingestion/source/grafana/models.py +23 -5
  79. datahub/ingestion/source/hex/api.py +7 -5
  80. datahub/ingestion/source/hex/hex.py +4 -3
  81. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  82. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  83. datahub/ingestion/source/identity/azure_ad.py +1 -1
  84. datahub/ingestion/source/identity/okta.py +10 -10
  85. datahub/ingestion/source/kafka/kafka.py +1 -1
  86. datahub/ingestion/source/ldap.py +1 -1
  87. datahub/ingestion/source/looker/looker_common.py +7 -5
  88. datahub/ingestion/source/looker/looker_config.py +21 -20
  89. datahub/ingestion/source/looker/lookml_config.py +47 -47
  90. datahub/ingestion/source/metabase.py +8 -8
  91. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  92. datahub/ingestion/source/metadata/lineage.py +13 -8
  93. datahub/ingestion/source/mlflow.py +1 -1
  94. datahub/ingestion/source/mode.py +6 -4
  95. datahub/ingestion/source/mongodb.py +4 -3
  96. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  97. datahub/ingestion/source/nifi.py +17 -23
  98. datahub/ingestion/source/openapi.py +6 -8
  99. datahub/ingestion/source/powerbi/config.py +33 -32
  100. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  101. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  103. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  104. datahub/ingestion/source/preset.py +8 -8
  105. datahub/ingestion/source/pulsar.py +1 -1
  106. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  107. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  108. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  109. datahub/ingestion/source/redshift/config.py +18 -20
  110. datahub/ingestion/source/redshift/redshift.py +2 -2
  111. datahub/ingestion/source/redshift/usage.py +23 -3
  112. datahub/ingestion/source/s3/config.py +83 -62
  113. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  114. datahub/ingestion/source/s3/source.py +8 -5
  115. datahub/ingestion/source/sac/sac.py +5 -4
  116. datahub/ingestion/source/salesforce.py +3 -2
  117. datahub/ingestion/source/schema/json_schema.py +2 -2
  118. datahub/ingestion/source/sigma/data_classes.py +3 -2
  119. datahub/ingestion/source/sigma/sigma.py +1 -1
  120. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  121. datahub/ingestion/source/slack/slack.py +1 -1
  122. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  123. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  125. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  126. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  128. datahub/ingestion/source/sql/athena.py +1 -1
  129. datahub/ingestion/source/sql/clickhouse.py +4 -2
  130. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  131. datahub/ingestion/source/sql/druid.py +1 -1
  132. datahub/ingestion/source/sql/hana.py +1 -1
  133. datahub/ingestion/source/sql/hive.py +7 -5
  134. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  135. datahub/ingestion/source/sql/mssql/source.py +13 -6
  136. datahub/ingestion/source/sql/mysql.py +1 -1
  137. datahub/ingestion/source/sql/oracle.py +17 -10
  138. datahub/ingestion/source/sql/postgres.py +2 -2
  139. datahub/ingestion/source/sql/presto.py +1 -1
  140. datahub/ingestion/source/sql/sql_config.py +8 -9
  141. datahub/ingestion/source/sql/sql_generic.py +1 -1
  142. datahub/ingestion/source/sql/teradata.py +1 -1
  143. datahub/ingestion/source/sql/trino.py +1 -1
  144. datahub/ingestion/source/sql/vertica.py +5 -4
  145. datahub/ingestion/source/sql_queries.py +11 -8
  146. datahub/ingestion/source/state/checkpoint.py +2 -2
  147. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  148. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  149. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/superset.py +9 -9
  152. datahub/ingestion/source/tableau/tableau.py +14 -16
  153. datahub/ingestion/source/unity/config.py +33 -34
  154. datahub/ingestion/source/unity/proxy.py +203 -0
  155. datahub/ingestion/source/unity/proxy_types.py +91 -0
  156. datahub/ingestion/source/unity/source.py +27 -2
  157. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  159. datahub/ingestion/source/usage/usage_common.py +5 -3
  160. datahub/ingestion/source_config/csv_enricher.py +7 -6
  161. datahub/ingestion/source_config/operation_config.py +7 -4
  162. datahub/ingestion/source_config/pulsar.py +11 -15
  163. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  164. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  165. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  166. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  167. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  168. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  169. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  171. datahub/ingestion/transformer/dataset_domain.py +3 -3
  172. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  173. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  174. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  175. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  176. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  177. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  178. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  179. datahub/ingestion/transformer/replace_external_url.py +2 -2
  180. datahub/ingestion/transformer/set_browse_path.py +1 -1
  181. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  182. datahub/lite/duckdb_lite.py +1 -1
  183. datahub/lite/lite_util.py +2 -2
  184. datahub/sdk/search_filters.py +68 -40
  185. datahub/secret/datahub_secret_store.py +7 -4
  186. datahub/secret/file_secret_store.py +1 -1
  187. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  188. datahub/testing/check_sql_parser_result.py +2 -2
  189. datahub/utilities/ingest_utils.py +1 -1
  190. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
  191. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
  192. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
  193. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/unity/proxy.py

@@ -3,6 +3,7 @@ Manage the communication with DataBricks Server and provide equivalent dataclasses
 """
 
 import dataclasses
+import json
 import logging
 import os
 from concurrent.futures import ThreadPoolExecutor
@@ -11,6 +12,7 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast
 from unittest.mock import patch
 
 import cachetools
+import yaml
 from cachetools import cached
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.catalog import (
@@ -23,7 +25,11 @@ from databricks.sdk.service.catalog import (
     SchemaInfo,
     TableInfo,
 )
+from databricks.sdk.service.files import DownloadResponse, FilesAPI
 from databricks.sdk.service.iam import ServicePrincipal as DatabricksServicePrincipal
+from databricks.sdk.service.ml import (
+    ExperimentsAPI,
+)
 from databricks.sdk.service.sql import (
     QueryFilter,
     QueryInfo,
@@ -54,6 +60,8 @@ from datahub.ingestion.source.unity.proxy_types import (
     ExternalTableReference,
     Metastore,
     Model,
+    ModelRunDetails,
+    ModelSignature,
     ModelVersion,
     Notebook,
     NotebookReference,
@@ -155,6 +163,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     _workspace_url: str
     report: UnityCatalogReport
     warehouse_id: str
+    _experiments_api: ExperimentsAPI
+    _files_api: FilesAPI
 
     def __init__(
         self,
@@ -179,6 +189,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.lineage_data_source = lineage_data_source
         self.usage_data_source = usage_data_source
         self.databricks_api_page_size = databricks_api_page_size
+        self._workspace_url = workspace_url
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(
                 "https://", ""
@@ -187,6 +198,185 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             "access_token": self._workspace_client.config.token,
             "user_agent_entry": "datahub",
         }
+        # Initialize MLflow APIs
+        self._experiments_api = ExperimentsAPI(self._workspace_client.api_client)
+        self._files_api = FilesAPI(self._workspace_client.api_client)
+
+    def get_run_details(self, run_id: str) -> Optional[ModelRunDetails]:
+        """
+        Get comprehensive details from an MLflow run.
+
+        Args:
+            run_id: The MLflow run ID
+
+        Returns:
+            ModelRunDetails object with comprehensive run information
+        """
+        try:
+            run_response = self._experiments_api.get_run(run_id)
+            run = run_response.run
+
+            if (
+                not run
+                or not run.info
+                or not run.info.run_id
+                or not run.info.experiment_id
+            ):
+                return None
+
+            # Extract metrics
+            metrics: Dict[str, Any] = {}
+            if run.data and run.data.metrics:
+                for metric in run.data.metrics:
+                    if metric.key is not None:
+                        metrics[metric.key] = metric.value
+
+            # Extract parameters
+            parameters: Dict[str, Any] = {}
+            if run.data and run.data.params:
+                for param in run.data.params:
+                    if param.key is not None:
+                        parameters[param.key] = param.value
+
+            # Extract tags
+            tags: Dict[str, str] = {}
+            if run.data and run.data.tags:
+                for tag in run.data.tags:
+                    if tag.key is not None and tag.value is not None:
+                        tags[tag.key] = tag.value
+
+            return ModelRunDetails(
+                run_id=run.info.run_id,
+                experiment_id=run.info.experiment_id,
+                status=run.info.status.value if run.info.status else None,
+                start_time=parse_ts_millis(run.info.start_time),
+                end_time=parse_ts_millis(run.info.end_time),
+                user_id=run.info.user_id,
+                metrics=metrics,
+                parameters=parameters,
+                tags=tags,
+            )
+        except Exception as e:
+            logger.warning(
+                f"Unable to get run details for MLflow experiment, run-id: {run_id}",
+                exc_info=True,
+            )
+            self.report.report_warning(
+                title="Unable to get run details for MLflow experiment",
+                message="Error while getting run details for MLflow experiment",
+                context=f"run-id: {run_id}",
+                exc=e,
+            )
+            return None
+
+    def _extract_signature_from_files_api(
+        self, model_version: ModelVersionInfo
+    ) -> Optional[ModelSignature]:
+        """
+        Extract signature from MLmodel file using Databricks FilesAPI.
+        Uses the API endpoint: /api/2.0/fs/files/Models/{catalog}/{schema}/{model}/{version}/MLmodel
+
+        Args:
+            model_version: Unity Catalog ModelVersionInfo object with catalog_name, schema_name, model_name, version
+
+        Returns:
+            ModelSignature if found, None otherwise
+        """
+        try:
+            # Construct file path for FilesAPI
+            # The correct path format is: /Models/{catalog}/{schema}/{model}/{version}/MLmodel
+            file_path = (
+                f"/Models/{model_version.catalog_name}/{model_version.schema_name}/"
+                f"{model_version.model_name}/{model_version.version}/MLmodel"
+            )
+
+            logger.debug(f"Downloading MLmodel from FilesAPI: {file_path}")
+
+            # Download the file using FilesAPI
+            download_response: DownloadResponse = self._files_api.download(
+                file_path=file_path
+            )
+
+            # Read the file content
+            # DownloadResponse.contents is a BinaryIO object
+            if download_response and download_response.contents:
+                content_stream = download_response.contents
+
+                # Read from the binary stream
+                if content_stream:
+                    mlmodel_content: str = content_stream.read().decode("utf-8")
+
+                    logger.debug(
+                        f"MLmodel file contents from FilesAPI ({file_path}):\n{mlmodel_content}"
+                    )
+
+                    # Parse YAML content
+                    mlmodel_data = yaml.safe_load(mlmodel_content)
+
+                    # Extract signature from MLmodel YAML
+                    if mlmodel_data and "signature" in mlmodel_data:
+                        signature_raw = mlmodel_data["signature"]
+
+                        # Signature inputs and outputs are stored as JSON strings in the YAML
+                        # Parse them into proper dict/list format
+                        signature_data = {}
+                        if "inputs" in signature_raw:
+                            try:
+                                signature_data["inputs"] = json.loads(
+                                    signature_raw["inputs"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse inputs JSON: {e}")
+
+                        if "outputs" in signature_raw:
+                            try:
+                                signature_data["outputs"] = json.loads(
+                                    signature_raw["outputs"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse outputs JSON: {e}")
+
+                        if "parameters" in signature_raw:
+                            try:
+                                signature_data["parameters"] = json.loads(
+                                    signature_raw["parameters"]
+                                )
+                            except (json.JSONDecodeError, TypeError) as e:
+                                logger.debug(f"Failed to parse parameters JSON: {e}")
+
+                        return ModelSignature(
+                            inputs=signature_data["inputs"]
+                            if "inputs" in signature_raw
+                            else None,
+                            outputs=signature_data["outputs"]
+                            if "outputs" in signature_raw
+                            else None,
+                            parameters=signature_data["parameters"]
+                            if "parameters" in signature_raw
+                            else None,
+                        )
+                    else:
+                        logger.debug(
+                            f"No signature found in MLmodel data from {file_path}"
+                        )
+                        return None
+
+            return None
+
+        except Exception as e:
+            model_name = getattr(model_version, "model_name", "unknown")
+            version_num = getattr(model_version, "version", "unknown")
+            self.report.report_warning(
+                title="Unable to extract signature from MLmodel file",
+                message="Error while extracting signature from MLmodel file",
+                context=f"model-name: {model_name}, model-version: {version_num}",
+                exc=e,
+            )
+            logger.warning(
+                f"Unable to extract signature from MLmodel file, model-name: {model_name}, model-version: {version_num}",
+                exc_info=True,
+            )
+            return None
 
     def check_basic_connectivity(self) -> bool:
         return bool(
@@ -1019,6 +1209,17 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
            for alias in obj.aliases:
                if alias.alias_name:
                    aliases.append(alias.alias_name)
+
+        run_details: Optional[ModelRunDetails] = None
+        # Fetch run details if run_id exists
+        if obj.run_id:
+            run_details = self.get_run_details(obj.run_id)
+
+        # Extract signature separately from Files API
+        signature: Optional[ModelSignature] = self._extract_signature_from_files_api(
+            obj
+        )
+
         return ModelVersion(
             id=f"{model.id}_{obj.version}",
             name=f"{model.name}_{obj.version}",
@@ -1029,6 +1230,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             created_at=parse_ts_millis(obj.created_at),
             updated_at=parse_ts_millis(obj.updated_at),
             created_by=obj.created_by,
+            run_details=run_details,
+            signature=signature,
         )
 
     def _create_service_principal(
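
The new _extract_signature_from_files_api helper above relies on the fact that an MLmodel file is YAML on the outside but stores the signature's inputs/outputs/parameters as JSON-encoded strings (as its inline comments note). A minimal, self-contained sketch of that two-stage parse, using an invented signature payload rather than a real Files API download:

    import json

    import yaml

    # Hypothetical MLmodel snippet; real files are written by MLflow when a model is logged.
    mlmodel_text = (
        "signature:\n"
        "  inputs: '[{\"name\": \"age\", \"type\": \"long\"}, {\"name\": \"income\", \"type\": \"double\"}]'\n"
        "  outputs: '[{\"type\": \"double\"}]'\n"
    )

    mlmodel_data = yaml.safe_load(mlmodel_text)    # outer document is YAML
    signature_raw = mlmodel_data["signature"]
    inputs = json.loads(signature_raw["inputs"])   # inner fields are JSON strings
    outputs = json.loads(signature_raw["outputs"])
    print(inputs[0]["name"], outputs[0]["type"])   # -> age double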

datahub/ingestion/source/unity/proxy_types.py

@@ -339,8 +339,75 @@ class Notebook:
         )
 
 
+@dataclass
+class ModelSignature:
+    """
+    Represents the model signature with input and output schemas extracted from MLflow.
+
+    In Unity Catalog, model signatures define the expected input/output formats for ML models.
+    Model signature is stored in the MLmodel YAML file.
+
+    Attributes:
+        inputs: List of input schema specifications, each containing name, type, dtype, shape
+        outputs: List of output schema specifications, each containing name, type, dtype, shape
+        parameters: List of model parameters
+    """
+
+    inputs: Optional[List[Dict[str, str]]]
+    outputs: Optional[List[Dict[str, str]]]
+    parameters: Optional[List[Dict[str, str]]]
+
+
+@dataclass
+class ModelRunDetails:
+    """
+    Represents comprehensive details from an MLflow run associated with a Unity Catalog model version.
+
+    In Unity Catalog, each model version is linked to an MLflow run via run_id. This dataclass
+    contains all the metadata extracted from that MLflow run, including metrics, parameters,
+    and tags.
+
+    Attributes:
+        run_id: MLflow run ID
+        experiment_id: MLflow experiment ID
+        status: Run status (e.g., "FINISHED", "RUNNING")
+        start_time: Run start timestamp (milliseconds since epoch)
+        end_time: Run end timestamp (milliseconds since epoch)
+        user_id: User who initiated the run
+        metrics: Training metrics (e.g., accuracy, loss)
+        parameters: Hyperparameters used for training
+        tags: Run tags/metadata
+    """
+
+    run_id: str
+    experiment_id: str
+    status: Optional[str]
+    start_time: Optional[datetime]
+    end_time: Optional[datetime]
+    user_id: Optional[str]
+    metrics: Optional[Dict[str, str]]
+    parameters: Optional[Dict[str, str]]
+    tags: Optional[Dict[str, str]]
+
+
 @dataclass
 class Model:
+    """
+    Represents a Unity Catalog registered ML model (model group).
+
+    In Unity Catalog, a registered model is a collection of model versions.
+    This dataclass corresponds to a Unity Catalog RegisteredModelInfo.
+
+    Attributes:
+        id: Full qualified name (e.g., "catalog.schema.model_name")
+        name: Model name without catalog/schema prefix
+        schema_name: Schema name containing the model
+        catalog_name: Catalog name containing the model
+        description: Model description/comment
+        created_at: Model creation timestamp
+        updated_at: Last update timestamp
+    """
+
     id: str
     name: str
     schema_name: str
@@ -352,6 +419,28 @@ class Model:
 
 @dataclass
 class ModelVersion:
+    """
+    Represents a specific version of a Unity Catalog registered ML model.
+
+    In Unity Catalog, each model version is linked to an MLflow run (via run_id).
+    This dataclass corresponds to a Unity Catalog ModelVersionInfo.
+
+    Attributes:
+        id: Unique identifier combining model ID and version (e.g., "catalog.schema.model_1")
+        name: Versioned model name
+        model: Reference to the parent Model (model group)
+        version: Version number as string
+        aliases: List of aliases (e.g., ["prod", "latest"])
+        description: Version description/comment
+        created_at: Version creation timestamp
+        updated_at: Last update timestamp
+        created_by: User who created this version
+        run_details: Comprehensive MLflow run details (metrics, parameters, tags)
+            extracted from the MLflow run linked to this model version.
+        signature: Model signature extracted from the MLmodel file via Files API.
+            Contains input/output schema specifications and parameters.
+    """
+
     id: str
     name: str
     model: Model
@@ -361,3 +450,5 @@ class ModelVersion:
     created_at: Optional[datetime]
     updated_at: Optional[datetime]
     created_by: Optional[str]
+    run_details: Optional["ModelRunDetails"]
+    signature: Optional["ModelSignature"]
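
For orientation, a rough sketch of how one of the new dataclasses gets populated; every value below is invented, and the import assumes this 1.3.1.1rc1 build is installed:

    from datetime import datetime, timezone

    from datahub.ingestion.source.unity.proxy_types import ModelRunDetails

    # Invented example values; in the connector these come from the MLflow run
    # linked to a Unity Catalog model version.
    run_details = ModelRunDetails(
        run_id="a1b2c3d4",
        experiment_id="42",
        status="FINISHED",
        start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
        end_time=datetime(2024, 1, 1, 1, tzinfo=timezone.utc),
        user_id="someone@example.com",
        metrics={"accuracy": "0.93"},
        parameters={"max_depth": "6"},
        tags={"mlflow.source.type": "NOTEBOOK"},
    )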

datahub/ingestion/source/unity/source.py

@@ -1,7 +1,9 @@
+import dataclasses
+import json
 import logging
 import re
 import time
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union, cast
 from urllib.parse import urljoin
 
 from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
@@ -317,7 +319,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = UnityCatalogSourceConfig.parse_obj(config_dict)
+        config = UnityCatalogSourceConfig.model_validate(config_dict)
         return cls(ctx=ctx, config=config)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
@@ -741,6 +743,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                     created=TimeStampClass(time=created_time, actor=created_actor),
                 )
             )
+        custom_properties = {}
+        if ml_model_version.signature:
+            for key, value in dataclasses.asdict(ml_model_version.signature).items():
+                if value:
+                    custom_properties[f"signature.{key}"] = json.dumps(value)
+
+        if ml_model_version.run_details:
+            if ml_model_version.run_details.tags:
+                for key, value in ml_model_version.run_details.tags.items():
+                    if value:
+                        custom_properties[key] = json.dumps(value)
 
         ml_model = MLModel(
             id=ml_model_version.id,
@@ -751,6 +764,18 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             model_group=ml_model_urn,
             platform=self.platform,
             last_modified=ml_model_version.updated_at,
+            training_metrics=cast(
+                Optional[Dict[str, Optional[str]]], ml_model_version.run_details.metrics
+            )
+            if ml_model_version.run_details and ml_model_version.run_details.metrics
+            else None,
+            hyper_params=cast(
+                Optional[Dict[str, Optional[str]]],
+                ml_model_version.run_details.parameters,
+            )
+            if ml_model_version.run_details and ml_model_version.run_details.parameters
+            else None,
+            custom_properties=custom_properties if custom_properties else None,
             extra_aspects=extra_aspects,
         )
 

datahub/ingestion/source/usage/clickhouse_usage.py

@@ -115,7 +115,7 @@ class ClickHouseUsageSource(Source):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = ClickHouseUsageConfig.parse_obj(config_dict)
+        config = ClickHouseUsageConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:

datahub/ingestion/source/usage/starburst_trino_usage.py

@@ -133,7 +133,7 @@ class TrinoUsageSource(Source):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = TrinoUsageConfig.parse_obj(config_dict)
+        config = TrinoUsageConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
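
The parse_obj to model_validate rename in these two sources (and in the transformer files further down) is the pydantic v2 spelling of the same operation. A standalone sketch with an invented stand-in model rather than a DataHub config class:

    from pydantic import BaseModel

    class DemoUsageConfig(BaseModel):  # invented stand-in, not a DataHub class
        top_n_queries: int = 10

    config_dict = {"top_n_queries": 25}

    config = DemoUsageConfig.model_validate(config_dict)  # pydantic v2 API
    # DemoUsageConfig.parse_obj(config_dict)              # v1 name, deprecated in v2
    print(config.top_n_queries)  # 25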

datahub/ingestion/source/usage/usage_common.py

@@ -15,6 +15,7 @@ from typing import (
 )
 
 import pydantic
+from pydantic import ValidationInfo, field_validator
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
@@ -226,10 +227,11 @@ class BaseUsageConfig(BaseTimeWindowConfig):
         default=True, description="Whether to ingest the top_n_queries."
     )
 
-    @pydantic.validator("top_n_queries")
-    def ensure_top_n_queries_is_not_too_big(cls, v: int, values: dict) -> int:
+    @field_validator("top_n_queries", mode="after")
+    @classmethod
+    def ensure_top_n_queries_is_not_too_big(cls, v: int, info: ValidationInfo) -> int:
         minimum_query_size = 20
-
+        values = info.data
         max_queries = int(values["queries_character_limit"] / minimum_query_size)
         if v > max_queries:
             raise ValueError(
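
The decorator change above repeats across most of the config modules in this release: @pydantic.validator with a values dict becomes @field_validator(..., mode="after") plus @classmethod, with previously-validated fields read from ValidationInfo.data. A self-contained sketch of the pattern on an invented model that loosely mirrors the BaseUsageConfig change:

    from pydantic import BaseModel, ValidationInfo, field_validator

    class DemoConfig(BaseModel):  # invented stand-in, not DataHub's BaseUsageConfig
        queries_character_limit: int = 24_000
        top_n_queries: int = 10

        @field_validator("top_n_queries", mode="after")
        @classmethod
        def ensure_top_n_queries_is_not_too_big(cls, v: int, info: ValidationInfo) -> int:
            minimum_query_size = 20
            values = info.data  # fields declared (and validated) before this one
            max_queries = int(values["queries_character_limit"] / minimum_query_size)
            if v > max_queries:
                raise ValueError(f"top_n_queries is too big: {v} > {max_queries}")
            return v

    DemoConfig(top_n_queries=100)     # fine: 100 <= 24000 / 20
    # DemoConfig(top_n_queries=5000)  # would raise a ValidationError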

datahub/ingestion/source_config/csv_enricher.py

@@ -1,6 +1,5 @@
-from typing import Any, Dict
-
 import pydantic
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 
@@ -21,7 +20,8 @@ class CSVEnricherConfig(ConfigModel):
         description="Delimiter to use when parsing array fields (tags, terms and owners)",
     )
 
-    @pydantic.validator("write_semantics")
+    @field_validator("write_semantics", mode="after")
+    @classmethod
     def validate_write_semantics(cls, write_semantics: str) -> str:
         if write_semantics.lower() not in {"patch", "override"}:
             raise ValueError(
@@ -31,9 +31,10 @@ class CSVEnricherConfig(ConfigModel):
             )
         return write_semantics
 
-    @pydantic.validator("array_delimiter")
-    def validator_diff(cls, array_delimiter: str, values: Dict[str, Any]) -> str:
-        if array_delimiter == values["delimiter"]:
+    @field_validator("array_delimiter", mode="after")
+    @classmethod
+    def validator_diff(cls, array_delimiter: str, info: pydantic.ValidationInfo) -> str:
+        if array_delimiter == info.data["delimiter"]:
             raise ValueError(
                 "array_delimiter and delimiter are the same. Please choose different delimiters."
             )

datahub/ingestion/source_config/operation_config.py

@@ -3,7 +3,7 @@ import logging
 from typing import Any, Dict, Optional
 
 import cachetools
-import pydantic
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import ConfigModel
@@ -26,7 +26,8 @@ class OperationConfig(ConfigModel):
         description="Number between 1 to 31 for date of month (both inclusive). If not specified, defaults to Nothing and this field does not take affect.",
     )
 
-    @pydantic.root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def lower_freq_configs_are_set(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         lower_freq_profile_enabled = values.get("lower_freq_profile_enabled")
         profile_day_of_week = values.get("profile_day_of_week")
@@ -41,7 +42,8 @@ class OperationConfig(ConfigModel):
             )
         return values
 
-    @pydantic.validator("profile_day_of_week")
+    @field_validator("profile_day_of_week", mode="after")
+    @classmethod
     def validate_profile_day_of_week(cls, v: Optional[int]) -> Optional[int]:
         profile_day_of_week = v
         if profile_day_of_week is None:
@@ -52,7 +54,8 @@ class OperationConfig(ConfigModel):
             )
         return profile_day_of_week
 
-    @pydantic.validator("profile_date_of_month")
+    @field_validator("profile_date_of_month", mode="after")
+    @classmethod
     def validate_profile_date_of_month(cls, v: Optional[int]) -> Optional[int]:
         profile_date_of_month = v
         if profile_date_of_month is None:
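
root_validator(pre=True) maps onto model_validator(mode="before") with @classmethod, which still receives the raw input mapping before individual fields are parsed. A small self-contained sketch of that pattern (fields invented, loosely following the OperationConfig change above):

    from typing import Any, Dict, Optional

    from pydantic import BaseModel, model_validator

    class DemoOperationConfig(BaseModel):  # invented stand-in, not DataHub's OperationConfig
        lower_freq_profile_enabled: bool = False
        profile_day_of_week: Optional[int] = None

        @model_validator(mode="before")
        @classmethod
        def lower_freq_configs_are_set(cls, values: Dict[str, Any]) -> Dict[str, Any]:
            # Runs on the raw input dict, before field-level validation.
            if values.get("lower_freq_profile_enabled") and values.get("profile_day_of_week") is None:
                raise ValueError(
                    "profile_day_of_week must be set when lower_freq_profile_enabled is true"
                )
            return values

    DemoOperationConfig(lower_freq_profile_enabled=True, profile_day_of_week=2)  # fine
    # DemoOperationConfig(lower_freq_profile_enabled=True)  # would raise a ValidationError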

datahub/ingestion/source_config/pulsar.py

@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
 from urllib.parse import urlparse
 
 import pydantic
-from pydantic import Field, validator
+from pydantic import Field, model_validator
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
@@ -100,27 +100,23 @@ class PulsarSourceConfig(
         default_factory=dict, description="Placeholder for OpenId discovery document"
     )
 
-    @validator("token")
-    def ensure_only_issuer_or_token(
-        cls, token: Optional[str], values: Dict[str, Optional[str]]
-    ) -> Optional[str]:
-        if token is not None and values.get("issuer_url") is not None:
+    @model_validator(mode="after")
+    def ensure_only_issuer_or_token(self) -> "PulsarSourceConfig":
+        if self.token is not None and self.issuer_url is not None:
             raise ValueError(
                 "Expected only one authentication method, either issuer_url or token."
             )
-        return token
-
-    @validator("client_secret", always=True)
-    def ensure_client_id_and_secret_for_issuer_url(
-        cls, client_secret: Optional[str], values: Dict[str, Optional[str]]
-    ) -> Optional[str]:
-        if values.get("issuer_url") is not None and (
-            client_secret is None or values.get("client_id") is None
+        return self
+
+    @model_validator(mode="after")
+    def ensure_client_id_and_secret_for_issuer_url(self) -> "PulsarSourceConfig":
+        if self.issuer_url is not None and (
+            self.client_secret is None or self.client_id is None
         ):
             raise ValueError(
                 "Missing configuration: client_id and client_secret are mandatory when issuer_url is set."
             )
-        return client_secret
+        return self
 
     @pydantic.field_validator("web_service_url", mode="after")
     @classmethod
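
Cross-field checks like the Pulsar ones no longer need a values dict at all: a model_validator(mode="after") runs on the constructed model and reads fields straight off self. A minimal sketch (class and fields invented, following the shape of the PulsarSourceConfig change):

    from typing import Optional

    from pydantic import BaseModel, model_validator

    class DemoAuthConfig(BaseModel):  # invented stand-in, not DataHub's PulsarSourceConfig
        token: Optional[str] = None
        issuer_url: Optional[str] = None

        @model_validator(mode="after")
        def ensure_only_issuer_or_token(self) -> "DemoAuthConfig":
            # Runs after field validation; every field is available on self.
            if self.token is not None and self.issuer_url is not None:
                raise ValueError(
                    "Expected only one authentication method, either issuer_url or token."
                )
            return self

    DemoAuthConfig(token="abc")  # fine
    # DemoAuthConfig(token="abc", issuer_url="https://idp.example")  # would raise a ValidationError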

datahub/ingestion/transformer/add_dataset_browse_path.py

@@ -32,7 +32,7 @@ class AddDatasetBrowsePathTransformer(DatasetBrowsePathsTransformer):
     def create(
         cls, config_dict: dict, ctx: PipelineContext
     ) -> "AddDatasetBrowsePathTransformer":
-        config = AddDatasetBrowsePathConfig.parse_obj(config_dict)
+        config = AddDatasetBrowsePathConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod

datahub/ingestion/transformer/add_dataset_dataproduct.py

@@ -1,7 +1,7 @@
 import logging
 from typing import Callable, Dict, List, Optional, Union
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.configuration.common import ConfigModel, KeyValuePattern
 from datahub.configuration.import_resolver import pydantic_resolve_key
@@ -39,7 +39,7 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetDataProduct":
-        config = AddDatasetDataProductConfig.parse_obj(config_dict)
+        config = AddDatasetDataProductConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def transform_aspect(
@@ -116,7 +116,7 @@ class SimpleAddDatasetDataProduct(AddDatasetDataProduct):
     def create(
         cls, config_dict: dict, ctx: PipelineContext
     ) -> "SimpleAddDatasetDataProduct":
-        config = SimpleDatasetDataProductConfig.parse_obj(config_dict)
+        config = SimpleDatasetDataProductConfig.model_validate(config_dict)
         return cls(config, ctx)
 
 
@@ -124,7 +124,8 @@ class PatternDatasetDataProductConfig(ConfigModel):
     dataset_to_data_product_urns_pattern: KeyValuePattern = KeyValuePattern.all()
     is_container: bool = False
 
-    @pydantic.root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def validate_pattern_value(cls, values: Dict) -> Dict:
         rules = values["dataset_to_data_product_urns_pattern"]["rules"]
         for key, value in rules.items():
@@ -156,5 +157,5 @@ class PatternAddDatasetDataProduct(AddDatasetDataProduct):
     def create(
         cls, config_dict: dict, ctx: PipelineContext
     ) -> "PatternAddDatasetDataProduct":
-        config = PatternDatasetDataProductConfig.parse_obj(config_dict)
+        config = PatternDatasetDataProductConfig.model_validate(config_dict)
         return cls(config, ctx)

datahub/ingestion/transformer/add_dataset_ownership.py

@@ -55,7 +55,7 @@ class AddDatasetOwnership(OwnershipTransformer):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetOwnership":
-        config = AddDatasetOwnershipConfig.parse_obj(config_dict)
+        config = AddDatasetOwnershipConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod
@@ -209,7 +209,7 @@ class SimpleAddDatasetOwnership(AddDatasetOwnership):
     def create(
         cls, config_dict: dict, ctx: PipelineContext
     ) -> "SimpleAddDatasetOwnership":
-        config = SimpleDatasetOwnershipConfig.parse_obj(config_dict)
+        config = SimpleDatasetOwnershipConfig.model_validate(config_dict)
         return cls(config, ctx)
 
 
@@ -247,5 +247,5 @@ class PatternAddDatasetOwnership(AddDatasetOwnership):
     def create(
         cls, config_dict: dict, ctx: PipelineContext
     ) -> "PatternAddDatasetOwnership":
-        config = PatternDatasetOwnershipConfig.parse_obj(config_dict)
+        config = PatternDatasetOwnershipConfig.model_validate(config_dict)
         return cls(config, ctx)