truefoundry 0.5.0rc5__py3-none-any.whl → 0.5.0rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of truefoundry might be problematic. Click here for more details.

Files changed (59) hide show
  1. truefoundry/ml/__init__.py +2 -1
  2. truefoundry/ml/artifact/truefoundry_artifact_repo.py +8 -3
  3. truefoundry/ml/autogen/client/__init__.py +6 -4
  4. truefoundry/ml/autogen/client/api/deprecated_api.py +340 -7
  5. truefoundry/ml/autogen/client/api/mlfoundry_artifacts_api.py +0 -322
  6. truefoundry/ml/autogen/client/api_client.py +8 -1
  7. truefoundry/ml/autogen/client/models/__init__.py +6 -4
  8. truefoundry/ml/autogen/client/models/add_features_to_model_version_request_dto.py +3 -17
  9. truefoundry/ml/autogen/client/models/agent.py +1 -1
  10. truefoundry/ml/autogen/client/models/agent_app.py +1 -1
  11. truefoundry/ml/autogen/client/models/agent_open_api_tool.py +1 -1
  12. truefoundry/ml/autogen/client/models/agent_open_api_tool_with_fqn.py +1 -1
  13. truefoundry/ml/autogen/client/models/agent_with_fqn.py +1 -1
  14. truefoundry/ml/autogen/client/models/artifact_version_manifest.py +1 -1
  15. truefoundry/ml/autogen/client/models/assistant_message.py +1 -1
  16. truefoundry/ml/autogen/client/models/blob_storage_reference.py +1 -1
  17. truefoundry/ml/autogen/client/models/chat_prompt.py +1 -1
  18. truefoundry/ml/autogen/client/models/external_artifact_source.py +1 -1
  19. truefoundry/ml/autogen/client/models/fast_ai_framework.py +1 -1
  20. truefoundry/ml/autogen/client/models/gluon_framework.py +1 -1
  21. truefoundry/ml/autogen/client/models/h2_o_framework.py +1 -1
  22. truefoundry/ml/autogen/client/models/image_content_part.py +1 -1
  23. truefoundry/ml/autogen/client/models/keras_framework.py +1 -1
  24. truefoundry/ml/autogen/client/models/light_gbm_framework.py +1 -1
  25. truefoundry/ml/autogen/client/models/model_version_dto.py +7 -8
  26. truefoundry/ml/autogen/client/models/model_version_environment.py +97 -0
  27. truefoundry/ml/autogen/client/models/model_version_manifest.py +12 -1
  28. truefoundry/ml/autogen/client/models/onnx_framework.py +1 -1
  29. truefoundry/ml/autogen/client/models/paddle_framework.py +1 -1
  30. truefoundry/ml/autogen/client/models/py_torch_framework.py +1 -1
  31. truefoundry/ml/autogen/client/models/{feature_value_type.py → serialization_format.py} +8 -8
  32. truefoundry/ml/autogen/client/models/sklearn_framework.py +16 -3
  33. truefoundry/ml/autogen/client/models/spa_cy_framework.py +1 -1
  34. truefoundry/ml/autogen/client/models/stats_models_framework.py +1 -1
  35. truefoundry/ml/autogen/client/models/system_message.py +1 -1
  36. truefoundry/ml/autogen/client/models/tensor_flow_framework.py +1 -1
  37. truefoundry/ml/autogen/client/models/text_content_part.py +1 -1
  38. truefoundry/ml/autogen/client/models/transformers_framework.py +1 -1
  39. truefoundry/ml/autogen/client/models/true_foundry_artifact_source.py +1 -1
  40. truefoundry/ml/autogen/client/models/update_model_version_request_dto.py +1 -13
  41. truefoundry/ml/autogen/client/models/user_message.py +1 -1
  42. truefoundry/ml/autogen/client/models/xg_boost_framework.py +16 -3
  43. truefoundry/ml/autogen/client_README.md +6 -8
  44. truefoundry/ml/autogen/entities/artifacts.py +38 -1
  45. truefoundry/ml/autogen/models/__init__.py +4 -0
  46. truefoundry/ml/autogen/models/exceptions.py +30 -0
  47. truefoundry/ml/autogen/models/schema.py +1547 -0
  48. truefoundry/ml/autogen/models/signature.py +139 -0
  49. truefoundry/ml/autogen/models/utils.py +699 -0
  50. truefoundry/ml/log_types/artifacts/model.py +3 -0
  51. truefoundry/ml/mlfoundry_api.py +3 -1
  52. truefoundry/ml/mlfoundry_run.py +3 -1
  53. {truefoundry-0.5.0rc5.dist-info → truefoundry-0.5.0rc7.dist-info}/METADATA +2 -2
  54. {truefoundry-0.5.0rc5.dist-info → truefoundry-0.5.0rc7.dist-info}/RECORD +56 -53
  55. truefoundry/ml/autogen/client/models/feature_dto.py +0 -68
  56. truefoundry/ml/autogen/client/models/model_schema_dto.py +0 -85
  57. truefoundry/ml/autogen/client/models/prediction_type.py +0 -34
  58. {truefoundry-0.5.0rc5.dist-info → truefoundry-0.5.0rc7.dist-info}/WHEEL +0 -0
  59. {truefoundry-0.5.0rc5.dist-info → truefoundry-0.5.0rc7.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,699 @@
1
+ import logging
2
+ import warnings
3
+ from collections import defaultdict
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from .exceptions import MlflowException
10
+ from .schema import (
11
+ Array,
12
+ ColSpec,
13
+ DataType,
14
+ Map,
15
+ Object,
16
+ ParamSchema,
17
+ ParamSpec,
18
+ Property,
19
+ Schema,
20
+ TensorSpec,
21
+ )
22
+
23
+ _logger = logging.getLogger(__name__)
24
+
25
+
26
class TensorsNotSupportedException(MlflowException):
    """Error raised when multidimensional (tensor) input is not supported.

    The caller-supplied ``message`` is appended to a fixed explanatory prefix
    before being handed to :class:`MlflowException`.
    """

    def __init__(self, message, status_code: Optional[int] = None):
        prefixed_message = (
            f"Multidimensional arrays (aka tensors) are not supported. {message}"
        )
        super().__init__(prefixed_message, status_code=status_code)
32
+
33
+
34
+ def _get_tensor_shape(data, variable_dimension: Optional[int] = 0) -> tuple:
35
+ """Infer the shape of the inputted data.
36
+
37
+ This method creates the shape of the tensor to store in the TensorSpec. The variable dimension
38
+ is assumed to be the first dimension by default. This assumption can be overridden by inputting
39
+ a different variable dimension or `None` to represent that the input tensor does not contain a
40
+ variable dimension.
41
+
42
+ Args:
43
+ data: Dataset to infer from.
44
+ variable_dimension: An optional integer representing a variable dimension.
45
+
46
+ Returns:
47
+ tuple: Shape of the inputted data (including a variable dimension)
48
+ """
49
+ from scipy.sparse import csc_matrix, csr_matrix
50
+
51
+ if not isinstance(data, (np.ndarray, csr_matrix, csc_matrix)):
52
+ raise TypeError(
53
+ f"Expected numpy.ndarray or csc/csr matrix, got '{type(data)}'."
54
+ )
55
+ variable_input_data_shape = data.shape
56
+ if variable_dimension is not None:
57
+ try:
58
+ variable_input_data_shape = list(variable_input_data_shape)
59
+ variable_input_data_shape[variable_dimension] = -1
60
+ except IndexError:
61
+ raise MlflowException(
62
+ f"The specified variable_dimension {variable_dimension} is out of bounds with "
63
+ f"respect to the number of dimensions {data.ndim} in the input dataset"
64
+ ) from None
65
+ return tuple(variable_input_data_shape)
66
+
67
+
68
def clean_tensor_type(dtype: np.dtype):
    """Drop the size information carried by flexible numpy dtypes.

    Unicode (``char == "U"``) and bytes (``char == "S"``) dtypes are replaced by
    the unsized ``np.dtype("str")`` / ``np.dtype("bytes")``. Every other dtype is
    returned unchanged.

    Args:
        dtype: Numpy dtype of a tensor.

    Returns:
        The cleaned numpy dtype.

    Raises:
        TypeError: If ``dtype`` is not a ``numpy.dtype`` instance.
    """
    if not isinstance(dtype, np.dtype):
        raise TypeError(
            f"Expected `type` to be instance of `{np.dtype}`, received `{dtype.__class__}`"
        )

    # Map the type character of each flexible dtype to its unsized spelling.
    unsized_name = {"U": "str", "S": "bytes"}.get(dtype.char)
    if unsized_name is None:
        return dtype
    return np.dtype(unsized_name)
90
+
91
+
92
def _infer_colspec_type(data: Any) -> Union[DataType, Array, Object]:
    """Infer the type to use for a ``ColSpec`` from ``data``.

    Args:
        data: Data to infer from.

    Returns:
        Whatever ``_infer_datatype`` infers for ``data``.

    Raises:
        MlflowException: If the type cannot be determined (inference
            returned ``None``).
    """
    inferred = _infer_datatype(data)
    if inferred is not None:
        return inferred

    # Currently the only input that yields None here is a nested list whose items
    # are all empty, e.g. [[], []]; a flat empty list [] has special handling
    # logic in infer_schema.
    raise MlflowException(
        "A column of nested array type must include at least one non-empty array."
    )
112
+
113
+
114
def _infer_datatype(data: Any) -> Union[DataType, Array, Object, Map]:
    """Infer a schema type for an arbitrary value.

    Lists and numpy arrays are delegated to ``_infer_array_datatype`` (which may
    return ``None`` when the element type is undetermined), dictionaries become
    an ``Object`` with one ``Property`` per key, and everything else is treated
    as a scalar.

    Raises:
        MlflowException: If a dictionary value has an undetermined type.
    """
    if isinstance(data, (list, np.ndarray)):
        return _infer_array_datatype(data)

    if not isinstance(data, dict):
        return _infer_scalar_datatype(data)

    fields = []
    for key, value in data.items():
        value_type = _infer_datatype(value)
        if value_type is None:
            raise MlflowException("Dictionary value must not be an empty list.")
        fields.append(Property(name=key, dtype=value_type))
    return Object(properties=fields)
128
+
129
+
130
def _infer_array_datatype(data: Union[list, np.ndarray]) -> Optional[Array]:
    """Infer an ``Array`` type from the items of a list or numpy array.

    The list is assumed to be homogeneous. ``None`` / NaN items and items whose
    own type is undetermined are skipped; if nothing remains, ``None`` is
    returned to signal "undetermined".

    E.g.
        ["a", "b"] => Array(string)
        ["a", None] => Array(string)
        [["a", "b"], []] => Array(Array(string))
        [] => None

    Args:
        data: Data to infer from.

    Returns:
        Array(dtype) or None if undetermined.

    Raises:
        MlflowException: If the items do not share a compatible type.
    """
    inferred = None
    for element in data:
        # None is accepted in lists for backward compatibility,
        # but it carries no information for type inference.
        if _is_none_or_nan(element):
            continue

        element_type = _infer_datatype(element)
        if element_type is None:
            # Undetermined element (e.g. an empty nested list): ignore it.
            continue

        if inferred is None:
            inferred = Array(element_type)
            continue

        current = inferred.dtype
        if isinstance(current, (Array, Object, Map)):
            # Composite types are reconciled through their own merge logic.
            try:
                inferred = Array(current._merge(element_type))
            except MlflowException as err:
                raise MlflowException.invalid_parameter_value(
                    "Expected all values in list to be of same type"
                ) from err
        elif isinstance(current, DataType):
            # Scalar types must match exactly.
            if element_type != current:
                raise MlflowException.invalid_parameter_value(
                    "Expected all values in list to be of same type"
                )
        else:
            raise MlflowException.invalid_parameter_value(
                f"{element_type} is not a valid type for an item of a list or numpy array."
            )
    return inferred
179
+
180
+
181
def _infer_scalar_datatype(data) -> DataType:
    """Return the first ``DataType`` whose predicate accepts the scalar ``data``.

    Raises:
        MlflowException: If no supported ``DataType`` matches.
    """
    # The order is significant: long must be tried before integer (both map to
    # Python int) and double before float (both map to Python float).
    ordered_checks = (
        (DataType.is_boolean, DataType.boolean),
        (DataType.is_long, DataType.long),
        (DataType.is_integer, DataType.integer),
        (DataType.is_double, DataType.double),
        (DataType.is_float, DataType.float),
        (DataType.is_string, DataType.string),
        (DataType.is_binary, DataType.binary),
        (DataType.is_datetime, DataType.datetime),
    )
    for accepts, data_type in ordered_checks:
        if accepts(data):
            return data_type
    raise MlflowException.invalid_parameter_value(
        f"Data {data} is not one of the supported DataType"
    )
205
+
206
+
207
def infer_schema(data: Any) -> Schema:  # noqa: C901
    """
    Infer a schema from a dataset.

    Data passed as a numpy array, a scipy sparse matrix or a dictionary of numpy arrays is
    represented by :py:class:`TensorSpec`. All other inputs are described by :py:class:`ColSpec`.

    A `TensorSpec` captures the data shape (default variable axis is 0), the data type (numpy.dtype)
    and an optional name for each individual tensor of the dataset.
    A `ColSpec` captures the data type (defined in :py:class:`DataType`) and an optional name for
    each individual column of the dataset.

    This method will raise an exception if the user data contains incompatible types or is not
    passed in one of the supported formats (containers).

    The input should be one of these:
      - pandas.DataFrame
      - pandas.Series
      - numpy.ndarray
      - dictionary of (name -> numpy.ndarray)
      - scipy.sparse.csr_matrix/csc_matrix
      - DataType
      - List[DataType]
      - Dict[str, Union[DataType, List, Dict]]
      - List[Dict[str, Union[DataType, List, Dict]]]

    The last two formats are used to represent complex data structures. For example,

        Input Data:
            [
                {
                    'text': 'some sentence',
                    'ids': ['id1'],
                    'dict': {'key': 'value'}
                },
                {
                    'text': 'some sentence',
                    'ids': ['id1', 'id2'],
                    'dict': {'key': 'value', 'key2': 'value2'}
                },
            ]

        The corresponding pandas DataFrame representation should look like this:

                    text         ids                                dict
            0  some sentence       [id1]                    {'key': 'value'}
            1  some sentence  [id1, id2]  {'key': 'value', 'key2': 'value2'}

        The inferred schema should look like this:

            Schema([
                ColSpec(type=DataType.string, name='text'),
                ColSpec(type=Array(dtype=DataType.string), name='ids'),
                ColSpec(
                    type=Object([
                        Property(name='key', dtype=DataType.string),
                        Property(name='key2', dtype=DataType.string, required=False),
                    ]),
                    name='dict',
                ),
            ])

    The element types should be mappable to one of :py:class:`DataType` for dataframes and to
    one of numpy types for tensors.

    Args:
        data: Dataset to infer from.

    Returns:
        Schema

    Raises:
        MlflowException: If the data is not in a supported format or its types cannot be
            inferred.
    """
    from scipy.sparse import csc_matrix, csr_matrix

    # To keep backward compatibility with < 2.9.0, an empty list is inferred as string.
    # ref: https://github.com/mlflow/mlflow/pull/10125#discussion_r1372751487
    if isinstance(data, list) and data == []:
        return Schema([ColSpec(DataType.string)])

    # List[Dict[str, Union[DataType, List, Dict]]]
    if isinstance(data, list) and all(isinstance(value, dict) for value in data):
        # Pivot the records into per-column value lists.
        col_data_mapping = defaultdict(list)
        for item in data:
            for k, v in item.items():
                col_data_mapping[k].append(v)

        schema = Schema(
            [
                ColSpec(
                    # The values were gathered into a list, so the column type is
                    # the element type of the inferred Array.
                    _infer_colspec_type(values).dtype,
                    name=name,
                    # A column is required only if every record provides it.
                    required=all(name in item for item in data),
                )
                for name, values in col_data_mapping.items()
            ]
        )

    elif isinstance(data, dict):
        # dictionary of (name -> numpy.ndarray)
        if all(isinstance(values, np.ndarray) for values in data.values()):
            schema = Schema(
                [
                    TensorSpec(
                        type=clean_tensor_type(ndarray.dtype),
                        shape=_get_tensor_shape(ndarray),
                        name=name,
                    )
                    for name, ndarray in data.items()
                ]
            )
        # Dict[str, Union[DataType, List, Dict]]
        else:
            if any(not isinstance(key, str) for key in data):
                raise MlflowException("The dictionary keys are not all strings.")
            schema = Schema(
                [
                    ColSpec(
                        _infer_colspec_type(value),
                        name=name,
                        required=_infer_required(value),
                    )
                    for name, value in data.items()
                ]
            )
    # pandas.Series
    elif isinstance(data, pd.Series):
        name = getattr(data, "name", None)
        schema = Schema(
            [
                ColSpec(
                    type=_infer_pandas_column(data),
                    name=name,
                    required=_infer_required(data),
                )
            ]
        )
    # pandas.DataFrame
    elif isinstance(data, pd.DataFrame):
        schema = Schema(
            [
                ColSpec(
                    type=_infer_pandas_column(data[col]),
                    name=col,
                    required=_infer_required(data[col]),
                )
                for col in data.columns
            ]
        )
    # numpy.ndarray
    elif isinstance(data, np.ndarray):
        schema = Schema(
            [
                TensorSpec(
                    type=clean_tensor_type(data.dtype), shape=_get_tensor_shape(data)
                )
            ]
        )
    # scipy.sparse.csr_matrix/csc_matrix
    elif isinstance(data, (csc_matrix, csr_matrix)):
        schema = Schema(
            [
                TensorSpec(
                    type=clean_tensor_type(data.data.dtype),
                    shape=_get_tensor_shape(data),
                )
            ]
        )
    elif isinstance(data, list):
        # Assume list as a single column
        # List[DataType]
        # e.g. ['some sentence', 'some sentence'] -> Schema([ColSpec(type=DataType.string)])
        # The corresponding pandas DataFrame representation should be pd.DataFrame(data)
        # We set required=True as unnamed optional inputs is not allowed
        schema = Schema([ColSpec(_infer_colspec_type(data).dtype)])
    else:
        # DataType
        # e.g. "some sentence" -> Schema([ColSpec(type=DataType.string)])
        try:
            # We set required=True as unnamed optional inputs is not allowed
            schema = Schema([ColSpec(_infer_colspec_type(data))])
        except MlflowException as e:
            # The list below mirrors the branches handled above; this function has
            # no Spark DataFrame branch, so that type is not advertised.
            raise MlflowException.invalid_parameter_value(
                "Failed to infer schema. Expected one of the following types:\n"
                "- pandas.DataFrame\n"
                "- pandas.Series\n"
                "- numpy.ndarray\n"
                "- dictionary of (name -> numpy.ndarray)\n"
                "- scipy.sparse.csr_matrix\n"
                "- scipy.sparse.csc_matrix\n"
                "- DataType\n"
                "- List[DataType]\n"
                "- Dict[str, Union[DataType, List, Dict]]\n"
                "- List[Dict[str, Union[DataType, List, Dict]]]\n"
                f"but got '{data}'.\n"
                f"Error: {e}",
            ) from None
    if not schema.is_tensor_spec() and any(
        t in (DataType.integer, DataType.long) for t in schema.input_types()
    ):
        # TODO: Check with the team on this to keep it or remove it
        warnings.warn(
            "Hint: Inferred schema contains integer column(s). Integer columns in "
            "Python cannot represent missing values. If your input data contains "
            "missing values at inference time, it will be encoded as floats and will "
            "cause a schema enforcement error. The best way to avoid this problem is "
            "to infer the model schema based on a realistic data sample (training "
            "dataset) that includes missing values. Alternatively, you can declare "
            "integer columns as doubles (float64) whenever these columns may have "
            "missing values. See `Handling Integers With Missing Values "
            "<https://www.mlflow.org/docs/latest/models.html#"
            "handling-integers-with-missing-values>`_ for more details.",
            stacklevel=2,
        )
    return schema
423
+
424
+
425
def _infer_numpy_dtype(dtype) -> DataType:  # noqa: C901
    """Map a numpy dtype (or pandas extension dtype) to a ``DataType``.

    Args:
        dtype: A ``numpy.dtype`` or, when pandas provides it, an
            ``ExtensionDtype`` instance.

    Returns:
        The matching ``DataType``.

    Raises:
        TypeError: If ``dtype`` is not an accepted dtype object.
        MlflowException: For object dtypes (values must be inspected instead)
            and for any dtype with no mapping.
    """
    # noinspection PyBroadException
    try:
        from pandas.core.dtypes.base import ExtensionDtype
    except ImportError:
        # This version of pandas does not support extension types
        accepted_types = np.dtype
    else:
        accepted_types = (np.dtype, ExtensionDtype)

    if not isinstance(dtype, accepted_types):
        raise TypeError(
            f"Expected numpy.dtype or pandas.ExtensionDtype, got '{type(dtype)}'."
        )

    kind = dtype.kind

    # Kinds that map to a DataType regardless of item size.
    size_independent = {
        "b": DataType.boolean,
        "U": DataType.string,
        "S": DataType.binary,
        "M": DataType.datetime,
    }
    if kind in size_independent:
        return size_independent[kind]

    if kind in ("i", "u"):
        signed = kind == "i"
        width = dtype.itemsize
        # An unsigned value needs a strictly wider signed container, hence the
        # asymmetric bounds for "i" and "u".
        if width < 4 or (signed and width == 4):
            return DataType.integer
        if width < 8 or (signed and width == 8):
            return DataType.long
    elif kind == "f":
        width = dtype.itemsize
        if width <= 4:
            return DataType.float
        if width <= 8:
            return DataType.double
    elif kind == "O":
        raise MlflowException(
            "Can not infer object without looking at the values, call _map_numpy_array instead."
        ) from None

    # Anything left over (including too-wide ints/floats) has no mapping.
    raise MlflowException(
        f"Unsupported numpy data type '{dtype}', kind '{kind}'"
    ) from None
467
+
468
+
469
+ def _is_none_or_nan(x):
470
+ if isinstance(x, float):
471
+ return np.isnan(x)
472
+ return x is None
473
+
474
+
475
def _infer_required(col) -> bool:
    """Return ``True`` when ``col`` contains no ``None`` / NaN entry.

    Lists and pandas Series are inspected item by item; any other value is
    checked as a single entry.
    """
    entries = col if isinstance(col, (list, pd.Series)) else [col]
    for entry in entries:
        if _is_none_or_nan(entry):
            return False
    return True
479
+
480
+
481
def _infer_pandas_column(col: pd.Series) -> DataType:
    """Infer the schema type of a single pandas column.

    Args:
        col: The column to inspect; must be a one-dimensional ``pandas.Series``.

    Returns:
        The inferred type of the column's elements.

    Raises:
        TypeError: If ``col`` is not a ``pandas.Series``.
        MlflowException: If the column is not one-dimensional or its type
            cannot be inferred.
    """
    if not isinstance(col, pd.Series):
        raise TypeError(f"Expected pandas.Series, got '{type(col)}'.")
    if len(col.values.shape) > 1:
        raise MlflowException(f"Expected 1d array, got array with shape {col.shape}")

    # Give pandas a chance to find a more specific dtype for object columns.
    if col.dtype.kind == "O":
        col = col.infer_objects()

    if col.dtype.kind != "O":
        # NB: The following works for numpy types as well as pandas extension types.
        return _infer_numpy_dtype(col.dtype)

    try:
        # The Series is converted to a list for inference, so the column's real
        # type is the element type of the resulting Array.
        return _infer_colspec_type(col.to_list()).dtype
    except Exception as err:
        # For backwards compatibility, we fall back to string
        # if the provided array is of string type
        # This is for diviner test where df field is ('key2', 'key1', 'key0')
        if pd.api.types.is_string_dtype(col):
            return DataType.string
        raise MlflowException(
            f"Failed to infer schema for pandas.Series {col}. Error: {err}"
        ) from None
507
+
508
+
509
+ def _is_all_string(x):
510
+ return all(isinstance(v, str) for v in x)
511
+
512
+
513
def _validate_is_all_string(x):
    """Raise ``MlflowException`` unless every item of ``x`` is a string."""
    if _is_all_string(x):
        return
    raise MlflowException.invalid_parameter_value(
        f"Expected all values to be string, got {x}"
    )
518
+
519
+
520
def _validate_all_keys_string(d):
    """Raise ``MlflowException`` unless every key of ``d`` is a string."""
    key_list = list(d.keys())
    if _is_all_string(key_list):
        return
    raise MlflowException.invalid_parameter_value(
        f"Expected example to be dict with string keys, got {key_list}",
    )
526
+
527
+
528
def _validate_all_values_string(d):
    """Raise ``MlflowException`` unless every value of ``d`` is a string."""
    value_list = list(d.values())
    if _is_all_string(value_list):
        return
    raise MlflowException.invalid_parameter_value(
        f"Expected example to be dict with string values, got {value_list}"
    )
534
+
535
+
536
+ def _validate_keys_match(d, expected_keys):
537
+ if d.keys() != expected_keys:
538
+ raise MlflowException.invalid_parameter_value(
539
+ f"Expected example to be dict with keys {list(expected_keys)}, got {list(d.keys())}",
540
+ )
541
+
542
+
543
+ def _validate_num_items(d, num_items):
544
+ actual_num_items = len(d)
545
+ if actual_num_items != num_items:
546
+ raise MlflowException.invalid_parameter_value(
547
+ f"Expected example to be dict with {num_items} items, got {actual_num_items}",
548
+ )
549
+
550
+
551
+ def _validate_has_items(d):
552
+ num_items = len(d)
553
+ if num_items == 0:
554
+ raise MlflowException.invalid_parameter_value(
555
+ f"Expected example to be dict with at least one item, got {num_items}",
556
+ )
557
+
558
+
559
+ def _validate_is_dict(d):
560
+ if not isinstance(d, dict):
561
+ raise MlflowException.invalid_parameter_value(
562
+ f"Expected each item in example to be dict, got {type(d).__name__}",
563
+ )
564
+
565
+
566
+ def _validate_non_empty(examples):
567
+ num_items = len(examples)
568
+ if num_items == 0:
569
+ raise MlflowException.invalid_parameter_value(
570
+ f"Expected examples to be non-empty list, got {num_items}",
571
+ )
572
+
573
+
574
+ def _validate_is_list(examples):
575
+ if not isinstance(examples, list):
576
+ raise MlflowException.invalid_parameter_value(
577
+ f"Expected examples to be list, got {type(examples).__name__}",
578
+ )
579
+
580
+
581
def _validate_dict_examples(examples, num_items=None):
    """Validate that ``examples`` are consistent string-to-string dictionaries.

    Every example must be a non-empty dict whose keys and values are all
    strings; when ``num_items`` is given, each example must have exactly that
    many items. All examples after the first must have the same keys as the
    first one.

    Args:
        examples: Iterable of examples; it must yield at least one item.
        num_items: Optional exact number of items each example must hold.

    Raises:
        MlflowException: If any of the checks above fails.
    """

    def _check_single(example):
        # Per-example checks, applied in the same order to every example.
        _validate_is_dict(example)
        _validate_has_items(example)
        if num_items is not None:
            _validate_num_items(example, num_items)
        _validate_all_keys_string(example)
        _validate_all_values_string(example)

    remaining = iter(examples)
    reference = next(remaining)
    _check_single(reference)
    reference_keys = reference.keys()

    for candidate in remaining:
        _check_single(candidate)
        _validate_keys_match(candidate, reference_keys)
600
+
601
+
602
+ def _is_list_str(type_hint: Any) -> bool:
603
+ return type_hint in [
604
+ List[str], # noqa: UP006
605
+ list[str],
606
+ ]
607
+
608
+
609
+ def _is_list_dict_str(type_hint: Any) -> bool:
610
+ return type_hint in [
611
+ List[Dict[str, str]], # noqa: UP006
612
+ list[Dict[str, str]], # noqa: UP006
613
+ List[dict[str, str]], # noqa: UP006
614
+ list[dict[str, str]],
615
+ ]
616
+
617
+
618
def _infer_schema_from_type_hint(type_hint, examples=None):
    """Build a schema from a supported type hint, validating optional examples.

    Args:
        type_hint: The annotation to interpret. Only "list of str" and
            "list of dict of str to str" hints are supported.
        examples: Optional example values; when given they must be non-empty
            and are validated against the hint.

    Returns:
        A ``Schema`` of string columns for supported hints, or ``None`` when
        the hint is not supported.

    Raises:
        MlflowException: If ``examples`` is given but fails validation.
    """
    examples_given = examples is not None
    if examples_given:
        _validate_non_empty(examples)

    if _is_list_str(type_hint):
        if examples_given:
            _validate_is_list(examples)
            _validate_is_all_string(examples)
        return Schema([ColSpec(type="string", name=None)])

    if not _is_list_dict_str(type_hint):
        _logger.info("Unsupported type hint: %s, skipping schema inference", type_hint)
        return None

    if not examples_given:
        # Column names come from the example dictionaries; without one only a
        # single unnamed string column can be produced.
        _logger.warning(
            f"Could not infer schema for {type_hint} because example is missing"
        )
        return Schema([ColSpec(type="string", name=None)])

    _validate_is_list(examples)
    _validate_dict_examples(examples)
    first_example = examples[0]
    return Schema([ColSpec(type="string", name=key) for key in first_example])
641
+
642
+
643
+ def _get_array_depth(data: Any) -> int:
644
+ if isinstance(data, np.ndarray):
645
+ return data.ndim
646
+ if isinstance(data, list):
647
+ return max(_get_array_depth(item) for item in data) + 1 if data else 1
648
+ return 0
649
+
650
+
651
def _infer_type_and_shape(value):
    """Infer the ``(type, shape)`` pair describing a parameter value.

    One-dimensional lists / numpy arrays produce shape ``(-1,)``; scalars and
    datetime values produce shape ``None``.

    Args:
        value: The parameter value to inspect.

    Returns:
        A ``(DataType, shape)`` tuple.

    Raises:
        MlflowException: If ``value`` is an array that is not 1D, is neither an
            array nor a scalar, or its type cannot be inferred.
    """
    if isinstance(value, (list, np.ndarray)):
        depth = _get_array_depth(value)
        if depth != 1:
            raise MlflowException.invalid_parameter_value(
                f"Expected parameters to be 1D array or scalar, got {depth}D array",
            )
        array_shape = (-1,)
        if all(DataType.is_datetime(v) for v in value):
            return DataType.datetime, array_shape
        return _infer_numpy_dtype(np.array(value).dtype), array_shape

    if DataType.is_datetime(value):
        return DataType.datetime, None

    if not np.isscalar(value):
        raise MlflowException.invalid_parameter_value(
            f"Expected parameters to be 1D array or scalar, got {type(value).__name__}",
        ) from None

    try:
        return _infer_numpy_dtype(np.array(value).dtype), None
    except (Exception, MlflowException) as err:
        raise MlflowException.invalid_parameter_value(
            f"Failed to infer schema for parameter {value}: {err!r}"
        ) from None
675
+
676
+
677
def infer_param_schema(parameters: Dict[str, Any]):
    """Infer a ``ParamSchema`` from a dictionary of parameter values.

    Each value becomes a ``ParamSpec`` whose default is the value itself.
    Failures are collected across all parameters and reported together.

    Args:
        parameters: Mapping of parameter name to value.

    Returns:
        A ``ParamSchema`` with one ``ParamSpec`` per parameter.

    Raises:
        MlflowException: If ``parameters`` is not a dict, or if inference
            failed for at least one parameter.
    """
    if not isinstance(parameters, dict):
        raise MlflowException.invalid_parameter_value(
            f"Expected parameters to be dict, got {type(parameters).__name__}",
        )

    specs = []
    failures = []
    for name, value in parameters.items():
        try:
            inferred_type, shape = _infer_type_and_shape(value)
            spec = ParamSpec(name=name, dtype=inferred_type, default=value, shape=shape)
        except Exception as err:
            # Keep going so every bad parameter shows up in one error.
            failures.append((name, value, err))
        else:
            specs.append(spec)

    if failures:
        raise MlflowException.invalid_parameter_value(
            f"Failed to infer schema for parameters: {failures}",
        )

    return ParamSchema(specs)