snowflake-ml-python 1.7.0__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. snowflake/cortex/__init__.py +4 -0
  2. snowflake/cortex/_complete.py +107 -64
  3. snowflake/cortex/_finetune.py +273 -0
  4. snowflake/cortex/_sse_client.py +91 -28
  5. snowflake/cortex/_util.py +30 -1
  6. snowflake/ml/_internal/type_utils.py +3 -3
  7. snowflake/ml/_internal/utils/jwt_generator.py +141 -0
  8. snowflake/ml/data/__init__.py +5 -0
  9. snowflake/ml/model/_client/model/model_version_impl.py +26 -12
  10. snowflake/ml/model/_client/ops/model_ops.py +51 -30
  11. snowflake/ml/model/_client/ops/service_ops.py +25 -9
  12. snowflake/ml/model/_client/sql/model.py +0 -14
  13. snowflake/ml/model/_client/sql/service.py +25 -1
  14. snowflake/ml/model/_client/sql/stage.py +1 -1
  15. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
  16. snowflake/ml/model/_packager/model_env/model_env.py +12 -0
  17. snowflake/ml/model/_packager/model_handlers/_utils.py +1 -1
  18. snowflake/ml/model/_packager/model_handlers/catboost.py +1 -1
  19. snowflake/ml/model/_packager/model_handlers/custom.py +3 -1
  20. snowflake/ml/model/_packager/model_handlers/lightgbm.py +2 -1
  21. snowflake/ml/model/_packager/model_handlers/sklearn.py +50 -1
  22. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +1 -1
  23. snowflake/ml/model/_packager/model_handlers/tensorflow.py +23 -6
  24. snowflake/ml/model/_packager/model_handlers/torchscript.py +14 -14
  25. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +2 -3
  26. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +5 -0
  27. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -10
  28. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -9
  29. snowflake/ml/model/_packager/model_task/model_task_utils.py +1 -1
  30. snowflake/ml/model/_signatures/core.py +63 -16
  31. snowflake/ml/model/_signatures/pandas_handler.py +71 -27
  32. snowflake/ml/model/_signatures/pytorch_handler.py +2 -2
  33. snowflake/ml/model/_signatures/snowpark_handler.py +2 -1
  34. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -2
  35. snowflake/ml/model/_signatures/utils.py +4 -1
  36. snowflake/ml/model/model_signature.py +38 -9
  37. snowflake/ml/model/type_hints.py +1 -1
  38. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +2 -4
  39. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +2 -4
  40. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +148 -1200
  41. snowflake/ml/monitoring/_manager/model_monitor_manager.py +114 -238
  42. snowflake/ml/monitoring/entities/model_monitor_config.py +38 -12
  43. snowflake/ml/monitoring/model_monitor.py +12 -86
  44. snowflake/ml/registry/registry.py +28 -40
  45. snowflake/ml/utils/authentication.py +75 -0
  46. snowflake/ml/version.py +1 -1
  47. {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/METADATA +116 -52
  48. {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/RECORD +51 -49
  49. {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/WHEEL +1 -1
  50. snowflake/ml/monitoring/entities/model_monitor_interval.py +0 -46
  51. snowflake/ml/monitoring/entities/output_score_type.py +0 -90
  52. {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/LICENSE.txt +0 -0
  53. {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/top_level.txt +0 -0
--- a/snowflake/ml/model/_signatures/core.py
+++ b/snowflake/ml/model/_signatures/core.py
@@ -14,10 +14,12 @@ from typing import (
     Type,
     Union,
     final,
+    get_args,
 )

 import numpy as np
 import numpy.typing as npt
+import pandas as pd

 import snowflake.snowpark.types as spt
 from snowflake.ml._internal.exceptions import (
@@ -29,6 +31,21 @@ if TYPE_CHECKING:
     import mlflow
     import torch

+PandasExtensionTypes = Union[
+    pd.Int8Dtype,
+    pd.Int16Dtype,
+    pd.Int32Dtype,
+    pd.Int64Dtype,
+    pd.UInt8Dtype,
+    pd.UInt16Dtype,
+    pd.UInt32Dtype,
+    pd.UInt64Dtype,
+    pd.Float32Dtype,
+    pd.Float64Dtype,
+    pd.BooleanDtype,
+    pd.StringDtype,
+]
+

 class DataType(Enum):
     def __init__(self, value: str, snowpark_type: Type[spt.DataType], numpy_type: npt.DTypeLike) -> None:
@@ -67,11 +84,11 @@ class DataType(Enum):
         return f"DataType.{self.name}"

     @classmethod
-    def from_numpy_type(cls, np_type: npt.DTypeLike) -> "DataType":
+    def from_numpy_type(cls, input_type: Union[npt.DTypeLike, PandasExtensionTypes]) -> "DataType":
         """Translate numpy dtype to DataType for signature definition.

         Args:
-            np_type: The numpy dtype.
+            input_type: The numpy dtype or Pandas Extension Dtype

         Raises:
             SnowflakeMLException: NotImplementedError: Raised when the given numpy type is not supported.
@@ -79,6 +96,10 @@ class DataType(Enum):
         Returns:
             Corresponding DataType.
         """
+        # To support pandas extension dtype
+        if isinstance(input_type, get_args(PandasExtensionTypes)):
+            input_type = input_type.type
+
         np_to_snowml_type_mapping = {i._numpy_type: i for i in DataType}

         # Add datetime types:
@@ -88,12 +109,12 @@ class DataType(Enum):
             np_to_snowml_type_mapping[f"datetime64[{res}]"] = DataType.TIMESTAMP_NTZ

         for potential_type in np_to_snowml_type_mapping.keys():
-            if np.can_cast(np_type, potential_type, casting="no"):
+            if np.can_cast(input_type, potential_type, casting="no"):
                 # This is used since the same dtype might represented in different ways.
                 return np_to_snowml_type_mapping[potential_type]
         raise snowml_exceptions.SnowflakeMLException(
             error_code=error_codes.NOT_IMPLEMENTED,
-            original_exception=NotImplementedError(f"Type {np_type} is not supported as a DataType."),
+            original_exception=NotImplementedError(f"Type {input_type} is not supported as a DataType."),
         )

     @classmethod
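Together with the `get_args` import above, these hunks let `from_numpy_type` accept pandas extension dtypes by unwrapping them to their numpy counterpart via `.type`. A minimal sketch of the new behavior, assuming the internal module path `snowflake/ml/model/_signatures/core.py` from the file list above:

```python
import numpy as np
import pandas as pd

from snowflake.ml.model._signatures.core import DataType

# Extension dtypes expose their numpy counterpart via `.type`,
# which is exactly what the new isinstance/get_args branch unwraps.
assert pd.Int64Dtype().type is np.int64

# Both spellings should therefore resolve to the same DataType member.
assert DataType.from_numpy_type(np.int64) is DataType.from_numpy_type(pd.Int64Dtype())
```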
@@ -212,6 +233,7 @@ class FeatureSpec(BaseFeatureSpec):
         name: str,
         dtype: DataType,
         shape: Optional[Tuple[int, ...]] = None,
+        nullable: bool = True,
     ) -> None:
         """
         Initialize a feature.
@@ -219,6 +241,7 @@ class FeatureSpec(BaseFeatureSpec):
         Args:
             name: Name of the feature.
             dtype: Type of the elements in the feature.
+            nullable: Whether the feature is nullable. Defaults to True.
             shape: Used to represent scalar feature, 1-d feature list,
                 or n-d tensor. Use -1 to represent variable length. Defaults to None.

@@ -227,6 +250,7 @@ class FeatureSpec(BaseFeatureSpec):
                 - (2,): 1d list with a fixed length of 2.
                 - (-1,): 1d list with variable length, used for ragged tensor representation.
                 - (d1, d2, d3): 3d tensor.
+            nullable: Whether the feature is nullable. Defaults to True.

         Raises:
             SnowflakeMLException: TypeError: When the dtype input type is incorrect.
@@ -248,6 +272,8 @@ class FeatureSpec(BaseFeatureSpec):
         )
         self._shape = shape

+        self._nullable = nullable
+
     def as_snowpark_type(self) -> spt.DataType:
         result_type = self._dtype.as_snowpark_type()
         if not self._shape:
@@ -256,13 +282,34 @@ class FeatureSpec(BaseFeatureSpec):
             result_type = spt.ArrayType(result_type)
         return result_type

-    def as_dtype(self) -> Union[npt.DTypeLike, str]:
+    def as_dtype(self) -> Union[npt.DTypeLike, str, PandasExtensionTypes]:
         """Convert to corresponding local Type."""
+
         if not self._shape:
             # scalar dtype: use keys from `np.sctypeDict` to prevent unit-less dtype 'datetime64'
             if "datetime64" in self._dtype._value:
                 return self._dtype._value
-            return self._dtype._numpy_type
+
+            np_type = self._dtype._numpy_type
+            if self._nullable:
+                np_to_pd_dtype_mapping = {
+                    np.int8: pd.Int8Dtype(),
+                    np.int16: pd.Int16Dtype(),
+                    np.int32: pd.Int32Dtype(),
+                    np.int64: pd.Int64Dtype(),
+                    np.uint8: pd.UInt8Dtype(),
+                    np.uint16: pd.UInt16Dtype(),
+                    np.uint32: pd.UInt32Dtype(),
+                    np.uint64: pd.UInt64Dtype(),
+                    np.float32: pd.Float32Dtype(),
+                    np.float64: pd.Float64Dtype(),
+                    np.bool_: pd.BooleanDtype(),
+                    np.str_: pd.StringDtype(),
+                }
+
+                return np_to_pd_dtype_mapping.get(np_type, np_type)  # type: ignore[arg-type]
+
+            return np_type
         return np.object_

     def __eq__(self, other: object) -> bool:
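With `nullable=True` now the default, `as_dtype()` hands back a pandas nullable extension dtype for scalar features, so materialized pandas columns can carry `pd.NA`; `nullable=False` preserves the old plain-numpy behavior. A sketch, again against the internal `core` module:

```python
import numpy as np
import pandas as pd

from snowflake.ml.model._signatures.core import DataType, FeatureSpec

nullable_ft = FeatureSpec(name="age", dtype=DataType.INT64)  # nullable=True by default
strict_ft = FeatureSpec(name="age", dtype=DataType.INT64, nullable=False)

assert nullable_ft.as_dtype() == pd.Int64Dtype()  # extension dtype, can hold pd.NA
assert strict_ft.as_dtype() is np.int64           # plain numpy type, as in 1.7.0
```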
@@ -273,7 +320,10 @@ class FeatureSpec(BaseFeatureSpec):

     def __repr__(self) -> str:
         shape_str = f", shape={repr(self._shape)}" if self._shape else ""
-        return f"FeatureSpec(dtype={repr(self._dtype)}, name={repr(self._name)}{shape_str})"
+        return (
+            f"FeatureSpec(dtype={repr(self._dtype)}, "
+            f"name={repr(self._name)}{shape_str}, nullable={repr(self._nullable)})"
+        )

     def to_dict(self) -> Dict[str, Any]:
         """Serialize the feature group into a dict.
@@ -281,10 +331,7 @@ class FeatureSpec(BaseFeatureSpec):
         Returns:
             A dict that serializes the feature group.
         """
-        base_dict: Dict[str, Any] = {
-            "type": self._dtype.name,
-            "name": self._name,
-        }
+        base_dict: Dict[str, Any] = {"type": self._dtype.name, "name": self._name, "nullable": self._nullable}
         if self._shape is not None:
             base_dict["shape"] = self._shape
         return base_dict
@@ -304,7 +351,9 @@ class FeatureSpec(BaseFeatureSpec):
         if shape:
             shape = tuple(shape)
         type = DataType[input_dict["type"]]
-        return FeatureSpec(name=name, dtype=type, shape=shape)
+        # If nullable is not provided, default to False for backward compatibility.
+        nullable = input_dict.get("nullable", False)
+        return FeatureSpec(name=name, dtype=type, shape=shape, nullable=nullable)

     @classmethod
     def from_mlflow_spec(
@@ -475,10 +524,8 @@ class ModelSignature:
         sig_outs = loaded["outputs"]
         sig_inputs = loaded["inputs"]

-        deserialize_spec: Callable[[Dict[str, Any]], BaseFeatureSpec] = (
-            lambda sig_spec: FeatureGroupSpec.from_dict(sig_spec)
-            if "feature_group" in sig_spec
-            else FeatureSpec.from_dict(sig_spec)
+        deserialize_spec: Callable[[Dict[str, Any]], BaseFeatureSpec] = lambda sig_spec: (
+            FeatureGroupSpec.from_dict(sig_spec) if "feature_group" in sig_spec else FeatureSpec.from_dict(sig_spec)
         )

         return ModelSignature(
--- a/snowflake/ml/model/_signatures/pandas_handler.py
+++ b/snowflake/ml/model/_signatures/pandas_handler.py
@@ -1,4 +1,5 @@
-from typing import Literal, Sequence
+import warnings
+from typing import Literal, Sequence, Union

 import numpy as np
 import pandas as pd
@@ -14,8 +15,8 @@ from snowflake.ml.model._signatures import base_handler, core, utils

 class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
     @staticmethod
-    def can_handle(data: model_types.SupportedDataType) -> TypeGuard[pd.DataFrame]:
-        return isinstance(data, pd.DataFrame)
+    def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Union[pd.DataFrame, pd.Series]]:
+        return isinstance(data, pd.DataFrame) or isinstance(data, pd.Series)

     @staticmethod
     def count(data: pd.DataFrame) -> int:
@@ -26,7 +27,17 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
         return data.head(min(PandasDataFrameHandler.count(data), PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT))

     @staticmethod
-    def validate(data: pd.DataFrame) -> None:
+    def validate(data: Union[pd.DataFrame, pd.Series]) -> None:
+        if isinstance(data, pd.Series):
+            # check if the series is empty and throw error
+            if data.empty:
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_DATA,
+                    original_exception=ValueError("Data Validation Error: Empty data is found."),
+                )
+            # convert the series to a dataframe
+            data = data.to_frame()
+
         df_cols = data.columns

         if df_cols.has_duplicates:  # Rule out categorical index with duplicates
@@ -60,21 +71,44 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):

         df_col_dtypes = [data[col].dtype for col in data.columns]
         for df_col, df_col_dtype in zip(df_cols, df_col_dtypes):
+            df_col_data = data[df_col]
+            if df_col_data.isnull().all():
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_DATA,
+                    original_exception=ValueError(
+                        f"Data Validation Error: There is no non-null data in column {df_col}."
+                    ),
+                )
+            if df_col_data.isnull().any():
+                warnings.warn(
+                    (
+                        f"Null value detected in column {df_col}, model signature inference might not accurate, "
+                        "or your prediction might fail if your model does not support null input. If this is not "
+                        "expected, please check your input dataframe."
+                    ),
+                    category=UserWarning,
+                    stacklevel=2,
+                )
+
+                df_col_data = utils.series_dropna(df_col_data)
+                df_col_dtype = df_col_data.dtype
+
             if df_col_dtype == np.dtype("O"):
                 # Check if all objects have the same type
-                if not all(isinstance(data_row, type(data[df_col].iloc[0])) for data_row in data[df_col]):
+                if not all(isinstance(data_row, type(df_col_data.iloc[0])) for data_row in df_col_data):
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
-                            f"Data Validation Error: Inconsistent type of object found in column data {data[df_col]}."
+                            "Data Validation Error: "
+                            + f"Inconsistent type of element in object found in column data {df_col_data}."
                         ),
                     )

-                if isinstance(data[df_col].iloc[0], list):
-                    arr = utils.convert_list_to_ndarray(data[df_col].iloc[0])
+                if isinstance(df_col_data.iloc[0], list):
+                    arr = utils.convert_list_to_ndarray(df_col_data.iloc[0])
                     arr_dtype = core.DataType.from_numpy_type(arr.dtype)

-                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data[df_col]]
+                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in df_col_data]

                     if not all(
                         core.DataType.from_numpy_type(converted_data.dtype) == arr_dtype
@@ -84,32 +118,37 @@
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
                             "Data Validation Error: "
-                            + f"Inconsistent type of element in object found in column data {data[df_col]}."
+                            + f"Inconsistent type of element in object found in column data {df_col_data}."
                         ),
                     )

-                elif isinstance(data[df_col].iloc[0], np.ndarray):
-                    arr_dtype = core.DataType.from_numpy_type(data[df_col].iloc[0].dtype)
+                elif isinstance(df_col_data.iloc[0], np.ndarray):
+                    arr_dtype = core.DataType.from_numpy_type(df_col_data.iloc[0].dtype)

-                    if not all(core.DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in data[df_col]):
+                    if not all(core.DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in df_col_data):
                         raise snowml_exceptions.SnowflakeMLException(
                             error_code=error_codes.INVALID_DATA,
                             original_exception=ValueError(
                                 "Data Validation Error: "
-                                + f"Inconsistent type of element in object found in column data {data[df_col]}."
+                                + f"Inconsistent type of element in object found in column data {df_col_data}."
                             ),
                         )
-                elif not isinstance(data[df_col].iloc[0], (str, bytes)):
+                elif not isinstance(df_col_data.iloc[0], (str, bytes)):
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
-                            f"Data Validation Error: Unsupported type confronted in {data[df_col]}"
+                            f"Data Validation Error: Unsupported type confronted in {df_col_data}"
                         ),
                     )

     @staticmethod
-    def infer_signature(data: pd.DataFrame, role: Literal["input", "output"]) -> Sequence[core.BaseFeatureSpec]:
+    def infer_signature(
+        data: Union[pd.DataFrame, pd.Series],
+        role: Literal["input", "output"],
+    ) -> Sequence[core.BaseFeatureSpec]:
         feature_prefix = f"{PandasDataFrameHandler.FEATURE_PREFIX}_"
+        if isinstance(data, pd.Series):
+            data = data.to_frame()
         df_cols = data.columns
         role_prefix = (
             PandasDataFrameHandler.INPUT_PREFIX if role == "input" else PandasDataFrameHandler.OUTPUT_PREFIX
@@ -123,29 +162,34 @@

         specs = []
         for df_col, df_col_dtype, ft_name in zip(df_cols, df_col_dtypes, ft_names):
+            df_col_data = data[df_col]
+            if df_col_data.isnull().any():
+                df_col_data = utils.series_dropna(df_col_data)
+            df_col_dtype = df_col_data.dtype
+
             if df_col_dtype == np.dtype("O"):
-                if isinstance(data[df_col].iloc[0], list):
-                    arr = utils.convert_list_to_ndarray(data[df_col].iloc[0])
+                if isinstance(df_col_data.iloc[0], list):
+                    arr = utils.convert_list_to_ndarray(df_col_data.iloc[0])
                     arr_dtype = core.DataType.from_numpy_type(arr.dtype)
-                    ft_shape = np.shape(data[df_col].iloc[0])
+                    ft_shape = np.shape(df_col_data.iloc[0])

-                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data[df_col]]
+                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in df_col_data]

                     if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list):
                         ft_shape = (-1,)

                     specs.append(core.FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape))
-                elif isinstance(data[df_col].iloc[0], np.ndarray):
-                    arr_dtype = core.DataType.from_numpy_type(data[df_col].iloc[0].dtype)
-                    ft_shape = np.shape(data[df_col].iloc[0])
+                elif isinstance(df_col_data.iloc[0], np.ndarray):
+                    arr_dtype = core.DataType.from_numpy_type(df_col_data.iloc[0].dtype)
+                    ft_shape = np.shape(df_col_data.iloc[0])

-                    if not all(np.shape(data_row) == ft_shape for data_row in data[df_col]):
+                    if not all(np.shape(data_row) == ft_shape for data_row in df_col_data):
                         ft_shape = (-1,)

                     specs.append(core.FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape))
-                elif isinstance(data[df_col].iloc[0], str):
+                elif isinstance(df_col_data.iloc[0], str):
                     specs.append(core.FeatureSpec(dtype=core.DataType.STRING, name=ft_name))
-                elif isinstance(data[df_col].iloc[0], bytes):
+                elif isinstance(df_col_data.iloc[0], bytes):
                     specs.append(core.FeatureSpec(dtype=core.DataType.BYTES, name=ft_name))
             elif isinstance(df_col_dtype, pd.CategoricalDtype):
                 category_dtype = df_col_dtype.categories.dtype
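Net effect of the pandas_handler changes: signature inference now accepts a bare `pd.Series`, and a column containing nulls is inferred from its non-null values after a `UserWarning` (an all-null column is rejected outright). A sketch using the public `infer_signature` entry point; the expected result is what the dropna-plus-`convert_dtypes` path should produce:

```python
import pandas as pd

from snowflake.ml.model import model_signature

# A Series (instead of a DataFrame) is now accepted and wrapped via to_frame();
# the None triggers a UserWarning and is dropped before dtype inference.
inputs = pd.Series([1, 2, None], name="feature")
outputs = pd.Series([0.1, 0.9, 0.5], name="prediction")

sig = model_signature.infer_signature(input_data=inputs, output_data=outputs)
print(sig.inputs)  # expected: an INT64 FeatureSpec, not float64, thanks to convert_dtypes()
```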
--- a/snowflake/ml/model/_signatures/pytorch_handler.py
+++ b/snowflake/ml/model/_signatures/pytorch_handler.py
@@ -72,10 +72,10 @@ class SeqOfPyTorchTensorHandler(base_handler.BaseDataHandler[Sequence["torch.Tensor"]]):
             dtype = core.DataType.from_torch_type(data_col.dtype)
             ft_name = f"{role_prefix}{feature_prefix}{i}"
             if len(data_col.shape) == 1:
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, nullable=False))
             else:
                 ft_shape = tuple(data_col.shape[1:])
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape, nullable=False))
         return features

     @staticmethod
--- a/snowflake/ml/model/_signatures/snowpark_handler.py
+++ b/snowflake/ml/model/_signatures/snowpark_handler.py
@@ -82,7 +82,8 @@ class SnowparkDataFrameHandler(base_handler.BaseDataHandler[snowflake.snowpark.DataFrame]):
                     identifier.get_unescaped_names(field.name)
                 ].map(json.loads)
         # Only when the feature is not from inference, we are confident to do the type casting.
-        # Otherwise, dtype_map will be empty
+        # Otherwise, dtype_map will be empty.
+        # Errors are ignored to make sure None won't be converted and won't raise Error
         df_local = df_local.astype(dtype=dtype_map)
         return df_local

--- a/snowflake/ml/model/_signatures/tensorflow_handler.py
+++ b/snowflake/ml/model/_signatures/tensorflow_handler.py
@@ -109,10 +109,10 @@ class SeqOfTensorflowTensorHandler(
             dtype = core.DataType.from_numpy_type(data_col.dtype.as_numpy_dtype)
             ft_name = f"{role_prefix}{feature_prefix}{i}"
             if len(data_col.shape) == 1:
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, nullable=False))
             else:
                 ft_shape = tuple(data_col.shape[1:])
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape, nullable=False))
         return features

     @staticmethod
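The PyTorch and TensorFlow handlers pin `nullable=False` on every inferred feature, which matches the data model: a dense tensor has no representation for missing values, so only pandas-derived features default to nullable. A sketch calling the internal handler directly (normally this runs inside signature inference; the printed name/shape are expectations, not guarantees):

```python
import torch

from snowflake.ml.model._signatures import pytorch_handler

tensors = [torch.tensor([1, 2, 3])]
features = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(tensors, role="input")
print(features)
# expected: [FeatureSpec(dtype=DataType.INT64, name='input_feature_0', nullable=False)]
```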
--- a/snowflake/ml/model/_signatures/utils.py
+++ b/snowflake/ml/model/_signatures/utils.py
@@ -118,7 +118,6 @@ def huggingface_pipeline_signature_auto_infer(task: str, params: Dict[str, Any])
             category=DeprecationWarning,
             stacklevel=1,
         )
-
         return core.ModelSignature(
             inputs=[
                 core.FeatureSpec(name="user_inputs", dtype=core.DataType.STRING, shape=(-1,)),
@@ -297,3 +296,7 @@ def huggingface_pipeline_signature_auto_infer(task: str, params: Dict[str, Any])
     )

     return None
+
+
+def series_dropna(series: pd.Series) -> pd.Series:
+    return series.dropna(inplace=False).reset_index(drop=True).convert_dtypes()
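The new `series_dropna` helper does more than drop nulls: `convert_dtypes()` moves the survivors onto pandas' nullable extension types, which is how a float64 column that only held integers plus a NaN comes back as `Int64` for inference. For instance (a sketch):

```python
import pandas as pd

from snowflake.ml.model._signatures import utils

s = pd.Series([1.0, 2.0, None])  # dtype float64: the None forced a float upcast
clean = utils.series_dropna(s)

print(clean.dtype)   # Int64 -- convert_dtypes() recovers the integer type
print(list(clean))   # [1, 2], with the index reset to 0..n-1
```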
--- a/snowflake/ml/model/model_signature.py
+++ b/snowflake/ml/model/model_signature.py
@@ -139,9 +139,32 @@ def _rename_signature_with_snowflake_identifiers(
     return signature


-def _validate_numpy_array(
-    arr: model_types._SupportedNumpyArray, feature_type: core.DataType, strict: bool = False
+def _validate_array_or_series_type(
+    arr: Union[model_types._SupportedNumpyArray, pd.Series], feature_type: core.DataType, strict: bool = False
 ) -> bool:
+    original_dtype = arr.dtype
+    dtype = arr.dtype
+    if isinstance(
+        dtype,
+        (
+            pd.Int8Dtype,
+            pd.Int16Dtype,
+            pd.Int32Dtype,
+            pd.Int64Dtype,
+            pd.UInt8Dtype,
+            pd.UInt16Dtype,
+            pd.UInt32Dtype,
+            pd.UInt64Dtype,
+            pd.Float32Dtype,
+            pd.Float64Dtype,
+            pd.BooleanDtype,
+        ),
+    ):
+        dtype = dtype.type
+    elif isinstance(dtype, pd.CategoricalDtype):
+        dtype = dtype.categories.dtype
+    elif isinstance(dtype, pd.StringDtype):
+        dtype = np.str_
     if feature_type in [
         core.DataType.INT8,
         core.DataType.INT16,
@@ -152,14 +175,17 @@ def _validate_numpy_array(
         core.DataType.UINT32,
         core.DataType.UINT64,
     ]:
-        if not (np.issubdtype(arr.dtype, np.integer)):
+        if not (np.issubdtype(dtype, np.integer)):
             return False
         if not strict:
             return True
-        min_v, max_v = arr.min(), arr.max()
+        if isinstance(original_dtype, pd.CategoricalDtype):
+            min_v, max_v = arr.cat.as_ordered().min(), arr.cat.as_ordered().min()  # type: ignore[union-attr]
+        else:
+            min_v, max_v = arr.min(), arr.max()
         return bool(max_v <= np.iinfo(feature_type._numpy_type).max and min_v >= np.iinfo(feature_type._numpy_type).min)
     elif feature_type in [core.DataType.FLOAT, core.DataType.DOUBLE]:
-        if not (np.issubdtype(arr.dtype, np.integer) or np.issubdtype(arr.dtype, np.floating)):
+        if not (np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.floating)):
             return False
         if not strict:
             return True
@@ -171,7 +197,7 @@ def _validate_numpy_array(
     elif feature_type in [core.DataType.TIMESTAMP_NTZ]:
         return np.issubdtype(arr.dtype, np.datetime64)
     else:
-        return np.can_cast(arr.dtype, feature_type._numpy_type, casting="no")
+        return np.can_cast(dtype, feature_type._numpy_type, casting="no")


 def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
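The renamed `_validate_array_or_series_type` normalizes extension, categorical, and string dtypes to a numpy dtype before running the existing checks, and `_validate_pandas_df` now passes the Series itself rather than `data_col.to_numpy()` (which, in the presence of `pd.NA`, would degrade to `object` or `float64` and break integer validation). A sketch of what should now validate, using the internal API:

```python
import pandas as pd

from snowflake.ml.model import model_signature
from snowflake.ml.model._signatures.core import DataType

# A masked nullable-integer column keeps its Int64 dtype; the validator
# unwraps pd.Int64Dtype to np.int64 instead of round-tripping through numpy.
col = pd.Series([1, 2, pd.NA], dtype="Int64")
assert model_signature._validate_array_or_series_type(col, DataType.INT64)
```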
@@ -204,7 +230,10 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
             original_exception=ValueError(f"Data Validation Error: feature {ft_name} does not exist in data."),
         )

+        if data_col.isnull().any():
+            data_col = utils.series_dropna(data_col)
         df_col_dtype = data_col.dtype
+
         if isinstance(feature, core.FeatureGroupSpec):
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.NOT_IMPLEMENTED,
@@ -217,7 +246,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
             if isinstance(df_col_dtype, pd.CategoricalDtype):
                 df_col_dtype = df_col_dtype.categories.dtype
             if df_col_dtype != np.dtype("O"):
-                if not _validate_numpy_array(data_col.to_numpy(), ft_type, strict=strict):
+                if not _validate_array_or_series_type(data_col, ft_type, strict=strict):
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
@@ -247,7 +276,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
                 converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data_col]

                 if not all(
-                    _validate_numpy_array(converted_data, ft_type, strict=strict)
+                    _validate_array_or_series_type(converted_data, ft_type, strict=strict)
                     for converted_data in converted_data_list
                 ):
                     raise snowml_exceptions.SnowflakeMLException(
@@ -278,7 +307,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
                     ),
                 )

-                if not all(_validate_numpy_array(data_row, ft_type, strict=strict) for data_row in data_col):
+                if not all(_validate_array_or_series_type(data_row, ft_type, strict=strict) for data_row in data_col):
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
--- a/snowflake/ml/model/type_hints.py
+++ b/snowflake/ml/model/type_hints.py
@@ -66,7 +66,7 @@ SupportedRequireSignatureModelType = Union[
     "xgboost.XGBModel",
     "xgboost.Booster",
     "torch.nn.Module",
-    "torch.jit.ScriptModule",  # type:ignore[name-defined]
+    "torch.jit.ScriptModule",
     "tensorflow.Module",
 ]

--- a/snowflake/ml/modeling/lightgbm/lgbm_classifier.py
+++ b/snowflake/ml/modeling/lightgbm/lgbm_classifier.py
@@ -130,8 +130,7 @@ class LGBMClassifier(BaseTransformer):
         reg_alpha=0.0,
         reg_lambda=0.0,
         random_state=None,
-        n_jobs=-1,
-        silent="warn",
+        n_jobs=None,
         importance_type="split",
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -172,8 +171,7 @@ class LGBMClassifier(BaseTransformer):
             'reg_alpha':(reg_alpha, 0.0, False),
             'reg_lambda':(reg_lambda, 0.0, False),
             'random_state':(random_state, None, False),
-            'n_jobs':(n_jobs, -1, False),
-            'silent':(silent, "warn", False),
+            'n_jobs':(n_jobs, None, False),
             'importance_type':(importance_type, "split", False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
--- a/snowflake/ml/modeling/lightgbm/lgbm_regressor.py
+++ b/snowflake/ml/modeling/lightgbm/lgbm_regressor.py
@@ -130,8 +130,7 @@ class LGBMRegressor(BaseTransformer):
         reg_alpha=0.0,
         reg_lambda=0.0,
         random_state=None,
-        n_jobs=-1,
-        silent="warn",
+        n_jobs=None,
         importance_type="split",
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -172,8 +171,7 @@ class LGBMRegressor(BaseTransformer):
             'reg_alpha':(reg_alpha, 0.0, False),
             'reg_lambda':(reg_lambda, 0.0, False),
             'random_state':(random_state, None, False),
-            'n_jobs':(n_jobs, -1, False),
-            'silent':(silent, "warn", False),
+            'n_jobs':(n_jobs, None, False),
             'importance_type':(importance_type, "split", False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
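Both regenerated lightgbm wrappers track lightgbm 4.x, which removed the `silent` constructor argument and changed the `n_jobs` default from `-1` to `None`. Callers that passed `silent=` to the 1.7.0 wrappers need the same migration as with upstream lightgbm; a sketch (column names are hypothetical):

```python
from snowflake.ml.modeling.lightgbm import LGBMClassifier

# 1.7.0 (lightgbm 3.x era): LGBMClassifier(n_jobs=-1, silent="warn", ...)
# 1.7.2 (lightgbm 4.x era): `silent` is gone; the wrapper mirrors the new constructor.
clf = LGBMClassifier(
    n_jobs=None,               # new default; lightgbm picks the thread count
    importance_type="split",
    input_cols=["F1", "F2"],   # hypothetical feature columns
    label_cols=["LABEL"],      # hypothetical label column
    output_cols=["PREDICTION"],
)
```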