snowflake-ml-python 1.7.0__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +4 -0
- snowflake/cortex/_complete.py +107 -64
- snowflake/cortex/_finetune.py +273 -0
- snowflake/cortex/_sse_client.py +91 -28
- snowflake/cortex/_util.py +30 -1
- snowflake/ml/_internal/type_utils.py +3 -3
- snowflake/ml/_internal/utils/jwt_generator.py +141 -0
- snowflake/ml/data/__init__.py +5 -0
- snowflake/ml/model/_client/model/model_version_impl.py +26 -12
- snowflake/ml/model/_client/ops/model_ops.py +51 -30
- snowflake/ml/model/_client/ops/service_ops.py +25 -9
- snowflake/ml/model/_client/sql/model.py +0 -14
- snowflake/ml/model/_client/sql/service.py +25 -1
- snowflake/ml/model/_client/sql/stage.py +1 -1
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
- snowflake/ml/model/_packager/model_env/model_env.py +12 -0
- snowflake/ml/model/_packager/model_handlers/_utils.py +1 -1
- snowflake/ml/model/_packager/model_handlers/catboost.py +1 -1
- snowflake/ml/model/_packager/model_handlers/custom.py +3 -1
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +2 -1
- snowflake/ml/model/_packager/model_handlers/sklearn.py +50 -1
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +1 -1
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +23 -6
- snowflake/ml/model/_packager/model_handlers/torchscript.py +14 -14
- snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +2 -3
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +5 -0
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -10
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -9
- snowflake/ml/model/_packager/model_task/model_task_utils.py +1 -1
- snowflake/ml/model/_signatures/core.py +63 -16
- snowflake/ml/model/_signatures/pandas_handler.py +71 -27
- snowflake/ml/model/_signatures/pytorch_handler.py +2 -2
- snowflake/ml/model/_signatures/snowpark_handler.py +2 -1
- snowflake/ml/model/_signatures/tensorflow_handler.py +2 -2
- snowflake/ml/model/_signatures/utils.py +4 -1
- snowflake/ml/model/model_signature.py +38 -9
- snowflake/ml/model/type_hints.py +1 -1
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +2 -4
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +2 -4
- snowflake/ml/monitoring/_client/model_monitor_sql_client.py +148 -1200
- snowflake/ml/monitoring/_manager/model_monitor_manager.py +114 -238
- snowflake/ml/monitoring/entities/model_monitor_config.py +38 -12
- snowflake/ml/monitoring/model_monitor.py +12 -86
- snowflake/ml/registry/registry.py +28 -40
- snowflake/ml/utils/authentication.py +75 -0
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/METADATA +116 -52
- {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/RECORD +51 -49
- {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/WHEEL +1 -1
- snowflake/ml/monitoring/entities/model_monitor_interval.py +0 -46
- snowflake/ml/monitoring/entities/output_score_type.py +0 -90
- {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.7.0.dist-info → snowflake_ml_python-1.7.2.dist-info}/top_level.txt +0 -0
snowflake/ml/model/_signatures/core.py CHANGED
@@ -14,10 +14,12 @@ from typing import (
     Type,
     Union,
     final,
+    get_args,
 )
 
 import numpy as np
 import numpy.typing as npt
+import pandas as pd
 
 import snowflake.snowpark.types as spt
 from snowflake.ml._internal.exceptions import (
@@ -29,6 +31,21 @@ if TYPE_CHECKING:
     import mlflow
     import torch
 
+PandasExtensionTypes = Union[
+    pd.Int8Dtype,
+    pd.Int16Dtype,
+    pd.Int32Dtype,
+    pd.Int64Dtype,
+    pd.UInt8Dtype,
+    pd.UInt16Dtype,
+    pd.UInt32Dtype,
+    pd.UInt64Dtype,
+    pd.Float32Dtype,
+    pd.Float64Dtype,
+    pd.BooleanDtype,
+    pd.StringDtype,
+]
+
 
 class DataType(Enum):
     def __init__(self, value: str, snowpark_type: Type[spt.DataType], numpy_type: npt.DTypeLike) -> None:
@@ -67,11 +84,11 @@ class DataType(Enum):
         return f"DataType.{self.name}"
 
     @classmethod
-    def from_numpy_type(cls,
+    def from_numpy_type(cls, input_type: Union[npt.DTypeLike, PandasExtensionTypes]) -> "DataType":
         """Translate numpy dtype to DataType for signature definition.
 
         Args:
-
+            input_type: The numpy dtype or Pandas Extension Dtype
 
         Raises:
             SnowflakeMLException: NotImplementedError: Raised when the given numpy type is not supported.
@@ -79,6 +96,10 @@ class DataType(Enum):
         Returns:
             Corresponding DataType.
         """
+        # To support pandas extension dtype
+        if isinstance(input_type, get_args(PandasExtensionTypes)):
+            input_type = input_type.type
+
         np_to_snowml_type_mapping = {i._numpy_type: i for i in DataType}
 
         # Add datetime types:
@@ -88,12 +109,12 @@ class DataType(Enum):
             np_to_snowml_type_mapping[f"datetime64[{res}]"] = DataType.TIMESTAMP_NTZ
 
         for potential_type in np_to_snowml_type_mapping.keys():
-            if np.can_cast(
+            if np.can_cast(input_type, potential_type, casting="no"):
                 # This is used since the same dtype might represented in different ways.
                 return np_to_snowml_type_mapping[potential_type]
         raise snowml_exceptions.SnowflakeMLException(
             error_code=error_codes.NOT_IMPLEMENTED,
-            original_exception=NotImplementedError(f"Type {
+            original_exception=NotImplementedError(f"Type {input_type} is not supported as a DataType."),
         )
 
     @classmethod
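The unwrap added above relies on a pandas convention: every extension dtype exposes its underlying numpy scalar type through `.type`, which is why a single `input_type = input_type.type` line is enough to reuse the existing numpy-based mapping. A minimal sketch of that behavior, using only pandas and numpy:

```python
import numpy as np
import pandas as pd

# Extension dtypes carry their numpy scalar type in `.type`.
assert pd.Int64Dtype().type is np.int64
assert pd.BooleanDtype().type is np.bool_
assert pd.Float32Dtype().type is np.float32
```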
@@ -212,6 +233,7 @@ class FeatureSpec(BaseFeatureSpec):
         name: str,
         dtype: DataType,
         shape: Optional[Tuple[int, ...]] = None,
+        nullable: bool = True,
     ) -> None:
         """
         Initialize a feature.
@@ -219,6 +241,7 @@ class FeatureSpec(BaseFeatureSpec):
         Args:
             name: Name of the feature.
             dtype: Type of the elements in the feature.
+            nullable: Whether the feature is nullable. Defaults to True.
             shape: Used to represent scalar feature, 1-d feature list,
                 or n-d tensor. Use -1 to represent variable length. Defaults to None.
 
@@ -227,6 +250,7 @@ class FeatureSpec(BaseFeatureSpec):
                 - (2,): 1d list with a fixed length of 2.
                 - (-1,): 1d list with variable length, used for ragged tensor representation.
                 - (d1, d2, d3): 3d tensor.
+            nullable: Whether the feature is nullable. Defaults to True.
 
         Raises:
             SnowflakeMLException: TypeError: When the dtype input type is incorrect.
@@ -248,6 +272,8 @@ class FeatureSpec(BaseFeatureSpec):
             )
         self._shape = shape
 
+        self._nullable = nullable
+
     def as_snowpark_type(self) -> spt.DataType:
         result_type = self._dtype.as_snowpark_type()
         if not self._shape:
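With the constructor change, callers can mark features as non-nullable explicitly. A hedged sketch, assuming the usual public re-export of `FeatureSpec` and `DataType` from `snowflake.ml.model.model_signature`:

```python
from snowflake.ml.model.model_signature import DataType, FeatureSpec

# Nullable by default under the new behavior ...
age = FeatureSpec(name="AGE", dtype=DataType.INT64)
# ... or pinned to non-nullable, as the tensor handlers below now do.
label = FeatureSpec(name="LABEL", dtype=DataType.INT8, nullable=False)
```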
@@ -256,13 +282,34 @@ class FeatureSpec(BaseFeatureSpec):
             result_type = spt.ArrayType(result_type)
         return result_type
 
-    def as_dtype(self) -> Union[npt.DTypeLike, str]:
+    def as_dtype(self) -> Union[npt.DTypeLike, str, PandasExtensionTypes]:
         """Convert to corresponding local Type."""
+
         if not self._shape:
             # scalar dtype: use keys from `np.sctypeDict` to prevent unit-less dtype 'datetime64'
             if "datetime64" in self._dtype._value:
                 return self._dtype._value
-
+
+            np_type = self._dtype._numpy_type
+            if self._nullable:
+                np_to_pd_dtype_mapping = {
+                    np.int8: pd.Int8Dtype(),
+                    np.int16: pd.Int16Dtype(),
+                    np.int32: pd.Int32Dtype(),
+                    np.int64: pd.Int64Dtype(),
+                    np.uint8: pd.UInt8Dtype(),
+                    np.uint16: pd.UInt16Dtype(),
+                    np.uint32: pd.UInt32Dtype(),
+                    np.uint64: pd.UInt64Dtype(),
+                    np.float32: pd.Float32Dtype(),
+                    np.float64: pd.Float64Dtype(),
+                    np.bool_: pd.BooleanDtype(),
+                    np.str_: pd.StringDtype(),
+                }
+
+                return np_to_pd_dtype_mapping.get(np_type, np_type)  # type: ignore[arg-type]
+
+            return np_type
         return np.object_
 
     def __eq__(self, other: object) -> bool:
@@ -273,7 +320,10 @@ class FeatureSpec(BaseFeatureSpec):
 
     def __repr__(self) -> str:
         shape_str = f", shape={repr(self._shape)}" if self._shape else ""
-        return
+        return (
+            f"FeatureSpec(dtype={repr(self._dtype)}, "
+            f"name={repr(self._name)}{shape_str}, nullable={repr(self._nullable)})"
+        )
 
     def to_dict(self) -> Dict[str, Any]:
         """Serialize the feature group into a dict.
@@ -281,10 +331,7 @@ class FeatureSpec(BaseFeatureSpec):
         Returns:
             A dict that serializes the feature group.
         """
-        base_dict: Dict[str, Any] = {
-            "type": self._dtype.name,
-            "name": self._name,
-        }
+        base_dict: Dict[str, Any] = {"type": self._dtype.name, "name": self._name, "nullable": self._nullable}
         if self._shape is not None:
             base_dict["shape"] = self._shape
         return base_dict
@@ -304,7 +351,9 @@ class FeatureSpec(BaseFeatureSpec):
         if shape:
             shape = tuple(shape)
         type = DataType[input_dict["type"]]
-
+        # If nullable is not provided, default to False for backward compatibility.
+        nullable = input_dict.get("nullable", False)
+        return FeatureSpec(name=name, dtype=type, shape=shape, nullable=nullable)
 
     @classmethod
     def from_mlflow_spec(
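The backward-compatibility default matters for artifacts serialized by older releases, whose signature dicts carry no "nullable" key. A sketch of the rule (the dict literal is illustrative, not a captured artifact):

```python
# Pre-1.7.x signature dict (illustrative): no "nullable" key present.
serialized = {"type": "INT64", "name": "FEATURE_0"}
nullable = serialized.get("nullable", False)
print(nullable)  # False -- old signatures deserialize as non-nullable
```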
@@ -475,10 +524,8 @@ class ModelSignature:
         sig_outs = loaded["outputs"]
         sig_inputs = loaded["inputs"]
 
-        deserialize_spec: Callable[[Dict[str, Any]], BaseFeatureSpec] = (
-
-            if "feature_group" in sig_spec
-            else FeatureSpec.from_dict(sig_spec)
+        deserialize_spec: Callable[[Dict[str, Any]], BaseFeatureSpec] = lambda sig_spec: (
+            FeatureGroupSpec.from_dict(sig_spec) if "feature_group" in sig_spec else FeatureSpec.from_dict(sig_spec)
         )
 
         return ModelSignature(
snowflake/ml/model/_signatures/pandas_handler.py CHANGED
@@ -1,4 +1,5 @@
-
+import warnings
+from typing import Literal, Sequence, Union
 
 import numpy as np
 import pandas as pd
@@ -14,8 +15,8 @@ from snowflake.ml.model._signatures import base_handler, core, utils
 
 class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
     @staticmethod
-    def can_handle(data: model_types.SupportedDataType) -> TypeGuard[pd.DataFrame]:
-        return isinstance(data, pd.DataFrame)
+    def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Union[pd.DataFrame, pd.Series]]:
+        return isinstance(data, pd.DataFrame) or isinstance(data, pd.Series)
 
     @staticmethod
     def count(data: pd.DataFrame) -> int:
@@ -26,7 +27,17 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
         return data.head(min(PandasDataFrameHandler.count(data), PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT))
 
     @staticmethod
-    def validate(data: pd.DataFrame) -> None:
+    def validate(data: Union[pd.DataFrame, pd.Series]) -> None:
+        if isinstance(data, pd.Series):
+            # check if the series is empty and throw error
+            if data.empty:
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_DATA,
+                    original_exception=ValueError("Data Validation Error: Empty data is found."),
+                )
+            # convert the series to a dataframe
+            data = data.to_frame()
+
         df_cols = data.columns
 
         if df_cols.has_duplicates:  # Rule out categorical index with duplicates
@@ -60,21 +71,44 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
 
         df_col_dtypes = [data[col].dtype for col in data.columns]
         for df_col, df_col_dtype in zip(df_cols, df_col_dtypes):
+            df_col_data = data[df_col]
+            if df_col_data.isnull().all():
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_DATA,
+                    original_exception=ValueError(
+                        f"Data Validation Error: There is no non-null data in column {df_col}."
+                    ),
+                )
+            if df_col_data.isnull().any():
+                warnings.warn(
+                    (
+                        f"Null value detected in column {df_col}, model signature inference might not accurate, "
+                        "or your prediction might fail if your model does not support null input. If this is not "
+                        "expected, please check your input dataframe."
+                    ),
+                    category=UserWarning,
+                    stacklevel=2,
+                )
+
+            df_col_data = utils.series_dropna(df_col_data)
+            df_col_dtype = df_col_data.dtype
+
             if df_col_dtype == np.dtype("O"):
                 # Check if all objects have the same type
-                if not all(isinstance(data_row, type(
+                if not all(isinstance(data_row, type(df_col_data.iloc[0])) for data_row in df_col_data):
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
-
+                            "Data Validation Error: "
+                            + f"Inconsistent type of element in object found in column data {df_col_data}."
                         ),
                     )
 
-                if isinstance(
-                    arr = utils.convert_list_to_ndarray(
+                if isinstance(df_col_data.iloc[0], list):
+                    arr = utils.convert_list_to_ndarray(df_col_data.iloc[0])
                     arr_dtype = core.DataType.from_numpy_type(arr.dtype)
 
-                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in
+                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in df_col_data]
 
                     if not all(
                         core.DataType.from_numpy_type(converted_data.dtype) == arr_dtype
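The two null checks added to `validate` draw a line between recoverable and unrecoverable inputs: an all-null column cannot support type inference at all, while a partially null column merely degrades it. The distinction in plain pandas:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, None, 3], "B": [None, None, None]})
print(df["B"].isnull().all())  # True  -> hard error: no non-null data to infer from
print(df["A"].isnull().any())  # True  -> UserWarning, then inference on dropna'd data
```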
@@ -84,32 +118,37 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
                             "Data Validation Error: "
-                            + f"Inconsistent type of element in object found in column data {
+                            + f"Inconsistent type of element in object found in column data {df_col_data}."
                         ),
                     )
 
-                elif isinstance(
-                    arr_dtype = core.DataType.from_numpy_type(
+                elif isinstance(df_col_data.iloc[0], np.ndarray):
+                    arr_dtype = core.DataType.from_numpy_type(df_col_data.iloc[0].dtype)
 
-                    if not all(core.DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in
+                    if not all(core.DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in df_col_data):
                         raise snowml_exceptions.SnowflakeMLException(
                             error_code=error_codes.INVALID_DATA,
                             original_exception=ValueError(
                                 "Data Validation Error: "
-                                + f"Inconsistent type of element in object found in column data {
+                                + f"Inconsistent type of element in object found in column data {df_col_data}."
                             ),
                         )
-                elif not isinstance(
+                elif not isinstance(df_col_data.iloc[0], (str, bytes)):
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
-                            f"Data Validation Error: Unsupported type confronted in {
+                            f"Data Validation Error: Unsupported type confronted in {df_col_data}"
                         ),
                     )
 
     @staticmethod
-    def infer_signature(
+    def infer_signature(
+        data: Union[pd.DataFrame, pd.Series],
+        role: Literal["input", "output"],
+    ) -> Sequence[core.BaseFeatureSpec]:
         feature_prefix = f"{PandasDataFrameHandler.FEATURE_PREFIX}_"
+        if isinstance(data, pd.Series):
+            data = data.to_frame()
         df_cols = data.columns
         role_prefix = (
             PandasDataFrameHandler.INPUT_PREFIX if role == "input" else PandasDataFrameHandler.OUTPUT_PREFIX
@@ -123,29 +162,34 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
 
         specs = []
         for df_col, df_col_dtype, ft_name in zip(df_cols, df_col_dtypes, ft_names):
+            df_col_data = data[df_col]
+            if df_col_data.isnull().any():
+                df_col_data = utils.series_dropna(df_col_data)
+            df_col_dtype = df_col_data.dtype
+
             if df_col_dtype == np.dtype("O"):
-                if isinstance(
-                    arr = utils.convert_list_to_ndarray(
+                if isinstance(df_col_data.iloc[0], list):
+                    arr = utils.convert_list_to_ndarray(df_col_data.iloc[0])
                     arr_dtype = core.DataType.from_numpy_type(arr.dtype)
-                    ft_shape = np.shape(
+                    ft_shape = np.shape(df_col_data.iloc[0])
 
-                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in
+                    converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in df_col_data]
 
                     if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list):
                         ft_shape = (-1,)
 
                     specs.append(core.FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape))
-                elif isinstance(
-                    arr_dtype = core.DataType.from_numpy_type(
-                    ft_shape = np.shape(
+                elif isinstance(df_col_data.iloc[0], np.ndarray):
+                    arr_dtype = core.DataType.from_numpy_type(df_col_data.iloc[0].dtype)
+                    ft_shape = np.shape(df_col_data.iloc[0])
 
-                    if not all(np.shape(data_row) == ft_shape for data_row in
+                    if not all(np.shape(data_row) == ft_shape for data_row in df_col_data):
                         ft_shape = (-1,)
 
                     specs.append(core.FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape))
-                elif isinstance(
+                elif isinstance(df_col_data.iloc[0], str):
                     specs.append(core.FeatureSpec(dtype=core.DataType.STRING, name=ft_name))
-                elif isinstance(
+                elif isinstance(df_col_data.iloc[0], bytes):
                     specs.append(core.FeatureSpec(dtype=core.DataType.BYTES, name=ft_name))
             elif isinstance(df_col_dtype, pd.CategoricalDtype):
                 category_dtype = df_col_dtype.categories.dtype
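Taken together, the handler changes mean a bare `pd.Series` is now accepted wherever the handler previously demanded a `pd.DataFrame`. A hedged usage sketch, assuming the public `infer_signature` wrapper in `snowflake.ml.model.model_signature` delegates to this handler:

```python
import pandas as pd
from snowflake.ml.model import model_signature

# A named Series is converted to a one-column frame internally.
inputs = pd.Series([1.0, 2.0, 3.0], name="feature")
outputs = pd.Series([0, 1, 1], name="prediction")
sig = model_signature.infer_signature(input_data=inputs, output_data=outputs)
print(sig)
```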
snowflake/ml/model/_signatures/pytorch_handler.py CHANGED
@@ -72,10 +72,10 @@ class SeqOfPyTorchTensorHandler(base_handler.BaseDataHandler[Sequence["torch.Tensor"]]):
             dtype = core.DataType.from_torch_type(data_col.dtype)
             ft_name = f"{role_prefix}{feature_prefix}{i}"
             if len(data_col.shape) == 1:
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, nullable=False))
             else:
                 ft_shape = tuple(data_col.shape[1:])
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape, nullable=False))
         return features
 
     @staticmethod
snowflake/ml/model/_signatures/snowpark_handler.py CHANGED
@@ -82,7 +82,8 @@ class SnowparkDataFrameHandler(base_handler.BaseDataHandler[snowflake.snowpark.DataFrame]):
                 identifier.get_unescaped_names(field.name)
             ].map(json.loads)
         # Only when the feature is not from inference, we are confident to do the type casting.
-        # Otherwise, dtype_map will be empty
+        # Otherwise, dtype_map will be empty.
+        # Errors are ignored to make sure None won't be converted and won't raise Error
         df_local = df_local.astype(dtype=dtype_map)
         return df_local
 
snowflake/ml/model/_signatures/tensorflow_handler.py CHANGED
@@ -109,10 +109,10 @@ class SeqOfTensorflowTensorHandler(
             dtype = core.DataType.from_numpy_type(data_col.dtype.as_numpy_dtype)
             ft_name = f"{role_prefix}{feature_prefix}{i}"
             if len(data_col.shape) == 1:
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, nullable=False))
             else:
                 ft_shape = tuple(data_col.shape[1:])
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape, nullable=False))
         return features
 
     @staticmethod
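Both tensor handlers pin `nullable=False`, which tracks a real constraint: numpy-backed integer tensors have no missing-value slot, unlike pandas extension arrays. A short demonstration of why:

```python
import numpy as np

arr = np.array([1, 2, 3], dtype=np.int64)
try:
    arr[0] = None  # no NA representation in an integer ndarray
except TypeError as e:
    print(f"TypeError: {e}")
```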
snowflake/ml/model/_signatures/utils.py CHANGED
@@ -118,7 +118,6 @@ def huggingface_pipeline_signature_auto_infer(task: str, params: Dict[str, Any])
         category=DeprecationWarning,
         stacklevel=1,
     )
-
     return core.ModelSignature(
         inputs=[
             core.FeatureSpec(name="user_inputs", dtype=core.DataType.STRING, shape=(-1,)),
@@ -297,3 +296,7 @@ def huggingface_pipeline_signature_auto_infer(task: str, params: Dict[str, Any])
         )
 
     return None
+
+
+def series_dropna(series: pd.Series) -> pd.Series:
+    return series.dropna(inplace=False).reset_index(drop=True).convert_dtypes()
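The new `series_dropna` helper both drops nulls and normalizes the dtype via `convert_dtypes()`, so downstream checks see a clean extension dtype. Its behavior on a float column with an integral payload:

```python
import pandas as pd

s = pd.Series([1.0, None, 3.0])  # float64 because of the null
cleaned = s.dropna(inplace=False).reset_index(drop=True).convert_dtypes()
print(cleaned.dtype)  # Int64 -- convert_dtypes() picks the nullable integer dtype
print(list(cleaned))  # [1, 3]
```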
snowflake/ml/model/model_signature.py CHANGED
@@ -139,9 +139,32 @@ def _rename_signature_with_snowflake_identifiers(
     return signature
 
 
-def _validate_numpy_array(
-    arr: model_types._SupportedNumpyArray, feature_type: core.DataType, strict: bool = False
+def _validate_array_or_series_type(
+    arr: Union[model_types._SupportedNumpyArray, pd.Series], feature_type: core.DataType, strict: bool = False
 ) -> bool:
+    original_dtype = arr.dtype
+    dtype = arr.dtype
+    if isinstance(
+        dtype,
+        (
+            pd.Int8Dtype,
+            pd.Int16Dtype,
+            pd.Int32Dtype,
+            pd.Int64Dtype,
+            pd.UInt8Dtype,
+            pd.UInt16Dtype,
+            pd.UInt32Dtype,
+            pd.UInt64Dtype,
+            pd.Float32Dtype,
+            pd.Float64Dtype,
+            pd.BooleanDtype,
+        ),
+    ):
+        dtype = dtype.type
+    elif isinstance(dtype, pd.CategoricalDtype):
+        dtype = dtype.categories.dtype
+    elif isinstance(dtype, pd.StringDtype):
+        dtype = np.str_
     if feature_type in [
         core.DataType.INT8,
         core.DataType.INT16,
@@ -152,14 +175,17 @@ def _validate_numpy_array(
         core.DataType.UINT32,
         core.DataType.UINT64,
     ]:
-        if not (np.issubdtype(
+        if not (np.issubdtype(dtype, np.integer)):
             return False
         if not strict:
             return True
-
+        if isinstance(original_dtype, pd.CategoricalDtype):
+            min_v, max_v = arr.cat.as_ordered().min(), arr.cat.as_ordered().min()  # type: ignore[union-attr]
+        else:
+            min_v, max_v = arr.min(), arr.max()
         return bool(max_v <= np.iinfo(feature_type._numpy_type).max and min_v >= np.iinfo(feature_type._numpy_type).min)
     elif feature_type in [core.DataType.FLOAT, core.DataType.DOUBLE]:
-        if not (np.issubdtype(
+        if not (np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.floating)):
             return False
         if not strict:
             return True
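The strict-mode branch now computes min/max on either a numpy array or a pandas Series and checks them against the declared type's bounds. The core check, isolated:

```python
import numpy as np

vals = np.array([0, 255], dtype=np.int64)
fits_int8 = bool(vals.max() <= np.iinfo(np.int8).max and vals.min() >= np.iinfo(np.int8).min)
print(fits_int8)  # False -- 255 exceeds int8's maximum of 127
```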
@@ -171,7 +197,7 @@ def _validate_numpy_array(
     elif feature_type in [core.DataType.TIMESTAMP_NTZ]:
         return np.issubdtype(arr.dtype, np.datetime64)
     else:
-        return np.can_cast(
+        return np.can_cast(dtype, feature_type._numpy_type, casting="no")
 
 
 def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
@@ -204,7 +230,10 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
                 original_exception=ValueError(f"Data Validation Error: feature {ft_name} does not exist in data."),
             )
 
+        if data_col.isnull().any():
+            data_col = utils.series_dropna(data_col)
         df_col_dtype = data_col.dtype
+
         if isinstance(feature, core.FeatureGroupSpec):
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.NOT_IMPLEMENTED,
@@ -217,7 +246,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
         if isinstance(df_col_dtype, pd.CategoricalDtype):
             df_col_dtype = df_col_dtype.categories.dtype
         if df_col_dtype != np.dtype("O"):
-            if not
+            if not _validate_array_or_series_type(data_col, ft_type, strict=strict):
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.INVALID_DATA,
                     original_exception=ValueError(
@@ -247,7 +276,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
                 converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data_col]
 
                 if not all(
-
+                    _validate_array_or_series_type(converted_data, ft_type, strict=strict)
                     for converted_data in converted_data_list
                 ):
                     raise snowml_exceptions.SnowflakeMLException(
@@ -278,7 +307,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
                     ),
                 )
 
-                if not all(
+                if not all(_validate_array_or_series_type(data_row, ft_type, strict=strict) for data_row in data_col):
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
                         original_exception=ValueError(
snowflake/ml/model/type_hints.py CHANGED

snowflake/ml/modeling/lightgbm/lgbm_classifier.py CHANGED
@@ -130,8 +130,7 @@ class LGBMClassifier(BaseTransformer):
         reg_alpha=0.0,
         reg_lambda=0.0,
         random_state=None,
-        n_jobs
-        silent="warn",
+        n_jobs=None,
         importance_type="split",
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -172,8 +171,7 @@ class LGBMClassifier(BaseTransformer):
             'reg_alpha':(reg_alpha, 0.0, False),
             'reg_lambda':(reg_lambda, 0.0, False),
             'random_state':(random_state, None, False),
-            'n_jobs':(n_jobs,
-            'silent':(silent, "warn", False),
+            'n_jobs':(n_jobs, None, False),
             'importance_type':(importance_type, "split", False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
snowflake/ml/modeling/lightgbm/lgbm_regressor.py CHANGED
@@ -130,8 +130,7 @@ class LGBMRegressor(BaseTransformer):
         reg_alpha=0.0,
         reg_lambda=0.0,
         random_state=None,
-        n_jobs
-        silent="warn",
+        n_jobs=None,
         importance_type="split",
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -172,8 +171,7 @@ class LGBMRegressor(BaseTransformer):
             'reg_alpha':(reg_alpha, 0.0, False),
             'reg_lambda':(reg_lambda, 0.0, False),
             'random_state':(random_state, None, False),
-            'n_jobs':(n_jobs,
-            'silent':(silent, "warn", False),
+            'n_jobs':(n_jobs, None, False),
             'importance_type':(importance_type, "split", False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
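The dropped `silent` keyword mirrors upstream LightGBM, which removed `silent` in its 4.x releases, and `n_jobs` likewise now defaults to `None` as in recent LightGBM. A hypothetical construction after this change (column names are placeholders):

```python
from snowflake.ml.modeling.lightgbm import LGBMClassifier

clf = LGBMClassifier(
    n_jobs=None,                    # new default; `silent=` would now be rejected
    input_cols=["FEAT1", "FEAT2"],  # placeholder column names
    label_cols=["TARGET"],
    output_cols=["PREDICTION"],
)
```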