snowflake-ml-python 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/telemetry.py +4 -2
- snowflake/ml/_internal/utils/import_utils.py +31 -0
- snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +13 -0
- snowflake/ml/data/_internal/arrow_ingestor.py +8 -0
- snowflake/ml/data/data_connector.py +1 -1
- snowflake/ml/data/torch_utils.py +33 -14
- snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +5 -3
- snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +7 -5
- snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +4 -2
- snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +3 -1
- snowflake/ml/feature_store/examples/example_helper.py +6 -3
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +4 -2
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +4 -2
- snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +3 -1
- snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +3 -1
- snowflake/ml/feature_store/feature_store.py +1 -2
- snowflake/ml/feature_store/feature_view.py +5 -1
- snowflake/ml/model/_client/model/model_version_impl.py +144 -10
- snowflake/ml/model/_client/ops/model_ops.py +25 -6
- snowflake/ml/model/_client/ops/service_ops.py +33 -28
- snowflake/ml/model/_client/service/model_deployment_spec.py +19 -8
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +3 -1
- snowflake/ml/model/_client/sql/model.py +14 -0
- snowflake/ml/model/_client/sql/service.py +6 -18
- snowflake/ml/model/_model_composer/model_composer.py +2 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
- snowflake/ml/model/_model_composer/model_method/model_method.py +1 -1
- snowflake/ml/model/_packager/model_handlers/_utils.py +5 -1
- snowflake/ml/model/_packager/model_handlers/catboost.py +3 -6
- snowflake/ml/model/_packager/model_handlers/custom.py +2 -0
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +10 -1
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +3 -6
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -1
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -6
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +7 -65
- snowflake/ml/model/_packager/model_handlers/xgboost.py +10 -40
- snowflake/ml/model/_packager/model_packager.py +0 -11
- snowflake/ml/model/_packager/{model_handlers/model_objective_utils.py → model_task/model_task_utils.py} +13 -25
- snowflake/ml/model/_signatures/pandas_handler.py +16 -0
- snowflake/ml/model/custom_model.py +47 -7
- snowflake/ml/model/model_signature.py +2 -0
- snowflake/ml/model/type_hints.py +8 -0
- snowflake/ml/modeling/_internal/estimator_utils.py +13 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +7 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +16 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -3
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +1 -8
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +17 -19
- snowflake/ml/modeling/cluster/dbscan.py +5 -2
- snowflake/ml/modeling/cluster/feature_agglomeration.py +7 -19
- snowflake/ml/modeling/cluster/k_means.py +14 -19
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +3 -3
- snowflake/ml/modeling/cluster/optics.py +6 -6
- snowflake/ml/modeling/cluster/spectral_clustering.py +4 -3
- snowflake/ml/modeling/compose/column_transformer.py +15 -5
- snowflake/ml/modeling/compose/transformed_target_regressor.py +7 -6
- snowflake/ml/modeling/covariance/elliptic_envelope.py +1 -1
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +1 -1
- snowflake/ml/modeling/covariance/min_cov_det.py +2 -2
- snowflake/ml/modeling/covariance/oas.py +1 -1
- snowflake/ml/modeling/decomposition/kernel_pca.py +2 -2
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +5 -12
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +5 -12
- snowflake/ml/modeling/decomposition/pca.py +28 -15
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -0
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +1 -12
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +1 -11
- snowflake/ml/modeling/ensemble/bagging_classifier.py +1 -8
- snowflake/ml/modeling/ensemble/bagging_regressor.py +1 -8
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +21 -2
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +18 -2
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +2 -0
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +2 -0
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +21 -8
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +21 -11
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +21 -2
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +18 -2
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +2 -1
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +5 -3
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +2 -2
- snowflake/ml/modeling/linear_model/ard_regression.py +5 -10
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +5 -11
- snowflake/ml/modeling/linear_model/elastic_net.py +3 -0
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +1 -1
- snowflake/ml/modeling/linear_model/lars.py +0 -10
- snowflake/ml/modeling/linear_model/lars_cv.py +1 -11
- snowflake/ml/modeling/linear_model/lasso_cv.py +1 -1
- snowflake/ml/modeling/linear_model/lasso_lars.py +0 -10
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +1 -11
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +0 -10
- snowflake/ml/modeling/linear_model/logistic_regression.py +28 -22
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +30 -24
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +1 -1
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +1 -1
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +4 -13
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +4 -4
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +1 -1
- snowflake/ml/modeling/linear_model/perceptron.py +3 -3
- snowflake/ml/modeling/linear_model/ransac_regressor.py +3 -2
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +14 -6
- snowflake/ml/modeling/linear_model/ridge_cv.py +17 -11
- snowflake/ml/modeling/linear_model/sgd_classifier.py +2 -2
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +5 -1
- snowflake/ml/modeling/linear_model/sgd_regressor.py +12 -3
- snowflake/ml/modeling/manifold/isomap.py +1 -1
- snowflake/ml/modeling/manifold/mds.py +3 -3
- snowflake/ml/modeling/manifold/tsne.py +10 -4
- snowflake/ml/modeling/metrics/classification.py +12 -16
- snowflake/ml/modeling/metrics/ranking.py +3 -3
- snowflake/ml/modeling/metrics/regression.py +3 -3
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +3 -3
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +3 -3
- snowflake/ml/modeling/naive_bayes/complement_nb.py +3 -3
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +3 -3
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +10 -4
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +5 -2
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +2 -2
- snowflake/ml/modeling/neighbors/nearest_centroid.py +7 -14
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +1 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +1 -1
- snowflake/ml/modeling/neural_network/mlp_classifier.py +7 -1
- snowflake/ml/modeling/neural_network/mlp_regressor.py +3 -0
- snowflake/ml/modeling/pipeline/pipeline.py +16 -14
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +8 -4
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +9 -7
- snowflake/ml/modeling/svm/linear_svc.py +25 -16
- snowflake/ml/modeling/svm/linear_svr.py +23 -17
- snowflake/ml/modeling/svm/nu_svc.py +5 -3
- snowflake/ml/modeling/svm/nu_svr.py +3 -1
- snowflake/ml/modeling/svm/svc.py +9 -5
- snowflake/ml/modeling/svm/svr.py +3 -1
- snowflake/ml/modeling/tree/decision_tree_classifier.py +21 -2
- snowflake/ml/modeling/tree/decision_tree_regressor.py +18 -2
- snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -9
- snowflake/ml/modeling/tree/extra_tree_regressor.py +18 -2
- snowflake/ml/monitoring/_client/{monitor_sql_client.py → model_monitor_sql_client.py} +1 -1
- snowflake/ml/monitoring/{_client → _manager}/model_monitor_manager.py +9 -8
- snowflake/ml/monitoring/{_client/model_monitor.py → model_monitor.py} +3 -3
- snowflake/ml/registry/_manager/model_manager.py +15 -1
- snowflake/ml/registry/registry.py +15 -8
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/METADATA +81 -9
- {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/RECORD +150 -150
- {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/WHEEL +1 -1
- /snowflake/ml/monitoring/{_client/model_monitor_version.py → model_monitor_version.py} +0 -0
- {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/top_level.txt +0 -0
@@ -165,7 +165,7 @@ class Isomap(BaseTransformer):
|
|
165
165
|
If metric is "precomputed", X is assumed to be a distance matrix and
|
166
166
|
must be square. X may be a :term:`Glossary <sparse graph>`.
|
167
167
|
|
168
|
-
p:
|
168
|
+
p: float, default=2
|
169
169
|
Parameter for the Minkowski metric from
|
170
170
|
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
171
171
|
equivalent to using manhattan_distance (l1), and euclidean_distance
|
@@ -157,7 +157,7 @@ class MDS(BaseTransformer):
|
|
157
157
|
Pre-computed dissimilarities are passed directly to ``fit`` and
|
158
158
|
``fit_transform``.
|
159
159
|
|
160
|
-
normalized_stress: bool or "auto" default=
|
160
|
+
normalized_stress: bool or "auto" default="auto"
|
161
161
|
Whether use and return normed stress value (Stress-1) instead of raw
|
162
162
|
stress calculated by default. Only supported in non-metric MDS.
|
163
163
|
"""
|
@@ -174,7 +174,7 @@ class MDS(BaseTransformer):
|
|
174
174
|
n_jobs=None,
|
175
175
|
random_state=None,
|
176
176
|
dissimilarity="euclidean",
|
177
|
-
normalized_stress="
|
177
|
+
normalized_stress="auto",
|
178
178
|
input_cols: Optional[Union[str, Iterable[str]]] = None,
|
179
179
|
output_cols: Optional[Union[str, Iterable[str]]] = None,
|
180
180
|
label_cols: Optional[Union[str, Iterable[str]]] = None,
|
@@ -205,7 +205,7 @@ class MDS(BaseTransformer):
|
|
205
205
|
'n_jobs':(n_jobs, None, False),
|
206
206
|
'random_state':(random_state, None, False),
|
207
207
|
'dissimilarity':(dissimilarity, "euclidean", False),
|
208
|
-
'normalized_stress':(normalized_stress, "
|
208
|
+
'normalized_stress':(normalized_stress, "auto", False),}
|
209
209
|
cleaned_up_init_args = validate_sklearn_args(
|
210
210
|
args=init_args,
|
211
211
|
klass=sklearn.manifold.MDS
|
@@ -143,7 +143,7 @@ class TSNE(BaseTransformer):
|
|
143
143
|
to `max(N / early_exaggeration / 4, 50)` where N is the sample size,
|
144
144
|
following [4] and [5].
|
145
145
|
|
146
|
-
|
146
|
+
max_iter: int, default=1000
|
147
147
|
Maximum number of iterations for the optimization. Should be at
|
148
148
|
least 250.
|
149
149
|
|
@@ -211,6 +211,10 @@ class TSNE(BaseTransformer):
|
|
211
211
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
212
212
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
213
213
|
for more details.
|
214
|
+
|
215
|
+
n_iter: int
|
216
|
+
Maximum number of iterations for the optimization. Should be at
|
217
|
+
least 250.
|
214
218
|
"""
|
215
219
|
|
216
220
|
def __init__( # type: ignore[no-untyped-def]
|
@@ -220,7 +224,7 @@ class TSNE(BaseTransformer):
|
|
220
224
|
perplexity=30.0,
|
221
225
|
early_exaggeration=12.0,
|
222
226
|
learning_rate="auto",
|
223
|
-
|
227
|
+
max_iter=None,
|
224
228
|
n_iter_without_progress=300,
|
225
229
|
min_grad_norm=1e-07,
|
226
230
|
metric="euclidean",
|
@@ -231,6 +235,7 @@ class TSNE(BaseTransformer):
|
|
231
235
|
method="barnes_hut",
|
232
236
|
angle=0.5,
|
233
237
|
n_jobs=None,
|
238
|
+
n_iter="deprecated",
|
234
239
|
input_cols: Optional[Union[str, Iterable[str]]] = None,
|
235
240
|
output_cols: Optional[Union[str, Iterable[str]]] = None,
|
236
241
|
label_cols: Optional[Union[str, Iterable[str]]] = None,
|
@@ -256,7 +261,7 @@ class TSNE(BaseTransformer):
|
|
256
261
|
'perplexity':(perplexity, 30.0, False),
|
257
262
|
'early_exaggeration':(early_exaggeration, 12.0, False),
|
258
263
|
'learning_rate':(learning_rate, "auto", False),
|
259
|
-
'
|
264
|
+
'max_iter':(max_iter, None, False),
|
260
265
|
'n_iter_without_progress':(n_iter_without_progress, 300, False),
|
261
266
|
'min_grad_norm':(min_grad_norm, 1e-07, False),
|
262
267
|
'metric':(metric, "euclidean", False),
|
@@ -266,7 +271,8 @@ class TSNE(BaseTransformer):
|
|
266
271
|
'random_state':(random_state, None, False),
|
267
272
|
'method':(method, "barnes_hut", False),
|
268
273
|
'angle':(angle, 0.5, False),
|
269
|
-
'n_jobs':(n_jobs, None, False),
|
274
|
+
'n_jobs':(n_jobs, None, False),
|
275
|
+
'n_iter':(n_iter, "deprecated", False),}
|
270
276
|
cleaned_up_init_args = validate_sklearn_args(
|
271
277
|
args=init_args,
|
272
278
|
klass=sklearn.manifold.TSNE
|
@@ -300,7 +300,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
|
|
300
300
|
]
|
301
301
|
),
|
302
302
|
input_types=[T.ArrayType(), T.IntegerType()],
|
303
|
-
packages=["numpy", "cloudpickle"],
|
303
|
+
packages=[f"numpy=={np.__version__}", f"cloudpickle=={cloudpickle.__version__}"],
|
304
304
|
name=confusion_matrix_computer,
|
305
305
|
is_permanent=False,
|
306
306
|
replace=True,
|
@@ -535,9 +535,8 @@ def log_loss(
|
|
535
535
|
assumed to be that of the positive class. The labels in ``y_pred``
|
536
536
|
are assumed to be ordered alphabetically, as done by `LabelBinarizer`.
|
537
537
|
eps: float or "auto", default="auto"
|
538
|
-
|
539
|
-
|
540
|
-
data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`.
|
538
|
+
Deprecated: if specified, it will be ignored and a warning emitted. Retained
|
539
|
+
for backward compatibility.
|
541
540
|
normalize: boolean, default=True
|
542
541
|
If true, return the mean loss per sample.
|
543
542
|
Otherwise, return the sum of the per-sample losses.
|
@@ -557,8 +556,11 @@ def log_loss(
|
|
557
556
|
y_true = y_true_col_names if isinstance(y_true_col_names, list) else [y_true_col_names]
|
558
557
|
y_pred = y_pred_col_names if isinstance(y_pred_col_names, list) else [y_pred_col_names]
|
559
558
|
|
559
|
+
if eps != "auto":
|
560
|
+
warnings.warn("log_loss eps argument is deprecated and will be ignored.", DeprecationWarning, stacklevel=2)
|
561
|
+
|
560
562
|
# If it is binary classification, use SQL because it is faster.
|
561
|
-
if len(y_pred) == 1
|
563
|
+
if len(y_pred) == 1:
|
562
564
|
metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
|
563
565
|
eps = float(np.finfo(float).eps)
|
564
566
|
y_true_col = y_true[0]
|
@@ -592,7 +594,6 @@ def log_loss(
|
|
592
594
|
log_loss_computer = _register_log_loss_computer(
|
593
595
|
session=session,
|
594
596
|
statement_params=statement_params,
|
595
|
-
eps=eps,
|
596
597
|
labels=labels,
|
597
598
|
)
|
598
599
|
log_loss_computer_udtf = F.table_function(log_loss_computer)
|
@@ -625,7 +626,6 @@ def _register_log_loss_computer(
|
|
625
626
|
*,
|
626
627
|
session: snowpark.Session,
|
627
628
|
statement_params: Dict[str, Any],
|
628
|
-
eps: Union[float, str] = "auto",
|
629
629
|
labels: Optional[npt.ArrayLike] = None,
|
630
630
|
) -> str:
|
631
631
|
"""Registers log loss computation UDTF in Snowflake and returns the name of the UDTF.
|
@@ -633,10 +633,6 @@ def _register_log_loss_computer(
|
|
633
633
|
Args:
|
634
634
|
session: Snowpark session.
|
635
635
|
statement_params: Dictionary used for tagging queries for tracking purposes.
|
636
|
-
eps: float or "auto", default="auto"
|
637
|
-
Log loss is undefined for p=0 or p=1, so probabilities are
|
638
|
-
clipped to `max(eps, min(1 - eps, p))`. The default will depend on the
|
639
|
-
data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`.
|
640
636
|
labels: If not provided, labels will be inferred from y_true. If ``labels``
|
641
637
|
is ``None`` and ``y_pred`` has shape (n_samples,) the labels are
|
642
638
|
assumed to be binary and are inferred from ``y_true``.
|
@@ -647,7 +643,6 @@ def _register_log_loss_computer(
|
|
647
643
|
|
648
644
|
class LogLossComputer:
|
649
645
|
def __init__(self) -> None:
|
650
|
-
self._eps = eps
|
651
646
|
self._labels = labels
|
652
647
|
self._y_true: List[List[int]] = []
|
653
648
|
self._y_pred: List[List[float]] = []
|
@@ -662,7 +657,6 @@ def _register_log_loss_computer(
|
|
662
657
|
res = metrics.log_loss(
|
663
658
|
self._y_true,
|
664
659
|
self._y_pred,
|
665
|
-
eps=self._eps,
|
666
660
|
normalize=False,
|
667
661
|
sample_weight=self._sample_weight,
|
668
662
|
labels=self._labels,
|
@@ -670,6 +664,7 @@ def _register_log_loss_computer(
|
|
670
664
|
yield (float(res),)
|
671
665
|
|
672
666
|
log_loss_computer = random_name_for_temp_object(TempObjectType.TABLE_FUNCTION)
|
667
|
+
sklearn_release = version.parse(sklearn.__version__).release
|
673
668
|
session.udtf.register(
|
674
669
|
LogLossComputer,
|
675
670
|
output_schema=T.StructType(
|
@@ -677,7 +672,7 @@ def _register_log_loss_computer(
|
|
677
672
|
T.StructField("log_loss", T.FloatType()),
|
678
673
|
]
|
679
674
|
),
|
680
|
-
packages=["scikit-learn
|
675
|
+
packages=[f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*"],
|
681
676
|
name=log_loss_computer,
|
682
677
|
is_permanent=False,
|
683
678
|
replace=True,
|
@@ -814,7 +809,7 @@ def precision_recall_fscore_support(
|
|
814
809
|
name=sproc_name,
|
815
810
|
replace=True,
|
816
811
|
packages=[
|
817
|
-
"cloudpickle",
|
812
|
+
f"cloudpickle=={cloudpickle.__version__}",
|
818
813
|
f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
|
819
814
|
"snowflake-snowpark-python",
|
820
815
|
],
|
@@ -1071,6 +1066,7 @@ def _register_multilabel_confusion_matrix_computer(
|
|
1071
1066
|
yield (tp_sum, pred_sum, true_sum)
|
1072
1067
|
|
1073
1068
|
multilabel_confusion_matrix_computer = random_name_for_temp_object(TempObjectType.TABLE_FUNCTION)
|
1069
|
+
sklearn_release = version.parse(sklearn.__version__).release
|
1074
1070
|
session.udtf.register(
|
1075
1071
|
MultilabelConfusionMatrixComputer,
|
1076
1072
|
output_schema=T.StructType(
|
@@ -1080,7 +1076,7 @@ def _register_multilabel_confusion_matrix_computer(
|
|
1080
1076
|
T.StructField("TRUE_SUM", T.ArrayType()),
|
1081
1077
|
]
|
1082
1078
|
),
|
1083
|
-
packages=["numpy", "scikit-learn
|
1079
|
+
packages=[f"numpy=={np.__version__}", f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*"],
|
1084
1080
|
name=multilabel_confusion_matrix_computer,
|
1085
1081
|
is_permanent=False,
|
1086
1082
|
replace=True,
|
@@ -96,7 +96,7 @@ def precision_recall_curve(
|
|
96
96
|
name=sproc_name,
|
97
97
|
replace=True,
|
98
98
|
packages=[
|
99
|
-
"cloudpickle",
|
99
|
+
f"cloudpickle=={cloudpickle.__version__}",
|
100
100
|
f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
|
101
101
|
"snowflake-snowpark-python",
|
102
102
|
],
|
@@ -243,7 +243,7 @@ def roc_auc_score(
|
|
243
243
|
name=sproc_name,
|
244
244
|
replace=True,
|
245
245
|
packages=[
|
246
|
-
"cloudpickle",
|
246
|
+
f"cloudpickle=={cloudpickle.__version__}",
|
247
247
|
f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
|
248
248
|
"snowflake-snowpark-python",
|
249
249
|
],
|
@@ -346,7 +346,7 @@ def roc_curve(
|
|
346
346
|
name=sproc_name,
|
347
347
|
replace=True,
|
348
348
|
packages=[
|
349
|
-
"cloudpickle",
|
349
|
+
f"cloudpickle=={cloudpickle.__version__}",
|
350
350
|
f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
|
351
351
|
"snowflake-snowpark-python",
|
352
352
|
],
|
@@ -81,7 +81,7 @@ def d2_absolute_error_score(
|
|
81
81
|
name=sproc_name,
|
82
82
|
replace=True,
|
83
83
|
packages=[
|
84
|
-
"cloudpickle",
|
84
|
+
f"cloudpickle=={cloudpickle.__version__}",
|
85
85
|
f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
|
86
86
|
"snowflake-snowpark-python",
|
87
87
|
],
|
@@ -178,7 +178,7 @@ def d2_pinball_score(
|
|
178
178
|
name=sproc_name,
|
179
179
|
replace=True,
|
180
180
|
packages=[
|
181
|
-
"cloudpickle",
|
181
|
+
f"cloudpickle=={cloudpickle.__version__}",
|
182
182
|
f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
|
183
183
|
"snowflake-snowpark-python",
|
184
184
|
],
|
@@ -293,7 +293,7 @@ def explained_variance_score(
|
|
293
293
|
name=sproc_name,
|
294
294
|
replace=True,
|
295
295
|
packages=[
|
296
|
-
"cloudpickle",
|
296
|
+
f"cloudpickle=={cloudpickle.__version__}",
|
297
297
|
f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
|
298
298
|
"snowflake-snowpark-python",
|
299
299
|
],
|
@@ -113,7 +113,7 @@ class BernoulliNB(BaseTransformer):
|
|
113
113
|
Additive (Laplace/Lidstone) smoothing parameter
|
114
114
|
(set alpha=0 and force_alpha=True, for no smoothing).
|
115
115
|
|
116
|
-
force_alpha: bool, default=
|
116
|
+
force_alpha: bool, default=True
|
117
117
|
If False and alpha is less than 1e-10, it will set alpha to
|
118
118
|
1e-10. If True, alpha will remain unchanged. This may cause
|
119
119
|
numerical errors if alpha is too close to 0.
|
@@ -135,7 +135,7 @@ class BernoulliNB(BaseTransformer):
|
|
135
135
|
self,
|
136
136
|
*,
|
137
137
|
alpha=1.0,
|
138
|
-
force_alpha=
|
138
|
+
force_alpha=True,
|
139
139
|
binarize=0.0,
|
140
140
|
fit_prior=True,
|
141
141
|
class_prior=None,
|
@@ -161,7 +161,7 @@ class BernoulliNB(BaseTransformer):
|
|
161
161
|
self._deps = list(deps)
|
162
162
|
|
163
163
|
init_args = {'alpha':(alpha, 1.0, False),
|
164
|
-
'force_alpha':(force_alpha,
|
164
|
+
'force_alpha':(force_alpha, True, False),
|
165
165
|
'binarize':(binarize, 0.0, False),
|
166
166
|
'fit_prior':(fit_prior, True, False),
|
167
167
|
'class_prior':(class_prior, None, False),}
|
@@ -113,7 +113,7 @@ class CategoricalNB(BaseTransformer):
|
|
113
113
|
Additive (Laplace/Lidstone) smoothing parameter
|
114
114
|
(set alpha=0 and force_alpha=True, for no smoothing).
|
115
115
|
|
116
|
-
force_alpha: bool, default=
|
116
|
+
force_alpha: bool, default=True
|
117
117
|
If False and alpha is less than 1e-10, it will set alpha to
|
118
118
|
1e-10. If True, alpha will remain unchanged. This may cause
|
119
119
|
numerical errors if alpha is too close to 0.
|
@@ -141,7 +141,7 @@ class CategoricalNB(BaseTransformer):
|
|
141
141
|
self,
|
142
142
|
*,
|
143
143
|
alpha=1.0,
|
144
|
-
force_alpha=
|
144
|
+
force_alpha=True,
|
145
145
|
fit_prior=True,
|
146
146
|
class_prior=None,
|
147
147
|
min_categories=None,
|
@@ -167,7 +167,7 @@ class CategoricalNB(BaseTransformer):
|
|
167
167
|
self._deps = list(deps)
|
168
168
|
|
169
169
|
init_args = {'alpha':(alpha, 1.0, False),
|
170
|
-
'force_alpha':(force_alpha,
|
170
|
+
'force_alpha':(force_alpha, True, False),
|
171
171
|
'fit_prior':(fit_prior, True, False),
|
172
172
|
'class_prior':(class_prior, None, False),
|
173
173
|
'min_categories':(min_categories, None, False),}
|
@@ -113,7 +113,7 @@ class ComplementNB(BaseTransformer):
|
|
113
113
|
Additive (Laplace/Lidstone) smoothing parameter
|
114
114
|
(set alpha=0 and force_alpha=True, for no smoothing).
|
115
115
|
|
116
|
-
force_alpha: bool, default=
|
116
|
+
force_alpha: bool, default=True
|
117
117
|
If False and alpha is less than 1e-10, it will set alpha to
|
118
118
|
1e-10. If True, alpha will remain unchanged. This may cause
|
119
119
|
numerical errors if alpha is too close to 0.
|
@@ -135,7 +135,7 @@ class ComplementNB(BaseTransformer):
|
|
135
135
|
self,
|
136
136
|
*,
|
137
137
|
alpha=1.0,
|
138
|
-
force_alpha=
|
138
|
+
force_alpha=True,
|
139
139
|
fit_prior=True,
|
140
140
|
class_prior=None,
|
141
141
|
norm=False,
|
@@ -161,7 +161,7 @@ class ComplementNB(BaseTransformer):
|
|
161
161
|
self._deps = list(deps)
|
162
162
|
|
163
163
|
init_args = {'alpha':(alpha, 1.0, False),
|
164
|
-
'force_alpha':(force_alpha,
|
164
|
+
'force_alpha':(force_alpha, True, False),
|
165
165
|
'fit_prior':(fit_prior, True, False),
|
166
166
|
'class_prior':(class_prior, None, False),
|
167
167
|
'norm':(norm, False, False),}
|
@@ -113,7 +113,7 @@ class MultinomialNB(BaseTransformer):
|
|
113
113
|
Additive (Laplace/Lidstone) smoothing parameter
|
114
114
|
(set alpha=0 and force_alpha=True, for no smoothing).
|
115
115
|
|
116
|
-
force_alpha: bool, default=
|
116
|
+
force_alpha: bool, default=True
|
117
117
|
If False and alpha is less than 1e-10, it will set alpha to
|
118
118
|
1e-10. If True, alpha will remain unchanged. This may cause
|
119
119
|
numerical errors if alpha is too close to 0.
|
@@ -131,7 +131,7 @@ class MultinomialNB(BaseTransformer):
|
|
131
131
|
self,
|
132
132
|
*,
|
133
133
|
alpha=1.0,
|
134
|
-
force_alpha=
|
134
|
+
force_alpha=True,
|
135
135
|
fit_prior=True,
|
136
136
|
class_prior=None,
|
137
137
|
input_cols: Optional[Union[str, Iterable[str]]] = None,
|
@@ -156,7 +156,7 @@ class MultinomialNB(BaseTransformer):
|
|
156
156
|
self._deps = list(deps)
|
157
157
|
|
158
158
|
init_args = {'alpha':(alpha, 1.0, False),
|
159
|
-
'force_alpha':(force_alpha,
|
159
|
+
'force_alpha':(force_alpha, True, False),
|
160
160
|
'fit_prior':(fit_prior, True, False),
|
161
161
|
'class_prior':(class_prior, None, False),}
|
162
162
|
cleaned_up_init_args = validate_sklearn_args(
|
@@ -124,6 +124,11 @@ class KNeighborsClassifier(BaseTransformer):
|
|
124
124
|
array of distances, and returns an array of the same shape
|
125
125
|
containing the weights.
|
126
126
|
|
127
|
+
Refer to the example entitled
|
128
|
+
:ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
|
129
|
+
showing the impact of the `weights` parameter on the decision
|
130
|
+
boundary.
|
131
|
+
|
127
132
|
algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
128
133
|
Algorithm used to compute the nearest neighbors:
|
129
134
|
|
@@ -142,10 +147,11 @@ class KNeighborsClassifier(BaseTransformer):
|
|
142
147
|
required to store the tree. The optimal value depends on the
|
143
148
|
nature of the problem.
|
144
149
|
|
145
|
-
p:
|
146
|
-
Power parameter for the Minkowski metric. When p = 1, this is
|
147
|
-
|
148
|
-
|
150
|
+
p: float, default=2
|
151
|
+
Power parameter for the Minkowski metric. When p = 1, this is equivalent
|
152
|
+
to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
|
153
|
+
For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
|
154
|
+
to be positive.
|
149
155
|
|
150
156
|
metric: str or callable, default='minkowski'
|
151
157
|
Metric to use for distance computation. Default is "minkowski", which
|
@@ -144,12 +144,12 @@ class KNeighborsRegressor(BaseTransformer):
|
|
144
144
|
required to store the tree. The optimal value depends on the
|
145
145
|
nature of the problem.
|
146
146
|
|
147
|
-
p:
|
147
|
+
p: float, default=2
|
148
148
|
Power parameter for the Minkowski metric. When p = 1, this is
|
149
149
|
equivalent to using manhattan_distance (l1), and euclidean_distance
|
150
150
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
151
151
|
|
152
|
-
metric: str or callable, default='minkowski'
|
152
|
+
metric: str, DistanceMetric object or callable, default='minkowski'
|
153
153
|
Metric to use for distance computation. Default is "minkowski", which
|
154
154
|
results in the standard Euclidean distance when p = 2. See the
|
155
155
|
documentation of `scipy.spatial.distance
|
@@ -167,6 +167,9 @@ class KNeighborsRegressor(BaseTransformer):
|
|
167
167
|
between those vectors. This works for Scipy's metrics, but is less
|
168
168
|
efficient than passing the metric name as a string.
|
169
169
|
|
170
|
+
If metric is a DistanceMetric object, it will be passed directly to
|
171
|
+
the underlying computation routines.
|
172
|
+
|
170
173
|
metric_params: dict, default=None
|
171
174
|
Additional keyword arguments for the metric function.
|
172
175
|
|
@@ -150,9 +150,9 @@ class LocalOutlierFactor(BaseTransformer):
|
|
150
150
|
between those vectors. This works for Scipy's metrics, but is less
|
151
151
|
efficient than passing the metric name as a string.
|
152
152
|
|
153
|
-
p:
|
153
|
+
p: float, default=2
|
154
154
|
Parameter for the Minkowski metric from
|
155
|
-
:func:`sklearn.metrics.
|
155
|
+
:func:`sklearn.metrics.pairwise_distances`. When p = 1, this
|
156
156
|
is equivalent to using manhattan_distance (l1), and euclidean_distance
|
157
157
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
158
158
|
|
@@ -109,20 +109,13 @@ class NearestCentroid(BaseTransformer):
|
|
109
109
|
drop_input_cols: Optional[bool], default=False
|
110
110
|
If set, the response of predict(), transform() methods will not contain input columns.
|
111
111
|
|
112
|
-
metric:
|
113
|
-
Metric to use for distance computation.
|
114
|
-
|
115
|
-
|
116
|
-
the
|
117
|
-
|
118
|
-
|
119
|
-
supported.
|
120
|
-
|
121
|
-
The centroids for the samples corresponding to each class is
|
122
|
-
the point from which the sum of the distances (according to the metric)
|
123
|
-
of all samples that belong to that particular class are minimized.
|
124
|
-
If the `"manhattan"` metric is provided, this centroid is the median
|
125
|
-
and for all other metrics, the centroid is now set to be the mean.
|
112
|
+
metric: {"euclidean", "manhattan"}, default="euclidean"
|
113
|
+
Metric to use for distance computation.
|
114
|
+
|
115
|
+
If `metric="euclidean"`, the centroid for the samples corresponding to each
|
116
|
+
class is the arithmetic mean, which minimizes the sum of squared L1 distances.
|
117
|
+
If `metric="manhattan"`, the centroid is the feature-wise median, which
|
118
|
+
minimizes the sum of L1 distances.
|
126
119
|
|
127
120
|
shrink_threshold: float, default=None
|
128
121
|
Threshold for shrinking centroids to remove features.
|
@@ -152,7 +152,7 @@ class NearestNeighbors(BaseTransformer):
|
|
152
152
|
between those vectors. This works for Scipy's metrics, but is less
|
153
153
|
efficient than passing the metric name as a string.
|
154
154
|
|
155
|
-
p: float, default=2
|
155
|
+
p: float (positive), default=2
|
156
156
|
Parameter for the Minkowski metric from
|
157
157
|
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
158
158
|
equivalent to using manhattan_distance (l1), and euclidean_distance
|
@@ -145,10 +145,11 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
145
145
|
required to store the tree. The optimal value depends on the
|
146
146
|
nature of the problem.
|
147
147
|
|
148
|
-
p:
|
148
|
+
p: float, default=2
|
149
149
|
Power parameter for the Minkowski metric. When p = 1, this is
|
150
150
|
equivalent to using manhattan_distance (l1), and euclidean_distance
|
151
151
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
152
|
+
This parameter is expected to be positive.
|
152
153
|
|
153
154
|
metric: str or callable, default='minkowski'
|
154
155
|
Metric to use for distance computation. Default is "minkowski", which
|
@@ -176,6 +177,10 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
176
177
|
- 'most_frequent': assign the most frequent label of y to outliers.
|
177
178
|
- None: when any outlier is detected, ValueError will be raised.
|
178
179
|
|
180
|
+
The outlier label should be selected from among the unique 'Y' labels.
|
181
|
+
If it is specified with a different value, a warning will be raised and
|
182
|
+
all class probabilities of outliers will be assigned to be 0.
|
183
|
+
|
179
184
|
metric_params: dict, default=None
|
180
185
|
Additional keyword arguments for the metric function.
|
181
186
|
|
@@ -145,7 +145,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
145
145
|
required to store the tree. The optimal value depends on the
|
146
146
|
nature of the problem.
|
147
147
|
|
148
|
-
p:
|
148
|
+
p: float, default=2
|
149
149
|
Power parameter for the Minkowski metric. When p = 1, this is
|
150
150
|
equivalent to using manhattan_distance (l1), and euclidean_distance
|
151
151
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
@@ -138,6 +138,9 @@ class MLPClassifier(BaseTransformer):
|
|
138
138
|
- 'adam' refers to a stochastic gradient-based optimizer proposed
|
139
139
|
by Kingma, Diederik, and Jimmy Ba
|
140
140
|
|
141
|
+
For a comparison between Adam optimizer and SGD, see
|
142
|
+
:ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`.
|
143
|
+
|
141
144
|
Note: The default solver 'adam' works pretty well on relatively
|
142
145
|
large datasets (with thousands of training samples or more) in terms of
|
143
146
|
both training time and validation score.
|
@@ -148,6 +151,9 @@ class MLPClassifier(BaseTransformer):
|
|
148
151
|
Strength of the L2 regularization term. The L2 regularization term
|
149
152
|
is divided by the sample size when added to the loss.
|
150
153
|
|
154
|
+
For an example usage and visualization of varying regularization, see
|
155
|
+
:ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py`.
|
156
|
+
|
151
157
|
batch_size: int, default='auto'
|
152
158
|
Size of minibatches for stochastic optimizers.
|
153
159
|
If the solver is 'lbfgs', the classifier will not use minibatch.
|
@@ -224,7 +230,7 @@ class MLPClassifier(BaseTransformer):
|
|
224
230
|
Whether to use early stopping to terminate training when validation
|
225
231
|
score is not improving. If set to true, it will automatically set
|
226
232
|
aside 10% of training data as validation and terminate training when
|
227
|
-
validation score is not improving by at least tol for
|
233
|
+
validation score is not improving by at least ``tol`` for
|
228
234
|
``n_iter_no_change`` consecutive epochs. The split is stratified,
|
229
235
|
except in a multilabel setting.
|
230
236
|
If early stopping is False, then the training stops when the training
|
@@ -138,6 +138,9 @@ class MLPRegressor(BaseTransformer):
|
|
138
138
|
- 'adam' refers to a stochastic gradient-based optimizer proposed by
|
139
139
|
Kingma, Diederik, and Jimmy Ba
|
140
140
|
|
141
|
+
For a comparison between Adam optimizer and SGD, see
|
142
|
+
:ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`.
|
143
|
+
|
141
144
|
Note: The default solver 'adam' works pretty well on relatively
|
142
145
|
large datasets (with thousands of training samples or more) in terms of
|
143
146
|
both training time and validation score.
|
@@ -863,21 +863,23 @@ class Pipeline(base.BaseTransformer):
|
|
863
863
|
ct.sparse_output_ = False
|
864
864
|
|
865
865
|
# ColumnTransformer internally replaces the "passthrough" string in the "remainder" step with a
|
866
|
-
# fitted FunctionTransformer
|
867
|
-
#
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
866
|
+
# fitted FunctionTransformer during the fit() call. So we need to manually replace the "passthrough"
|
867
|
+
# string with a fitted FunctionTransformer
|
868
|
+
for i, (step, transform, indices) in enumerate(ct.transformers_):
|
869
|
+
if transform == "passthrough":
|
870
|
+
ft = FunctionTransformer(
|
871
|
+
accept_sparse=True,
|
872
|
+
check_inverse=False,
|
873
|
+
feature_names_out="one-to-one",
|
874
|
+
)
|
875
|
+
if step == "remainder":
|
876
|
+
ft.feature_names_in_ = remaining
|
877
|
+
ft.n_features_in_ = len(remaining)
|
878
|
+
else:
|
879
|
+
ft.feature_names_in_ = self._feature_names_in[step_index_in_pipeline]
|
880
|
+
ft.n_features_in_ = self._n_features_in[step_index_in_pipeline]
|
881
|
+
ct.transformers_[i] = (step, ft, indices)
|
874
882
|
|
875
|
-
if remainder_action == "passthrough":
|
876
|
-
ft.n_features_in_ = len(remaining)
|
877
|
-
ct._name_to_fitted_passthrough = {"remainder": ft}
|
878
|
-
elif step_transformer_obj == "passthrough":
|
879
|
-
ft.n_features_in_ = self._n_features_in[step_index_in_pipeline]
|
880
|
-
ct._name_to_fitted_passthrough = {step_name_in_ct: ft}
|
881
883
|
return ct
|
882
884
|
|
883
885
|
def _fit_ml_runtime(self, dataset: snowpark.DataFrame) -> None:
|
@@ -9,12 +9,12 @@ import pandas as pd
|
|
9
9
|
import sklearn
|
10
10
|
from packaging import version
|
11
11
|
from scipy import sparse
|
12
|
-
from sklearn import preprocessing
|
12
|
+
from sklearn import preprocessing
|
13
13
|
|
14
14
|
from snowflake import snowpark
|
15
15
|
from snowflake.ml._internal import telemetry, type_utils
|
16
16
|
from snowflake.ml._internal.exceptions import error_codes, exceptions
|
17
|
-
from snowflake.ml._internal.utils import identifier
|
17
|
+
from snowflake.ml._internal.utils import identifier, import_utils
|
18
18
|
from snowflake.ml.model import model_signature
|
19
19
|
from snowflake.ml.modeling.framework import _utils, base
|
20
20
|
from snowflake.snowpark import functions as F, types as T
|
@@ -24,6 +24,10 @@ from snowflake.snowpark._internal.utils import (
|
|
24
24
|
random_name_for_temp_object,
|
25
25
|
)
|
26
26
|
|
27
|
+
is_scalar_nan = import_utils.import_with_fallbacks(
|
28
|
+
"sklearn.utils.is_scalar_nan", "sklearn.utils._missing.is_scalar_nan"
|
29
|
+
)
|
30
|
+
|
27
31
|
_INFREQUENT_CATEGORY = "_INFREQUENT"
|
28
32
|
_COLUMN_NAME = "_COLUMN_NAME"
|
29
33
|
_CATEGORY = "_CATEGORY"
|
@@ -1293,7 +1297,7 @@ class OneHotEncoder(base.BaseTransformer):
|
|
1293
1297
|
missing_drops = []
|
1294
1298
|
drop_indices = []
|
1295
1299
|
for feature_idx, (drop_val, cat_list) in enumerate(zip(drop_array, self._categories_list)):
|
1296
|
-
if not
|
1300
|
+
if not is_scalar_nan(drop_val):
|
1297
1301
|
drop_idx = np.where(cat_list == drop_val)[0]
|
1298
1302
|
if drop_idx.size: # found drop idx
|
1299
1303
|
drop_indices.append(self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0]))
|
@@ -1303,7 +1307,7 @@ class OneHotEncoder(base.BaseTransformer):
|
|
1303
1307
|
|
1304
1308
|
# drop_val is nan, find nan in categories manually
|
1305
1309
|
for cat_idx, cat in enumerate(cat_list):
|
1306
|
-
if
|
1310
|
+
if is_scalar_nan(cat):
|
1307
1311
|
drop_indices.append(self._map_drop_idx_to_infrequent(feature_idx, cat_idx))
|
1308
1312
|
break
|
1309
1313
|
else: # loop did not break thus drop is missing
|