snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/_complete.py +1 -1
- snowflake/cortex/_extract_answer.py +1 -1
- snowflake/cortex/_sentiment.py +1 -1
- snowflake/cortex/_summarize.py +1 -1
- snowflake/cortex/_translate.py +1 -1
- snowflake/ml/_internal/env_utils.py +68 -6
- snowflake/ml/_internal/file_utils.py +34 -4
- snowflake/ml/_internal/telemetry.py +79 -91
- snowflake/ml/_internal/utils/identifier.py +78 -72
- snowflake/ml/_internal/utils/retryable_http.py +16 -4
- snowflake/ml/_internal/utils/spcs_attribution_utils.py +122 -0
- snowflake/ml/dataset/dataset.py +1 -1
- snowflake/ml/model/_api.py +21 -14
- snowflake/ml/model/_client/model/model_impl.py +176 -0
- snowflake/ml/model/_client/model/model_method_info.py +19 -0
- snowflake/ml/model/_client/model/model_version_impl.py +291 -0
- snowflake/ml/model/_client/ops/metadata_ops.py +107 -0
- snowflake/ml/model/_client/ops/model_ops.py +308 -0
- snowflake/ml/model/_client/sql/model.py +75 -0
- snowflake/ml/model/_client/sql/model_version.py +213 -0
- snowflake/ml/model/_client/sql/stage.py +40 -0
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -4
- snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +24 -8
- snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +23 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +14 -2
- snowflake/ml/model/_deploy_client/utils/constants.py +1 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +2 -2
- snowflake/ml/model/_model_composer/model_composer.py +31 -9
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +25 -10
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -2
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +34 -3
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +1 -1
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +3 -1
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +10 -28
- snowflake/ml/model/_packager/model_meta/model_meta.py +18 -16
- snowflake/ml/model/_signatures/snowpark_handler.py +1 -1
- snowflake/ml/model/model_signature.py +108 -53
- snowflake/ml/model/type_hints.py +1 -0
- snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +554 -0
- snowflake/ml/modeling/_internal/estimator_protocols.py +1 -60
- snowflake/ml/modeling/_internal/model_specifications.py +146 -0
- snowflake/ml/modeling/_internal/model_trainer.py +13 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +78 -0
- snowflake/ml/modeling/_internal/pandas_trainer.py +54 -0
- snowflake/ml/modeling/_internal/snowpark_handlers.py +6 -760
- snowflake/ml/modeling/_internal/snowpark_trainer.py +331 -0
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +108 -135
- snowflake/ml/modeling/cluster/affinity_propagation.py +106 -135
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +106 -135
- snowflake/ml/modeling/cluster/birch.py +106 -135
- snowflake/ml/modeling/cluster/bisecting_k_means.py +106 -135
- snowflake/ml/modeling/cluster/dbscan.py +106 -135
- snowflake/ml/modeling/cluster/feature_agglomeration.py +106 -135
- snowflake/ml/modeling/cluster/k_means.py +105 -135
- snowflake/ml/modeling/cluster/mean_shift.py +106 -135
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +105 -135
- snowflake/ml/modeling/cluster/optics.py +106 -135
- snowflake/ml/modeling/cluster/spectral_biclustering.py +106 -135
- snowflake/ml/modeling/cluster/spectral_clustering.py +106 -135
- snowflake/ml/modeling/cluster/spectral_coclustering.py +106 -135
- snowflake/ml/modeling/compose/column_transformer.py +106 -135
- snowflake/ml/modeling/compose/transformed_target_regressor.py +108 -135
- snowflake/ml/modeling/covariance/elliptic_envelope.py +106 -135
- snowflake/ml/modeling/covariance/empirical_covariance.py +99 -128
- snowflake/ml/modeling/covariance/graphical_lasso.py +106 -135
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +106 -135
- snowflake/ml/modeling/covariance/ledoit_wolf.py +104 -133
- snowflake/ml/modeling/covariance/min_cov_det.py +106 -135
- snowflake/ml/modeling/covariance/oas.py +99 -128
- snowflake/ml/modeling/covariance/shrunk_covariance.py +103 -132
- snowflake/ml/modeling/decomposition/dictionary_learning.py +106 -135
- snowflake/ml/modeling/decomposition/factor_analysis.py +106 -135
- snowflake/ml/modeling/decomposition/fast_ica.py +106 -135
- snowflake/ml/modeling/decomposition/incremental_pca.py +106 -135
- snowflake/ml/modeling/decomposition/kernel_pca.py +106 -135
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +106 -135
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +106 -135
- snowflake/ml/modeling/decomposition/pca.py +106 -135
- snowflake/ml/modeling/decomposition/sparse_pca.py +106 -135
- snowflake/ml/modeling/decomposition/truncated_svd.py +106 -135
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +108 -135
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +108 -135
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/bagging_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/bagging_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/isolation_forest.py +106 -135
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/stacking_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/voting_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/voting_regressor.py +108 -135
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +101 -128
- snowflake/ml/modeling/feature_selection/select_fdr.py +99 -126
- snowflake/ml/modeling/feature_selection/select_fpr.py +99 -126
- snowflake/ml/modeling/feature_selection/select_fwe.py +99 -126
- snowflake/ml/modeling/feature_selection/select_k_best.py +100 -127
- snowflake/ml/modeling/feature_selection/select_percentile.py +99 -126
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +106 -135
- snowflake/ml/modeling/feature_selection/variance_threshold.py +95 -124
- snowflake/ml/modeling/framework/base.py +83 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +108 -135
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +108 -135
- snowflake/ml/modeling/impute/iterative_imputer.py +106 -135
- snowflake/ml/modeling/impute/knn_imputer.py +106 -135
- snowflake/ml/modeling/impute/missing_indicator.py +106 -135
- snowflake/ml/modeling/impute/simple_imputer.py +9 -1
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +96 -125
- snowflake/ml/modeling/kernel_approximation/nystroem.py +106 -135
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +106 -135
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +105 -134
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +103 -132
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +108 -135
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +90 -118
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +90 -118
- snowflake/ml/modeling/linear_model/ard_regression.py +108 -135
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +108 -135
- snowflake/ml/modeling/linear_model/elastic_net.py +108 -135
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +108 -135
- snowflake/ml/modeling/linear_model/gamma_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/huber_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/lars.py +108 -135
- snowflake/ml/modeling/linear_model/lars_cv.py +108 -135
- snowflake/ml/modeling/linear_model/lasso.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_cv.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_lars.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +108 -135
- snowflake/ml/modeling/linear_model/linear_regression.py +108 -135
- snowflake/ml/modeling/linear_model/logistic_regression.py +108 -135
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +108 -135
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +108 -135
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +108 -135
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +107 -135
- snowflake/ml/modeling/linear_model/perceptron.py +107 -135
- snowflake/ml/modeling/linear_model/poisson_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/ransac_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/ridge.py +108 -135
- snowflake/ml/modeling/linear_model/ridge_classifier.py +108 -135
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +108 -135
- snowflake/ml/modeling/linear_model/ridge_cv.py +108 -135
- snowflake/ml/modeling/linear_model/sgd_classifier.py +108 -135
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +106 -135
- snowflake/ml/modeling/linear_model/sgd_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +108 -135
- snowflake/ml/modeling/manifold/isomap.py +106 -135
- snowflake/ml/modeling/manifold/mds.py +106 -135
- snowflake/ml/modeling/manifold/spectral_embedding.py +106 -135
- snowflake/ml/modeling/manifold/tsne.py +106 -135
- snowflake/ml/modeling/metrics/classification.py +196 -55
- snowflake/ml/modeling/metrics/correlation.py +4 -2
- snowflake/ml/modeling/metrics/covariance.py +7 -4
- snowflake/ml/modeling/metrics/ranking.py +32 -16
- snowflake/ml/modeling/metrics/regression.py +60 -32
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +106 -135
- snowflake/ml/modeling/mixture/gaussian_mixture.py +106 -135
- snowflake/ml/modeling/model_selection/grid_search_cv.py +91 -148
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +93 -154
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +105 -132
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +108 -135
- snowflake/ml/modeling/multiclass/output_code_classifier.py +108 -135
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +108 -135
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +108 -135
- snowflake/ml/modeling/naive_bayes/complement_nb.py +108 -135
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +98 -125
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +107 -134
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +108 -135
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +108 -135
- snowflake/ml/modeling/neighbors/kernel_density.py +106 -135
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +106 -135
- snowflake/ml/modeling/neighbors/nearest_centroid.py +108 -135
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +106 -135
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +108 -135
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +108 -135
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +108 -135
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +106 -135
- snowflake/ml/modeling/neural_network/mlp_classifier.py +108 -135
- snowflake/ml/modeling/neural_network/mlp_regressor.py +108 -135
- snowflake/ml/modeling/parameters/disable_distributed_hpo.py +2 -6
- snowflake/ml/modeling/preprocessing/binarizer.py +25 -8
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +9 -4
- snowflake/ml/modeling/preprocessing/label_encoder.py +31 -11
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +27 -9
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +42 -14
- snowflake/ml/modeling/preprocessing/normalizer.py +9 -4
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +26 -10
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +37 -13
- snowflake/ml/modeling/preprocessing/polynomial_features.py +106 -135
- snowflake/ml/modeling/preprocessing/robust_scaler.py +39 -13
- snowflake/ml/modeling/preprocessing/standard_scaler.py +36 -12
- snowflake/ml/modeling/semi_supervised/label_propagation.py +108 -135
- snowflake/ml/modeling/semi_supervised/label_spreading.py +108 -135
- snowflake/ml/modeling/svm/linear_svc.py +108 -135
- snowflake/ml/modeling/svm/linear_svr.py +108 -135
- snowflake/ml/modeling/svm/nu_svc.py +108 -135
- snowflake/ml/modeling/svm/nu_svr.py +108 -135
- snowflake/ml/modeling/svm/svc.py +108 -135
- snowflake/ml/modeling/svm/svr.py +108 -135
- snowflake/ml/modeling/tree/decision_tree_classifier.py +108 -135
- snowflake/ml/modeling/tree/decision_tree_regressor.py +108 -135
- snowflake/ml/modeling/tree/extra_tree_classifier.py +108 -135
- snowflake/ml/modeling/tree/extra_tree_regressor.py +108 -135
- snowflake/ml/modeling/xgboost/xgb_classifier.py +108 -136
- snowflake/ml/modeling/xgboost/xgb_regressor.py +108 -136
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +108 -136
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +108 -136
- snowflake/ml/registry/model_registry.py +2 -0
- snowflake/ml/registry/registry.py +215 -0
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/METADATA +34 -1
- snowflake_ml_python-1.1.2.dist-info/RECORD +347 -0
- snowflake_ml_python-1.1.0.dist-info/RECORD +0 -331
- {snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/WHEEL +0 -0
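The most substantial addition in this release is the new model registry client (snowflake/ml/registry/registry.py plus the snowflake/ml/model/_client/ tree). The sketch below shows how such an API is typically used; it follows the released documentation for this version, but since the diff body for those files is not reproduced here, treat the exact argument names and objects as assumptions rather than a verbatim excerpt.

# Hedged sketch of the new Registry client these files introduce; names such
# as MY_MODEL/V1, `session`, and the DataFrames are illustrative, not from the diff.
from snowflake.ml.registry import Registry

reg = Registry(session=session)  # assumes an existing snowpark.Session

# log_model returns a ModelVersion handle (see model_version_impl.py above);
# sample_input_data is used to infer the model signature.
mv = reg.log_model(
    sklearn_model,
    model_name="MY_MODEL",
    version_name="V1",
    sample_input_data=train_df,
)

# Invoke one of the logged model's methods against a DataFrame.
predictions = mv.run(test_df, function_name="predict")

The preprocessing hunks reproduced below are representative of the release's other broad change: every modeling transformer gains a passthrough_cols option and sheds the @telemetry.add_stmt_params_to_df decorator.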
snowflake/ml/modeling/preprocessing/max_abs_scaler.py

@@ -27,14 +27,29 @@ class MaxAbsScaler(base.BaseTransformer):
     (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html).
 
     Args:
-        input_cols:
-
+        input_cols: Optional[Union[str, List[str]]], default=None
+            The name(s) of one or more columns in a DataFrame containing a feature to be scaled.
+
+        output_cols: Optional[Union[str, List[str]]], default=None
+            The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
             columns specified must match the number of input columns.
-
+
+        passthrough_cols: Optional[Union[str, List[str]]], default=None
+            A string or a list of strings indicating column names to be excluded from any
+            operations (such as train, transform, or inference). These specified column(s)
+            will remain untouched throughout the process. This option is helpful in scenarios
+            requiring automatic input_cols inference, but need to avoid using specific
+            columns, like index columns, during training or inference.
+
+        drop_input_cols: Optional[bool], default=False
+            Remove input columns from output if set True. False by default.
 
     Attributes:
-        scale_:
-
+        scale_: Dict[str, float]
+            dict {column_name: value} or None. Per-feature relative scaling factor.
+
+        max_abs_: Dict[str, float]
+            dict {column_name: value} or None. Per-feature maximum absolute value.
     """
 
     def __init__(

@@ -42,6 +57,7 @@ class MaxAbsScaler(base.BaseTransformer):
         *,
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
+        passthrough_cols: Optional[Union[str, Iterable[str]]] = None,
         drop_input_cols: Optional[bool] = False,
     ) -> None:
         """

@@ -55,6 +71,11 @@ class MaxAbsScaler(base.BaseTransformer):
         Args:
             input_cols: Single or multiple input columns.
             output_cols: Single or multiple output columns.
+            passthrough_cols: A string or a list of strings indicating column names to be excluded from any
+                operations (such as train, transform, or inference). These specified column(s)
+                will remain untouched throughout the process. This option is helful in scenarios
+                requiring automatic input_cols inference, but need to avoid using specific
+                columns, like index columns, during in training or inference.
             drop_input_cols: Remove input columns from output if set True. False by default.
 
         Attributes:

@@ -74,6 +95,7 @@ class MaxAbsScaler(base.BaseTransformer):
 
         self.set_input_cols(input_cols)
         self.set_output_cols(output_cols)
+        self.set_passthrough_cols(passthrough_cols)
 
     def _reset(self) -> None:
         """

@@ -138,10 +160,6 @@ class MaxAbsScaler(base.BaseTransformer):
         project=base.PROJECT,
         subproject=base.SUBPROJECT,
     )
-    @telemetry.add_stmt_params_to_df(
-        project=base.PROJECT,
-        subproject=base.SUBPROJECT,
-    )
     def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[snowpark.DataFrame, pd.DataFrame]:
         """
         Scale the data.
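A short sketch of what the new passthrough_cols option is for, per the docstring above; the column names and DataFrame here are illustrative only:

# Illustrative only: ROW_ID and df are made-up names, not from the diff.
from snowflake.ml.modeling.preprocessing import MaxAbsScaler

# Without explicit input_cols, the transformer infers them from the dataset;
# passthrough_cols keeps ROW_ID out of that inference and out of scaling.
scaler = MaxAbsScaler(passthrough_cols=["ROW_ID"])
scaled_df = scaler.fit(df).transform(df)  # ROW_ID survives untouched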
snowflake/ml/modeling/preprocessing/min_max_scaler.py

@@ -21,20 +21,45 @@ class MinMaxScaler(base.BaseTransformer):
     (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html).
 
     Args:
-        feature_range:
-
-
+        feature_range: Tuple[float, float], default=(0, 1)
+            Desired range of transformed data (default is 0 to 1).
+
+        clip: bool, default=False
+            Whether to clip transformed values of held-out data to the specified feature range (default is True).
+
+        input_cols: Optional[Union[str, List[str]]], default=None
+            The name(s) of one or more columns in a DataFrame containing a feature to be scaled. Each specified
             input column is scaled independently and stored in the corresponding output column.
-
+
+        output_cols: Optional[Union[str, List[str]]], default=None
+            The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
             columns specified must match the number of input columns.
-
+
+        passthrough_cols: Optional[Union[str, List[str]]], default=None
+            A string or a list of strings indicating column names to be excluded from any
+            operations (such as train, transform, or inference). These specified column(s)
+            will remain untouched throughout the process. This option is helpful in scenarios
+            requiring automatic input_cols inference, but need to avoid using specific
+            columns, like index columns, during training or inference.
+
+        drop_input_cols: Optional[bool], default=False
+            Remove input columns from output if set True. False by default.
 
     Attributes:
-        min_:
-
-
-
-
+        min_: Dict[str, float]
+            dict {column_name: value} or None. Per-feature adjustment for minimum.
+
+        scale_: Dict[str, float]
+            dict {column_name: value} or None. Per-feature relative scaling factor.
+
+        data_min_: Dict[str, float]
+            dict {column_name: value} or None. Per-feature minimum seen in the data.
+
+        data_max_: Dict[str, float]
+            dict {column_name: value} or None. Per-feature maximum seen in the data.
+
+        data_range_: Dict[str, float]
+            dict {column_name: value} or None. Per-feature range seen in the data as a (min, max) tuple.
     """
 
     def __init__(

@@ -44,6 +69,7 @@ class MinMaxScaler(base.BaseTransformer):
         clip: bool = False,
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
+        passthrough_cols: Optional[Union[str, Iterable[str]]] = None,
         drop_input_cols: Optional[bool] = False,
     ) -> None:
         """

@@ -54,6 +80,11 @@ class MinMaxScaler(base.BaseTransformer):
             clip: Set to True to clip transformed values of held-out data to provided `feature range`.
             input_cols: Single or multiple input columns.
             output_cols: Single or multiple output columns.
+            passthrough_cols: A string or a list of strings indicating column names to be excluded from any
+                operations (such as train, transform, or inference). These specified column(s)
+                will remain untouched throughout the process. This option is helful in scenarios
+                requiring automatic input_cols inference, but need to avoid using specific
+                columns, like index columns, during in training or inference.
             drop_input_cols: Remove input columns from output if set True. False by default.
 
         Attributes:

@@ -78,6 +109,7 @@ class MinMaxScaler(base.BaseTransformer):
 
         self.set_input_cols(input_cols)
         self.set_output_cols(output_cols)
+        self.set_passthrough_cols(passthrough_cols)
 
     def _reset(self) -> None:
         """

@@ -158,10 +190,6 @@ class MinMaxScaler(base.BaseTransformer):
         project=base.PROJECT,
         subproject=base.SUBPROJECT,
     )
-    @telemetry.add_stmt_params_to_df(
-        project=base.PROJECT,
-        subproject=base.SUBPROJECT,
-    )
    def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[snowpark.DataFrame, pd.DataFrame]:
         """
         Scale features according to feature_range.
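For reference, the two MinMaxScaler-specific parameters documented above behave as in scikit-learn; a minimal sketch with made-up column names:

from snowflake.ml.modeling.preprocessing import MinMaxScaler

scaler = MinMaxScaler(
    feature_range=(-1, 1),           # training data maps onto [-1, 1]
    clip=True,                       # held-out values outside the fitted
    input_cols=["FEATURE"],          # min/max get clamped into the range
    output_cols=["FEATURE_SCALED"],
)
scaler.fit(train_df)
scaled_df = scaler.transform(test_df)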
snowflake/ml/modeling/preprocessing/normalizer.py

@@ -34,6 +34,13 @@ class Normalizer(base.BaseTransformer):
             A string or list of strings representing column names that will store the output of transform operation.
             The length of `output_cols` must equal the length of `input_cols`.
 
+        passthrough_cols: Optional[Union[str, List[str]]]
+            A string or a list of strings indicating column names to be excluded from any
+            operations (such as train, transform, or inference). These specified column(s)
+            will remain untouched throughout the process. This option is helpful in scenarios
+            requiring automatic input_cols inference, but need to avoid using specific
+            columns, like index columns, during training or inference.
+
         drop_input_cols: bool, default=False
             Remove input columns from output if set `True`.
     """

@@ -44,6 +51,7 @@ class Normalizer(base.BaseTransformer):
         norm: str = "l2",
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
+        passthrough_cols: Optional[Union[str, Iterable[str]]] = None,
         drop_input_cols: Optional[bool] = False,
     ) -> None:
         super().__init__(drop_input_cols=drop_input_cols)

@@ -51,6 +59,7 @@ class Normalizer(base.BaseTransformer):
         self._is_fitted = False
         self.set_input_cols(input_cols)
         self.set_output_cols(output_cols)
+        self.set_passthrough_cols(passthrough_cols)
 
     def _reset(self) -> None:
         """

@@ -82,10 +91,6 @@ class Normalizer(base.BaseTransformer):
         project=base.PROJECT,
         subproject=base.SUBPROJECT,
     )
-    @telemetry.add_stmt_params_to_df(
-        project=base.PROJECT,
-        subproject=base.SUBPROJECT,
-    )
     def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[snowpark.DataFrame, pd.DataFrame]:
         """
         Scale each non-zero row of the input dataset to the unit norm.
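As the final context line notes, Normalizer rescales each non-zero row to unit norm: with norm="l2" a row x becomes x / ||x||_2. The same arithmetic in plain NumPy:

import numpy as np

row = np.array([3.0, 4.0])
print(row / np.linalg.norm(row))   # [0.6 0.8]           -> unit L2 norm
print(row / np.abs(row).sum())     # [~0.4286 ~0.5714]   -> unit L1 norm ("l1")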
snowflake/ml/modeling/preprocessing/one_hot_encoder.py

@@ -38,7 +38,7 @@ _N_FEATURES_OUT = "_N_FEATURES_OUT"
 # transformer with the sklearn version
 _SKLEARN_INITIAL_KEYWORDS = ("sparse", "handle_unknown")  # initial keywords in sklearn
 _SKLEARN_UNUSED_KEYWORDS = "dtype"  # sklearn keywords that are unused in snowml
-_SNOWML_ONLY_KEYWORDS = ["input_cols", "output_cols"]  # snowml only keywords not present in sklearn
+_SNOWML_ONLY_KEYWORDS = ["input_cols", "output_cols", "passthrough_cols"]  # snowml only keywords not present in sklearn
 
 # Added keywords mapped to the sklearn versions in which they were added. Update mappings in new
 # sklearn versions to support parameter validation.

@@ -101,7 +101,7 @@ class OneHotEncoder(base.BaseTransformer):
     (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).
 
     Args:
-        categories: 'auto' or dict {column_name: ndarray([category])}, default='auto'
+        categories: 'auto' or dict {column_name: np.ndarray([category])}, default='auto'
             Categories (unique values) per feature:
             - 'auto': Determine categories automatically from the training data.
             - dict: ``categories[column_name]`` holds the categories expected in

@@ -109,6 +109,7 @@ class OneHotEncoder(base.BaseTransformer):
             and numeric values within a single feature, and should be sorted in
             case of numeric values.
             The used categories can be found in the ``categories_`` attribute.
+
         drop: {'first', 'if_binary'} or an array-like of shape (n_features,), default=None
             Specifies a methodology to use to drop one of the categories per
             feature. This is useful in situations where perfectly collinear

@@ -128,15 +129,18 @@ class OneHotEncoder(base.BaseTransformer):
             When `max_categories` or `min_frequency` is configured to group
             infrequent categories, the dropping behavior is handled after the
             grouping.
+
         sparse: bool, default=False
             Will return a column with sparse representation if set True else will return
             a separate column for each category.
+
         handle_unknown: {'error', 'ignore'}, default='error'
             Specifies the way unknown categories are handled during :meth:`transform`.
             - 'error': Raise an error if an unknown category is present during transform.
             - 'ignore': When an unknown category is encountered during
               transform, the resulting one-hot encoded columns for this feature
               will be all zeros.
+
         min_frequency: int or float, default=None
             Specifies the minimum frequency below which a category will be
             considered infrequent.

@@ -144,17 +148,29 @@ class OneHotEncoder(base.BaseTransformer):
             infrequent.
             - If `float`, categories with a smaller cardinality than
             `min_frequency * n_samples` will be considered infrequent.
+
         max_categories: int, default=None
             Specifies an upper limit to the number of output features for each input
             feature when considering infrequent categories. If there are infrequent
             categories, `max_categories` includes the category representing the
             infrequent categories along with the frequent categories. If `None`,
             there is no limit to the number of output features.
-
+
+        input_cols: Optional[Union[str, List[str]]], default=None
             Single or multiple input columns.
-
+
+        output_cols: Optional[Union[str, List[str]]], default=None
             Single or multiple output columns.
-
+
+        passthrough_cols: Optional[Union[str, List[str]]]
+            A string or a list of strings indicating column names to be excluded from any
+            operations (such as train, transform, or inference). These specified column(s)
+            will remain untouched throughout the process. This option is helpful in scenarios
+            requiring automatic input_cols inference, but need to avoid using specific
+            columns, like index columns, during training or inference.
+
+        drop_input_cols: Optional[Union[str, List[str]]]
+            Remove input columns from output if set True. False by default.
 
     Attributes:
         categories_: dict {column_name: ndarray([category])}

@@ -190,6 +206,7 @@ class OneHotEncoder(base.BaseTransformer):
         max_categories: Optional[int] = None,
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
+        passthrough_cols: Optional[Union[str, Iterable[str]]] = None,
         drop_input_cols: Optional[bool] = False,
     ) -> None:
         """See class-level docstring."""

@@ -218,6 +235,7 @@ class OneHotEncoder(base.BaseTransformer):
 
         self.set_input_cols(input_cols)
         self.set_output_cols(output_cols)
+        self.set_passthrough_cols(passthrough_cols)
 
     @property
     def infrequent_categories_(self) -> List[Optional[type_utils.LiteralNDArrayType]]:

@@ -658,10 +676,6 @@ class OneHotEncoder(base.BaseTransformer):
         project=base.PROJECT,
         subproject=base.SUBPROJECT,
     )
-    @telemetry.add_stmt_params_to_df(
-        project=base.PROJECT,
-        subproject=base.SUBPROJECT,
-    )
     def transform(
         self, dataset: Union[snowpark.DataFrame, pd.DataFrame]
     ) -> Union[snowpark.DataFrame, pd.DataFrame, sparse.csr_matrix]:

@@ -1319,7 +1333,9 @@
         """
         category_counts_list = []  # list of ndarray
         for idx, input_col in enumerate(self.input_cols):
-            counts = np.vectorize(lambda x: category_counts[input_col][x])(self._categories_list[idx])
+            counts = np.vectorize(lambda x, input_col=input_col: category_counts[input_col][x])(
+                self._categories_list[idx]
+            )
             category_counts_list.append(np.array(counts))
         self._infrequent_indices = [
             self._identify_infrequent(category_count, n_samples) for category_count in category_counts_list
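The last hunk addresses a classic Python pitfall flagged by linters (flake8's B023): a lambda defined in a loop captures the loop variable by reference, not by value. Here the lambda is invoked immediately within each iteration, so the old behavior was most likely already correct; the input_col=input_col default argument freezes the per-iteration value and guards against any deferred invocation. A standalone illustration of the difference:

# Late binding: every closure sees the final value of i.
buggy = [lambda: i for i in range(3)]
print([f() for f in buggy])        # [2, 2, 2]

# A default argument is evaluated at definition time, capturing each i.
fixed = [lambda i=i: i for i in range(3)]
print([f() for f in fixed])        # [0, 1, 2]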
snowflake/ml/modeling/preprocessing/ordinal_encoder.py

@@ -24,7 +24,7 @@ _COLUMN_BATCH_SIZE = 20
 # transformer with the sklearn version
 _SKLEARN_INITIAL_KEYWORDS = "categories"  # initial keywords in sklearn
 _SKLEARN_UNUSED_KEYWORDS = "dtype"  # sklearn keywords that are unused in snowml
-_SNOWML_ONLY_KEYWORDS = ["input_cols", "output_cols"]  # snowml only keywords not present in sklearn
+_SNOWML_ONLY_KEYWORDS = ["input_cols", "output_cols", "passthrough_cols"]  # snowml only keywords not present in sklearn
 
 # Added keywords mapped to the sklearn versions in which they were added. Update mappings in new
 # sklearn versions to support parameter validation.

@@ -45,26 +45,47 @@ class OrdinalEncoder(base.BaseTransformer):
     (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html).
 
     Args:
-        categories:
+        categories: Union[str, Dict[str, type_utils.LiteralNDArrayType]], default="auto"
+            The string 'auto' (the default) causes the categories to be extracted from the input columns.
             To specify the categories yourself, pass a dictionary mapping the column name to an ndarray containing the
             categories.
-
+
+        handle_unknown: str, default="error"
+            Specifies how unknown categories are handled during transformation. Applicable only if
             categories is not 'auto'.
             Valid values are:
             - 'error': Raise an error if an unknown category is present during transform (default).
             - 'use_encoded_value': When an unknown category is encountered during transform, the specified
             encoded_missing_value (below) is used.
-
+
+        unknown_value: Optional[Union[int, float]], default=None
+            When the parameter handle_unknown is set to 'use_encoded_value', this parameter is required and
             will set the encoded value of unknown categories. It has to be distinct from the values used to encode any
             of the categories in `fit`.
-
-
-
+
+        encoded_missing_value: Union[int, float], default=np.nan
+            The value to be used to encode unknown categories.
+
+        input_cols: Optional[Union[str, List[str]]], default=None
+            The name(s) of one or more columns in a DataFrame containing a feature to be encoded.
+
+        output_cols: Optional[Union[str, List[str]]], default=None
+            The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
             columns specified must match the number of input columns.
-
+
+        passthrough_cols: Optional[Union[str, List[str]]], default=None
+            A string or a list of strings indicating column names to be excluded from any
+            operations (such as train, transform, or inference). These specified column(s)
+            will remain untouched throughout the process. This option is helpful in scenarios
+            requiring automatic input_cols inference, but need to avoid using specific
+            columns, like index columns, during training or inference.
+
+        drop_input_cols: Optional[bool], default=False
+            Remove input columns from output if set True. False by default.
 
     Attributes:
-        categories_ (dict of ndarray):
+        categories_ (dict of ndarray): List[type_utils.LiteralNDArrayType]
+            The categories of each feature determined during fitting. Maps input column
             names to an array of the detected categories.
             Attributes are valid only after fit() has been called.
     """

@@ -78,6 +99,7 @@ class OrdinalEncoder(base.BaseTransformer):
         encoded_missing_value: Union[int, float] = np.nan,
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
+        passthrough_cols: Optional[Union[str, Iterable[str]]] = None,
         drop_input_cols: Optional[bool] = False,
     ) -> None:
         """

@@ -110,6 +132,11 @@ class OrdinalEncoder(base.BaseTransformer):
             encoded_missing_value: Encoded value of missing categories.
             input_cols: Single or multiple input columns.
             output_cols: Single or multiple output columns.
+            passthrough_cols: A string or a list of strings indicating column names to be excluded from any
+                operations (such as train, transform, or inference). These specified column(s)
+                will remain untouched throughout the process. This option is helful in scenarios
+                requiring automatic input_cols inference, but need to avoid using specific
+                columns, like index columns, during in training or inference.
             drop_input_cols: Remove input columns from output if set True. False by default.
 
         Attributes:

@@ -129,6 +156,7 @@ class OrdinalEncoder(base.BaseTransformer):
 
         self.set_input_cols(input_cols)
         self.set_output_cols(output_cols)
+        self.set_passthrough_cols(passthrough_cols)
 
     def _reset(self) -> None:
         """

@@ -417,10 +445,6 @@ class OrdinalEncoder(base.BaseTransformer):
         project=base.PROJECT,
         subproject=base.SUBPROJECT,
     )
-    @telemetry.add_stmt_params_to_df(
-        project=base.PROJECT,
-        subproject=base.SUBPROJECT,
-    )
     def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[snowpark.DataFrame, pd.DataFrame]:
         """
         Transform dataset to ordinal codes.