snowflake-ml-python 1.6.4__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. snowflake/cortex/__init__.py +4 -0
  2. snowflake/cortex/_complete.py +107 -64
  3. snowflake/cortex/_finetune.py +273 -0
  4. snowflake/cortex/_sse_client.py +91 -28
  5. snowflake/cortex/_util.py +30 -1
  6. snowflake/ml/_internal/telemetry.py +4 -2
  7. snowflake/ml/_internal/type_utils.py +3 -3
  8. snowflake/ml/_internal/utils/import_utils.py +31 -0
  9. snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +13 -0
  10. snowflake/ml/data/__init__.py +5 -0
  11. snowflake/ml/data/_internal/arrow_ingestor.py +8 -0
  12. snowflake/ml/data/data_connector.py +1 -1
  13. snowflake/ml/data/torch_utils.py +33 -14
  14. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +5 -3
  15. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +7 -5
  16. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +4 -2
  17. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +3 -1
  18. snowflake/ml/feature_store/examples/example_helper.py +6 -3
  19. snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +4 -2
  20. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +4 -2
  21. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +3 -1
  22. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +3 -1
  23. snowflake/ml/feature_store/feature_store.py +1 -2
  24. snowflake/ml/feature_store/feature_view.py +5 -1
  25. snowflake/ml/model/_client/model/model_version_impl.py +145 -11
  26. snowflake/ml/model/_client/ops/model_ops.py +56 -16
  27. snowflake/ml/model/_client/ops/service_ops.py +46 -30
  28. snowflake/ml/model/_client/service/model_deployment_spec.py +19 -8
  29. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +3 -1
  30. snowflake/ml/model/_client/sql/service.py +25 -1
  31. snowflake/ml/model/_model_composer/model_composer.py +2 -0
  32. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
  33. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  34. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
  35. snowflake/ml/model/_model_composer/model_method/model_method.py +1 -1
  36. snowflake/ml/model/_packager/model_env/model_env.py +12 -0
  37. snowflake/ml/model/_packager/model_handlers/_utils.py +6 -2
  38. snowflake/ml/model/_packager/model_handlers/catboost.py +4 -7
  39. snowflake/ml/model/_packager/model_handlers/custom.py +5 -1
  40. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +10 -1
  41. snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -7
  42. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -1
  43. snowflake/ml/model/_packager/model_handlers/sklearn.py +51 -7
  44. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +8 -66
  45. snowflake/ml/model/_packager/model_handlers/tensorflow.py +23 -6
  46. snowflake/ml/model/_packager/model_handlers/torchscript.py +14 -14
  47. snowflake/ml/model/_packager/model_handlers/xgboost.py +10 -40
  48. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +2 -3
  49. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +5 -0
  50. snowflake/ml/model/_packager/model_packager.py +0 -11
  51. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -10
  52. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -9
  53. snowflake/ml/model/_packager/{model_handlers/model_objective_utils.py → model_task/model_task_utils.py} +14 -26
  54. snowflake/ml/model/_signatures/core.py +63 -16
  55. snowflake/ml/model/_signatures/pandas_handler.py +87 -27
  56. snowflake/ml/model/_signatures/pytorch_handler.py +2 -2
  57. snowflake/ml/model/_signatures/snowpark_handler.py +2 -1
  58. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -2
  59. snowflake/ml/model/_signatures/utils.py +4 -0
  60. snowflake/ml/model/custom_model.py +47 -7
  61. snowflake/ml/model/model_signature.py +40 -9
  62. snowflake/ml/model/type_hints.py +9 -1
  63. snowflake/ml/modeling/_internal/estimator_utils.py +13 -0
  64. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +7 -2
  65. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +16 -5
  66. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -2
  67. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -3
  68. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +1 -8
  69. snowflake/ml/modeling/cluster/agglomerative_clustering.py +17 -19
  70. snowflake/ml/modeling/cluster/dbscan.py +5 -2
  71. snowflake/ml/modeling/cluster/feature_agglomeration.py +7 -19
  72. snowflake/ml/modeling/cluster/k_means.py +14 -19
  73. snowflake/ml/modeling/cluster/mini_batch_k_means.py +3 -3
  74. snowflake/ml/modeling/cluster/optics.py +6 -6
  75. snowflake/ml/modeling/cluster/spectral_clustering.py +4 -3
  76. snowflake/ml/modeling/compose/column_transformer.py +15 -5
  77. snowflake/ml/modeling/compose/transformed_target_regressor.py +7 -6
  78. snowflake/ml/modeling/covariance/elliptic_envelope.py +1 -1
  79. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +1 -1
  80. snowflake/ml/modeling/covariance/min_cov_det.py +2 -2
  81. snowflake/ml/modeling/covariance/oas.py +1 -1
  82. snowflake/ml/modeling/decomposition/kernel_pca.py +2 -2
  83. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +5 -12
  84. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +5 -12
  85. snowflake/ml/modeling/decomposition/pca.py +28 -15
  86. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -0
  87. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +1 -12
  88. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +1 -11
  89. snowflake/ml/modeling/ensemble/bagging_classifier.py +1 -8
  90. snowflake/ml/modeling/ensemble/bagging_regressor.py +1 -8
  91. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +21 -2
  92. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +18 -2
  93. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +2 -0
  94. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +2 -0
  95. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +21 -8
  96. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +21 -11
  97. snowflake/ml/modeling/ensemble/random_forest_classifier.py +21 -2
  98. snowflake/ml/modeling/ensemble/random_forest_regressor.py +18 -2
  99. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +2 -1
  100. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +5 -3
  101. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +2 -2
  102. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +2 -4
  103. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +2 -4
  104. snowflake/ml/modeling/linear_model/ard_regression.py +5 -10
  105. snowflake/ml/modeling/linear_model/bayesian_ridge.py +5 -11
  106. snowflake/ml/modeling/linear_model/elastic_net.py +3 -0
  107. snowflake/ml/modeling/linear_model/elastic_net_cv.py +1 -1
  108. snowflake/ml/modeling/linear_model/lars.py +0 -10
  109. snowflake/ml/modeling/linear_model/lars_cv.py +1 -11
  110. snowflake/ml/modeling/linear_model/lasso_cv.py +1 -1
  111. snowflake/ml/modeling/linear_model/lasso_lars.py +0 -10
  112. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +1 -11
  113. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +0 -10
  114. snowflake/ml/modeling/linear_model/logistic_regression.py +28 -22
  115. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +30 -24
  116. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +1 -1
  117. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +1 -1
  118. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +4 -13
  119. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +4 -4
  120. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +1 -1
  121. snowflake/ml/modeling/linear_model/perceptron.py +3 -3
  122. snowflake/ml/modeling/linear_model/ransac_regressor.py +3 -2
  123. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +14 -6
  124. snowflake/ml/modeling/linear_model/ridge_cv.py +17 -11
  125. snowflake/ml/modeling/linear_model/sgd_classifier.py +2 -2
  126. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +5 -1
  127. snowflake/ml/modeling/linear_model/sgd_regressor.py +12 -3
  128. snowflake/ml/modeling/manifold/isomap.py +1 -1
  129. snowflake/ml/modeling/manifold/mds.py +3 -3
  130. snowflake/ml/modeling/manifold/tsne.py +10 -4
  131. snowflake/ml/modeling/metrics/classification.py +12 -16
  132. snowflake/ml/modeling/metrics/ranking.py +3 -3
  133. snowflake/ml/modeling/metrics/regression.py +3 -3
  134. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +3 -3
  135. snowflake/ml/modeling/naive_bayes/categorical_nb.py +3 -3
  136. snowflake/ml/modeling/naive_bayes/complement_nb.py +3 -3
  137. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +3 -3
  138. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +10 -4
  139. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +5 -2
  140. snowflake/ml/modeling/neighbors/local_outlier_factor.py +2 -2
  141. snowflake/ml/modeling/neighbors/nearest_centroid.py +7 -14
  142. snowflake/ml/modeling/neighbors/nearest_neighbors.py +1 -1
  143. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -1
  144. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +1 -1
  145. snowflake/ml/modeling/neural_network/mlp_classifier.py +7 -1
  146. snowflake/ml/modeling/neural_network/mlp_regressor.py +3 -0
  147. snowflake/ml/modeling/pipeline/pipeline.py +16 -14
  148. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +8 -4
  149. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +9 -7
  150. snowflake/ml/modeling/svm/linear_svc.py +25 -16
  151. snowflake/ml/modeling/svm/linear_svr.py +23 -17
  152. snowflake/ml/modeling/svm/nu_svc.py +5 -3
  153. snowflake/ml/modeling/svm/nu_svr.py +3 -1
  154. snowflake/ml/modeling/svm/svc.py +9 -5
  155. snowflake/ml/modeling/svm/svr.py +3 -1
  156. snowflake/ml/modeling/tree/decision_tree_classifier.py +21 -2
  157. snowflake/ml/modeling/tree/decision_tree_regressor.py +18 -2
  158. snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -9
  159. snowflake/ml/modeling/tree/extra_tree_regressor.py +18 -2
  160. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +448 -0
  161. snowflake/ml/monitoring/_manager/model_monitor_manager.py +238 -0
  162. snowflake/ml/monitoring/entities/model_monitor_config.py +10 -10
  163. snowflake/ml/monitoring/model_monitor.py +37 -0
  164. snowflake/ml/registry/_manager/model_manager.py +15 -1
  165. snowflake/ml/registry/registry.py +32 -37
  166. snowflake/ml/version.py +1 -1
  167. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/METADATA +104 -12
  168. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/RECORD +172 -171
  169. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/WHEEL +1 -1
  170. snowflake/ml/monitoring/_client/model_monitor.py +0 -126
  171. snowflake/ml/monitoring/_client/model_monitor_manager.py +0 -361
  172. snowflake/ml/monitoring/_client/monitor_sql_client.py +0 -1335
  173. snowflake/ml/monitoring/entities/model_monitor_interval.py +0 -46
  174. /snowflake/ml/monitoring/{_client/model_monitor_version.py → model_monitor_version.py} +0 -0
  175. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/LICENSE.txt +0 -0
  176. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/cluster/k_means.py
@@ -113,26 +113,24 @@ class KMeans(BaseTransformer):
  The number of clusters to form as well as the number of
  centroids to generate.

+ For an example of how to choose an optimal value for `n_clusters` refer to
+ :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.
+
  init: {'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'
  Method for initialization:

- 'k-means++': selects initial cluster centroids using sampling based on
- an empirical probability distribution of the points' contribution to the
- overall inertia. This technique speeds up convergence. The algorithm
- implemented is "greedy k-means++". It differs from the vanilla k-means++
- by making several trials at each sampling step and choosing the best centroid
- among them.
+ * 'k-means++': selects initial cluster centroids using sampling based on an empirical probability distribution of the points' contribution to the overall inertia. This technique speeds up convergence. The algorithm implemented is "greedy k-means++". It differs from the vanilla k-means++ by making several trials at each sampling step and choosing the best centroid among them.
+
+ * 'random': choose `n_clusters` observations (rows) at random from data for the initial centroids.

- 'random': choose `n_clusters` observations (rows) at random from data
- for the initial centroids.
+ * If an array is passed, it should be of shape (n_clusters, n_features) and gives the initial centers.

- If an array is passed, it should be of shape (n_clusters, n_features)
- and gives the initial centers.
+ * If a callable is passed, it should take arguments X, n_clusters and a random state and return an initialization.

- If a callable is passed, it should take arguments X, n_clusters and a
- random state and return an initialization.
+ For an example of how to use the different `init` strategy, see the example
+ entitled :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`.

- n_init: 'auto' or int, default=10
+ n_init: 'auto' or int, default='auto'
  Number of times the k-means algorithm is run with different centroid
  seeds. The final results is the best output of `n_init` consecutive runs
  in terms of inertia. Several runs are recommended for sparse
@@ -169,15 +167,12 @@ class KMeans(BaseTransformer):
  copy_x is False. If the original data is sparse, but not in CSR format,
  a copy will be made even if copy_x is False.

- algorithm: {"lloyd", "elkan", "auto", "full"}, default="lloyd"
+ algorithm: {"lloyd", "elkan"}, default="lloyd"
  K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`.
  The `"elkan"` variation can be more efficient on some datasets with
  well-defined clusters, by using the triangle inequality. However it's
  more memory intensive due to the allocation of an extra array of shape
  `(n_samples, n_clusters)`.
-
- `"auto"` and `"full"` are deprecated and they will be removed in
- Scikit-Learn 1.3. They are both aliases for `"lloyd"`.
  """

  def __init__( # type: ignore[no-untyped-def]
@@ -185,7 +180,7 @@ class KMeans(BaseTransformer):
  *,
  n_clusters=8,
  init="k-means++",
- n_init="warn",
+ n_init="auto",
  max_iter=300,
  tol=0.0001,
  verbose=0,
@@ -215,7 +210,7 @@ class KMeans(BaseTransformer):

  init_args = {'n_clusters':(n_clusters, 8, False),
  'init':(init, "k-means++", False),
- 'n_init':(n_init, "warn", False),
+ 'n_init':(n_init, "auto", False),
  'max_iter':(max_iter, 300, False),
  'tol':(tol, 0.0001, False),
  'verbose':(verbose, 0, False),
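These k_means.py hunks track upstream scikit-learn's change of the `n_init` default from the transitional `"warn"` sentinel to `"auto"` (the same change lands in mini_batch_k_means.py below). A minimal sketch of what the new default means in practice, written against scikit-learn directly since the wrapper forwards these keyword arguments; the toy data is illustrative:

```python
import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 2)  # toy data for illustration

# n_init="auto" runs the algorithm once for init="k-means++" and
# 10 times for init="random", instead of the old fixed default of 10.
km = KMeans(n_clusters=3, init="k-means++", n_init="auto", random_state=0).fit(X)

# Explicit centers as an (n_clusters, n_features) array imply a single
# run, so pass n_init=1 to avoid a warning.
km_fixed = KMeans(n_clusters=3, init=X[:3], n_init=1).fit(X)
```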
snowflake/ml/modeling/cluster/mini_batch_k_means.py
@@ -180,7 +180,7 @@ class MiniBatchKMeans(BaseTransformer):
  If `None`, the heuristic is `init_size = 3 * batch_size` if
  `3 * batch_size < n_clusters`, else `init_size = 3 * n_clusters`.

- n_init: 'auto' or int, default=3
+ n_init: 'auto' or int, default="auto"
  Number of random initializations that are tried.
  In contrast to KMeans, the algorithm is only run once, using the best of
  the `n_init` initializations as measured by inertia. Several runs are
@@ -213,7 +213,7 @@ class MiniBatchKMeans(BaseTransformer):
  tol=0.0,
  max_no_improvement=10,
  init_size=None,
- n_init="warn",
+ n_init="auto",
  reassignment_ratio=0.01,
  input_cols: Optional[Union[str, Iterable[str]]] = None,
  output_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -246,7 +246,7 @@ class MiniBatchKMeans(BaseTransformer):
  'tol':(tol, 0.0, False),
  'max_no_improvement':(max_no_improvement, 10, False),
  'init_size':(init_size, None, False),
- 'n_init':(n_init, "warn", False),
+ 'n_init':(n_init, "auto", False),
  'reassignment_ratio':(reassignment_ratio, 0.01, False),}
  cleaned_up_init_args = validate_sklearn_args(
  args=init_args,
snowflake/ml/modeling/cluster/optics.py
@@ -189,8 +189,8 @@ class OPTICS(BaseTransformer):
  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
  Algorithm used to compute the nearest neighbors:

- - 'ball_tree' will use :class:`BallTree`.
- - 'kd_tree' will use :class:`KDTree`.
+ - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
+ - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
  - 'brute' will use a brute-force search.
  - 'auto' (default) will attempt to decide the most appropriate
  algorithm based on the values passed to :meth:`fit` method.
@@ -199,10 +199,10 @@ class OPTICS(BaseTransformer):
  this parameter, using brute force.

  leaf_size: int, default=30
- Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
- affect the speed of the construction and query, as well as the memory
- required to store the tree. The optimal value depends on the
- nature of the problem.
+ Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
+ :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
+ construction and query, as well as the memory required to store the
+ tree. The optimal value depends on the nature of the problem.

  memory: str or object with the joblib.Memory interface, default=None
  Used to cache the output of the computation of the tree.
snowflake/ml/modeling/cluster/spectral_clustering.py
@@ -137,7 +137,8 @@ class SpectralClustering(BaseTransformer):

  gamma: float, default=1.0
  Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
- Ignored for ``affinity='nearest_neighbors'``.
+ Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'``
+ or ``affinity='precomputed_nearest_neighbors'``.

  affinity: str or callable, default='rbf'
  How to construct the affinity matrix.
@@ -151,7 +152,7 @@ class SpectralClustering(BaseTransformer):
  of precomputed distances, and construct a binary affinity matrix
  from the ``n_neighbors`` nearest neighbors of each instance.
  - one of the kernels supported by
- :func:`~sklearn.metrics.pairwise_kernels`.
+ :func:`~sklearn.metrics.pairwise.pairwise_kernels`.

  Only kernels that produce similarity scores (non-negative values that
  increase with similarity) should be used. This property is not checked
@@ -162,7 +163,7 @@ class SpectralClustering(BaseTransformer):
  the nearest neighbors method. Ignored for ``affinity='rbf'``.

  eigen_tol: float, default="auto"
- Stopping criterion for eigendecomposition of the Laplacian matrix.
+ Stopping criterion for eigen decomposition of the Laplacian matrix.
  If `eigen_tol="auto"` then the passed tolerance will depend on the
  `eigen_solver`:

snowflake/ml/modeling/compose/column_transformer.py
@@ -171,10 +171,18 @@ class ColumnTransformer(BaseTransformer):
  printed as it is completed.

  verbose_feature_names_out: bool, default=True
- If True, :meth:`get_feature_names_out` will prefix all feature names
- with the name of the transformer that generated that feature.
- If False, :meth:`get_feature_names_out` will not prefix any feature
- names and will error if feature names are not unique.
+ If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix
+ all feature names with the name of the transformer that generated that
+ feature.
+ If False, :meth:`ColumnTransformer.get_feature_names_out` will not
+ prefix any feature names and will error if feature names are not
+ unique.
+
+ force_int_remainder_cols: bool, default=True
+ Force the columns of the last entry of `transformers_`, which
+ corresponds to the "remainder" transformer, to always be stored as
+ indices (int) rather than column names (str). See description of the
+ `transformers_` attribute for details.
  """

  def __init__( # type: ignore[no-untyped-def]
@@ -187,6 +195,7 @@ class ColumnTransformer(BaseTransformer):
  transformer_weights=None,
  verbose=False,
  verbose_feature_names_out=True,
+ force_int_remainder_cols=True,
  input_cols: Optional[Union[str, Iterable[str]]] = None,
  output_cols: Optional[Union[str, Iterable[str]]] = None,
  label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -214,7 +223,8 @@ class ColumnTransformer(BaseTransformer):
  'n_jobs':(n_jobs, None, False),
  'transformer_weights':(transformer_weights, None, False),
  'verbose':(verbose, False, False),
- 'verbose_feature_names_out':(verbose_feature_names_out, True, False),}
+ 'verbose_feature_names_out':(verbose_feature_names_out, True, False),
+ 'force_int_remainder_cols':(force_int_remainder_cols, True, False),}
  cleaned_up_init_args = validate_sklearn_args(
  args=init_args,
  klass=sklearn.compose.ColumnTransformer
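The new `force_int_remainder_cols` argument mirrors the flag added upstream in scikit-learn 1.5. A hedged sketch of the behavior it controls, using scikit-learn directly (assumes scikit-learn >= 1.5; the DataFrame is made up):

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0], "c": ["x", "y"]})

ct = ColumnTransformer(
    [("scale", StandardScaler(), ["a", "b"])],
    remainder="passthrough",
    force_int_remainder_cols=True,  # remainder columns stored as int positions
)
ct.fit(df)

# The last transformers_ entry is the remainder; with the flag set its
# column list holds integer indices (e.g. [2]) rather than names (["c"]).
print(ct.transformers_[-1])
```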
snowflake/ml/modeling/compose/transformed_target_regressor.py
@@ -125,15 +125,16 @@ class TransformedTargetRegressor(BaseTransformer):

  func: function, default=None
  Function to apply to `y` before passing to :meth:`fit`. Cannot be set
- at the same time as `transformer`. The function needs to return a
- 2-dimensional array. If `func is None`, the function used will be the
- identity function.
+ at the same time as `transformer`. If `func is None`, the function used will be
+ the identity function. If `func` is set, `inverse_func` also needs to be
+ provided. The function needs to return a 2-dimensional array.

  inverse_func: function, default=None
  Function to apply to the prediction of the regressor. Cannot be set at
- the same time as `transformer`. The function needs to return a
- 2-dimensional array. The inverse function is used to return
- predictions to the same space of the original training labels.
+ the same time as `transformer`. The inverse function is used to return
+ predictions to the same space of the original training labels. If
+ `inverse_func` is set, `func` also needs to be provided. The inverse
+ function needs to return a 2-dimensional array.

  check_inverse: bool, default=True
  Whether to check that `transform` followed by `inverse_transform`
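The reworded docstrings encode a real constraint: `func` and `inverse_func` must now be supplied as a pair. A minimal sketch against scikit-learn directly, with a log/exp pair on synthetic data:

```python
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

X = np.arange(1, 11, dtype=float).reshape(-1, 1)
y = np.exp(0.1 * X.ravel())  # synthetic exponential target

# func and inverse_func go together; predictions come back in the
# original target space via inverse_func.
ttr = TransformedTargetRegressor(
    regressor=LinearRegression(), func=np.log, inverse_func=np.exp
).fit(X, y)
print(ttr.predict(X[:3]))
```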
snowflake/ml/modeling/covariance/elliptic_envelope.py
@@ -124,7 +124,7 @@ class EllipticEnvelope(BaseTransformer):
  support_fraction: float, default=None
  The proportion of points to be included in the support of the raw
  MCD estimate. If None, the minimum value of support_fraction will
- be used within the algorithm: `[n_sample + n_features + 1] / 2`.
+ be used within the algorithm: `(n_samples + n_features + 1) / 2 * n_samples`.
  Range is (0, 1).

  contamination: float, default=0.1
snowflake/ml/modeling/covariance/graphical_lasso_cv.py
@@ -129,7 +129,7 @@ class GraphicalLassoCV(BaseTransformer):
  - :term:`CV splitter`,
  - An iterable yielding (train, test) splits as arrays of indices.

- For integer/None inputs :class:`KFold` is used.
+ For integer/None inputs :class:`~sklearn.model_selection.KFold` is used.

  Refer :ref:`User Guide <cross_validation>` for the various
  cross-validation strategies that can be used here.
snowflake/ml/modeling/covariance/min_cov_det.py
@@ -125,8 +125,8 @@ class MinCovDet(BaseTransformer):
  The proportion of points to be included in the support of the raw
  MCD estimate. Default is None, which implies that the minimum
  value of support_fraction will be used within the algorithm:
- `(n_sample + n_features + 1) / 2`. The parameter must be in the range
- (0, 1].
+ `(n_samples + n_features + 1) / 2 * n_samples`. The parameter must be
+ in the range (0, 1].

  random_state: int, RandomState instance or None, default=None
  Determines the pseudo random number generator for shuffling the data.
snowflake/ml/modeling/covariance/oas.py
@@ -58,7 +58,7 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.covariance".replace("skl
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

  class OAS(BaseTransformer):
- r"""Oracle Approximating Shrinkage Estimator as proposed in [1]_
+ r"""Oracle Approximating Shrinkage Estimator
  For more details on this class, see [sklearn.covariance.OAS]
  (https://scikit-learn.org/stable/modules/generated/sklearn.covariance.OAS.html)

snowflake/ml/modeling/decomposition/kernel_pca.py
@@ -58,7 +58,7 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.decomposition".replace("
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

  class KernelPCA(BaseTransformer):
- r"""Kernel Principal component analysis (KPCA) [1]_
+ r"""Kernel Principal component analysis (KPCA)
  For more details on this class, see [sklearn.decomposition.KernelPCA]
  (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html)

@@ -119,7 +119,7 @@ class KernelPCA(BaseTransformer):
  Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other
  kernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``.

- degree: int, default=3
+ degree: float, default=3
  Degree for poly kernels. Ignored by other kernels.

  coef0: float, default=1
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py
@@ -115,13 +115,9 @@ class MiniBatchDictionaryLearning(BaseTransformer):
  alpha: float, default=1
  Sparsity controlling parameter.

- n_iter: int, default=1000
- Total number of iterations over data batches to perform.
-
- max_iter: int, default=None
+ max_iter: int, default=1_000
  Maximum number of iterations over the complete dataset before
  stopping independently of any early stopping criterion heuristics.
- If ``max_iter`` is not None, ``n_iter`` is ignored.

  fit_algorithm: {'lars', 'cd'}, default='lars'
  The algorithm used:
@@ -204,15 +200,14 @@ class MiniBatchDictionaryLearning(BaseTransformer):

  tol: float, default=1e-3
  Control early stopping based on the norm of the differences in the
- dictionary between 2 steps. Used only if `max_iter` is not None.
+ dictionary between 2 steps.

  To disable early stopping based on changes in the dictionary, set
  `tol` to 0.0.

  max_no_improvement: int, default=10
  Control early stopping based on the consecutive number of mini batches
- that does not yield an improvement on the smoothed cost function. Used only if
- `max_iter` is not None.
+ that does not yield an improvement on the smoothed cost function.

  To disable convergence detection based on cost function, set
  `max_no_improvement` to None.
@@ -223,8 +218,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
  *,
  n_components=None,
  alpha=1,
- n_iter="deprecated",
- max_iter=None,
+ max_iter=1000,
  fit_algorithm="lars",
  n_jobs=None,
  batch_size=256,
@@ -265,8 +259,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):

  init_args = {'n_components':(n_components, None, False),
  'alpha':(alpha, 1, False),
- 'n_iter':(n_iter, "deprecated", False),
- 'max_iter':(max_iter, None, False),
+ 'max_iter':(max_iter, 1000, False),
  'fit_algorithm':(fit_algorithm, "lars", False),
  'n_jobs':(n_jobs, None, False),
  'batch_size':(batch_size, 256, False),
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py
@@ -121,13 +121,9 @@ class MiniBatchSparsePCA(BaseTransformer):
  Amount of ridge shrinkage to apply in order to improve
  conditioning when calling the transform method.

- n_iter: int, default=100
- Number of iterations to perform for each mini batch.
-
- max_iter: int, default=None
+ max_iter: int, default=1_000
  Maximum number of iterations over the complete dataset before
  stopping independently of any early stopping criterion heuristics.
- If `max_iter` is not `None`, `n_iter` is ignored.

  callback: callable, default=None
  Callable that gets invoked every five iterations.
@@ -163,15 +159,14 @@ class MiniBatchSparsePCA(BaseTransformer):

  tol: float, default=1e-3
  Control early stopping based on the norm of the differences in the
- dictionary between 2 steps. Used only if `max_iter` is not None.
+ dictionary between 2 steps.

  To disable early stopping based on changes in the dictionary, set
  `tol` to 0.0.

  max_no_improvement: int or None, default=10
  Control early stopping based on the consecutive number of mini batches
- that does not yield an improvement on the smoothed cost function. Used only if
- `max_iter` is not None.
+ that does not yield an improvement on the smoothed cost function.

  To disable convergence detection based on cost function, set
  `max_no_improvement` to `None`.
@@ -183,8 +178,7 @@ class MiniBatchSparsePCA(BaseTransformer):
  n_components=None,
  alpha=1,
  ridge_alpha=0.01,
- n_iter="deprecated",
- max_iter=None,
+ max_iter=1000,
  callback=None,
  batch_size=3,
  verbose=False,
@@ -218,8 +212,7 @@ class MiniBatchSparsePCA(BaseTransformer):
  init_args = {'n_components':(n_components, None, False),
  'alpha':(alpha, 1, False),
  'ridge_alpha':(ridge_alpha, 0.01, False),
- 'n_iter':(n_iter, "deprecated", False),
- 'max_iter':(max_iter, None, False),
+ 'max_iter':(max_iter, 1000, False),
  'callback':(callback, None, False),
  'batch_size':(batch_size, 3, False),
  'verbose':(verbose, False, False),
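Both MiniBatch decomposition wrappers drop the `n_iter` keyword, which upstream scikit-learn deprecated and then removed (in 1.4) in favor of `max_iter`, now defaulting to 1000. A before/after sketch on synthetic data; code still passing `n_iter` will break after this upgrade:

```python
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning, MiniBatchSparsePCA

X = np.random.RandomState(0).rand(200, 30)  # synthetic data

# Before: MiniBatchSparsePCA(n_components=5, n_iter=100)   # now a TypeError
# After: max_iter counts passes over the complete dataset.
mbspca = MiniBatchSparsePCA(n_components=5, max_iter=500, random_state=0).fit(X)
mbdl = MiniBatchDictionaryLearning(n_components=5, max_iter=500, random_state=0).fit(X)
```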
snowflake/ml/modeling/decomposition/pca.py
@@ -145,23 +145,36 @@ class PCA(BaseTransformer):
  improve the predictive accuracy of the downstream estimators by
  making their data respect some hard-wired assumptions.

- svd_solver: {'auto', 'full', 'arpack', 'randomized'}, default='auto'
- If auto :
- The solver is selected by a default policy based on `X.shape` and
- `n_components`: if the input data is larger than 500x500 and the
- number of components to extract is lower than 80% of the smallest
- dimension of the data, then the more efficient 'randomized'
- method is enabled. Otherwise the exact full SVD is computed and
- optionally truncated afterwards.
- If full :
- run exact full SVD calling the standard LAPACK solver via
+ svd_solver: {'auto', 'full', 'covariance_eigh', 'arpack', 'randomized'}, default='auto'
+ "auto" :
+ The solver is selected by a default 'auto' policy is based on `X.shape` and
+ `n_components`: if the input data has fewer than 1000 features and
+ more than 10 times as many samples, then the "covariance_eigh"
+ solver is used. Otherwise, if the input data is larger than 500x500
+ and the number of components to extract is lower than 80% of the
+ smallest dimension of the data, then the more efficient
+ "randomized" method is selected. Otherwise the exact "full" SVD is
+ computed and optionally truncated afterwards.
+ "full" :
+ Run exact full SVD calling the standard LAPACK solver via
  `scipy.linalg.svd` and select the components by postprocessing
- If arpack :
- run SVD truncated to n_components calling ARPACK solver via
+ "covariance_eigh" :
+ Precompute the covariance matrix (on centered data), run a
+ classical eigenvalue decomposition on the covariance matrix
+ typically using LAPACK and select the components by postprocessing.
+ This solver is very efficient for n_samples >> n_features and small
+ n_features. It is, however, not tractable otherwise for large
+ n_features (large memory footprint required to materialize the
+ covariance matrix). Also note that compared to the "full" solver,
+ this solver effectively doubles the condition number and is
+ therefore less numerical stable (e.g. on input data with a large
+ range of singular values).
+ "arpack" :
+ Run SVD truncated to `n_components` calling ARPACK solver via
  `scipy.sparse.linalg.svds`. It requires strictly
- 0 < n_components < min(X.shape)
- If randomized :
- run randomized SVD by the method of Halko et al.
+ `0 < n_components < min(X.shape)`
+ "randomized" :
+ Run randomized SVD by the method of Halko et al.

  tol: float, default=0.0
  Tolerance for singular values computed by svd_solver == 'arpack'.
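The rewritten docstring documents the `"covariance_eigh"` solver that upstream scikit-learn 1.5 added for tall-and-narrow data (n_samples >> n_features). A minimal sketch using scikit-learn directly (assumes scikit-learn >= 1.5; shapes are illustrative):

```python
import numpy as np
from sklearn.decomposition import PCA

# Tall-and-narrow data is the case "covariance_eigh" is designed for.
X = np.random.RandomState(0).standard_normal((10_000, 20))

pca = PCA(n_components=5, svd_solver="covariance_eigh").fit(X)
print(pca.explained_variance_ratio_)

# svd_solver="auto" would pick the same solver here: fewer than 1000
# features and more than 10x as many samples.
```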
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py
@@ -128,6 +128,9 @@ class LinearDiscriminantAnalysis(BaseTransformer):
  This should be left to None if `covariance_estimator` is used.
  Note that shrinkage works only with 'lsqr' and 'eigen' solvers.

+ For a usage example, see
+ :ref:`sphx_glr_auto_examples_classification_plot_lda.py`.
+
  priors: array-like of shape (n_classes,), default=None
  The class prior probabilities. By default, the class proportions are
  inferred from the training data.
@@ -138,6 +141,9 @@ class LinearDiscriminantAnalysis(BaseTransformer):
  min(n_classes - 1, n_features). This parameter only affects the
  `transform` method.

+ For a usage example, see
+ :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py`.
+
  store_covariance: bool, default=False
  If True, explicitly compute the weighted within-class covariance
  matrix when solver is 'svd'. The matrix is always computed
snowflake/ml/modeling/ensemble/ada_boost_classifier.py
@@ -140,13 +140,6 @@ class AdaBoostClassifier(BaseTransformer):
  Thus, it is only used when `estimator` exposes a `random_state`.
  Pass an int for reproducible output across multiple function calls.
  See :term:`Glossary <random_state>`.
-
- base_estimator: object, default=None
- The base estimator from which the boosted ensemble is built.
- Support for sample weighting is required, as well as proper
- ``classes_`` and ``n_classes_`` attributes. If ``None``, then
- the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier`
- initialized with `max_depth=1`.
  """

  def __init__( # type: ignore[no-untyped-def]
@@ -157,7 +150,6 @@ class AdaBoostClassifier(BaseTransformer):
  learning_rate=1.0,
  algorithm="SAMME.R",
  random_state=None,
- base_estimator="deprecated",
  input_cols: Optional[Union[str, Iterable[str]]] = None,
  output_cols: Optional[Union[str, Iterable[str]]] = None,
  label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -177,16 +169,13 @@ class AdaBoostClassifier(BaseTransformer):
  self._batch_size = -1
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
  deps = deps | gather_dependencies(estimator)
- deps = deps | gather_dependencies(base_estimator)
  self._deps = list(deps)
  estimator = transform_snowml_obj_to_sklearn_obj(estimator)
- base_estimator = transform_snowml_obj_to_sklearn_obj(base_estimator)
  init_args = {'estimator':(estimator, None, False),
  'n_estimators':(n_estimators, 50, False),
  'learning_rate':(learning_rate, 1.0, False),
  'algorithm':(algorithm, "SAMME.R", False),
- 'random_state':(random_state, None, False),
- 'base_estimator':(base_estimator, "deprecated", False),}
+ 'random_state':(random_state, None, False),}
  cleaned_up_init_args = validate_sklearn_args(
  args=init_args,
  klass=sklearn.ensemble.AdaBoostClassifier
snowflake/ml/modeling/ensemble/ada_boost_regressor.py
@@ -138,12 +138,6 @@ class AdaBoostRegressor(BaseTransformer):
  `estimator` at each boosting iteration.
  Pass an int for reproducible output across multiple function calls.
  See :term:`Glossary <random_state>`.
-
- base_estimator: object, default=None
- The base estimator from which the boosted ensemble is built.
- If ``None``, then the base estimator is
- :class:`~sklearn.tree.DecisionTreeRegressor` initialized with
- `max_depth=3`.
  """

  def __init__( # type: ignore[no-untyped-def]
@@ -154,7 +148,6 @@ class AdaBoostRegressor(BaseTransformer):
  learning_rate=1.0,
  loss="linear",
  random_state=None,
- base_estimator="deprecated",
  input_cols: Optional[Union[str, Iterable[str]]] = None,
  output_cols: Optional[Union[str, Iterable[str]]] = None,
  label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -174,16 +167,13 @@ class AdaBoostRegressor(BaseTransformer):
  self._batch_size = -1
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
  deps = deps | gather_dependencies(estimator)
- deps = deps | gather_dependencies(base_estimator)
  self._deps = list(deps)
  estimator = transform_snowml_obj_to_sklearn_obj(estimator)
- base_estimator = transform_snowml_obj_to_sklearn_obj(base_estimator)
  init_args = {'estimator':(estimator, None, False),
  'n_estimators':(n_estimators, 50, False),
  'learning_rate':(learning_rate, 1.0, False),
  'loss':(loss, "linear", False),
- 'random_state':(random_state, None, False),
- 'base_estimator':(base_estimator, "deprecated", False),}
+ 'random_state':(random_state, None, False),}
  cleaned_up_init_args = validate_sklearn_args(
  args=init_args,
  klass=sklearn.ensemble.AdaBoostRegressor
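With `base_estimator` gone from both AdaBoost wrappers, callers must pass the wrapped model via `estimator` (renamed upstream in scikit-learn 1.2, removed in 1.4). A minimal migration sketch on synthetic data:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(random_state=0)  # synthetic data

# Before: AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1))
# After: the keyword is now `estimator`.
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=0
).fit(X, y)
print(clf.score(X, y))
```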
snowflake/ml/modeling/ensemble/bagging_classifier.py
@@ -164,9 +164,6 @@ class BaggingClassifier(BaseTransformer):

  verbose: int, default=0
  Controls the verbosity when fitting and predicting.
-
- base_estimator: object, default="deprecated"
- Use `estimator` instead.
  """

  def __init__( # type: ignore[no-untyped-def]
@@ -183,7 +180,6 @@ class BaggingClassifier(BaseTransformer):
  n_jobs=None,
  random_state=None,
  verbose=0,
- base_estimator="deprecated",
  input_cols: Optional[Union[str, Iterable[str]]] = None,
  output_cols: Optional[Union[str, Iterable[str]]] = None,
  label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -203,10 +199,8 @@ class BaggingClassifier(BaseTransformer):
  self._batch_size = -1
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
  deps = deps | gather_dependencies(estimator)
- deps = deps | gather_dependencies(base_estimator)
  self._deps = list(deps)
  estimator = transform_snowml_obj_to_sklearn_obj(estimator)
- base_estimator = transform_snowml_obj_to_sklearn_obj(base_estimator)
  init_args = {'estimator':(estimator, None, False),
  'n_estimators':(n_estimators, 10, False),
  'max_samples':(max_samples, 1.0, False),
@@ -217,8 +211,7 @@ class BaggingClassifier(BaseTransformer):
  'warm_start':(warm_start, False, False),
  'n_jobs':(n_jobs, None, False),
  'random_state':(random_state, None, False),
- 'verbose':(verbose, 0, False),
- 'base_estimator':(base_estimator, "deprecated", False),}
+ 'verbose':(verbose, 0, False),}
  cleaned_up_init_args = validate_sklearn_args(
  args=init_args,
  klass=sklearn.ensemble.BaggingClassifier
snowflake/ml/modeling/ensemble/bagging_regressor.py
@@ -164,9 +164,6 @@ class BaggingRegressor(BaseTransformer):

  verbose: int, default=0
  Controls the verbosity when fitting and predicting.
-
- base_estimator: object, default="deprecated"
- Use `estimator` instead.
  """

  def __init__( # type: ignore[no-untyped-def]
@@ -183,7 +180,6 @@ class BaggingRegressor(BaseTransformer):
  n_jobs=None,
  random_state=None,
  verbose=0,
- base_estimator="deprecated",
  input_cols: Optional[Union[str, Iterable[str]]] = None,
  output_cols: Optional[Union[str, Iterable[str]]] = None,
  label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -203,10 +199,8 @@ class BaggingRegressor(BaseTransformer):
  self._batch_size = -1
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
  deps = deps | gather_dependencies(estimator)
- deps = deps | gather_dependencies(base_estimator)
  self._deps = list(deps)
  estimator = transform_snowml_obj_to_sklearn_obj(estimator)
- base_estimator = transform_snowml_obj_to_sklearn_obj(base_estimator)
  init_args = {'estimator':(estimator, None, False),
  'n_estimators':(n_estimators, 10, False),
  'max_samples':(max_samples, 1.0, False),
@@ -217,8 +211,7 @@ class BaggingRegressor(BaseTransformer):
  'warm_start':(warm_start, False, False),
  'n_jobs':(n_jobs, None, False),
  'random_state':(random_state, None, False),
- 'verbose':(verbose, 0, False),
- 'base_estimator':(base_estimator, "deprecated", False),}
+ 'verbose':(verbose, 0, False),}
  cleaned_up_init_args = validate_sklearn_args(
  args=init_args,
  klass=sklearn.ensemble.BaggingRegressor
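The Bagging wrappers get the same treatment: `base_estimator` is removed in favor of `estimator`. A matching sketch for the regressor, again on synthetic data:

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, n_features=5, random_state=0)  # synthetic

# Before: BaggingRegressor(base_estimator=DecisionTreeRegressor())
# After: pass the wrapped model via `estimator`.
reg = BaggingRegressor(
    estimator=DecisionTreeRegressor(), n_estimators=10, random_state=0
).fit(X, y)
print(reg.score(X, y))
```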