snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. snowflake/cortex/_complete.py +19 -0
  2. snowflake/ml/_internal/env_utils.py +64 -21
  3. snowflake/ml/_internal/platform_capabilities.py +87 -0
  4. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  5. snowflake/ml/_internal/telemetry.py +21 -0
  6. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  7. snowflake/ml/dataset/dataset.py +0 -1
  8. snowflake/ml/feature_store/feature_store.py +18 -0
  9. snowflake/ml/feature_store/feature_view.py +46 -1
  10. snowflake/ml/fileset/fileset.py +6 -0
  11. snowflake/ml/jobs/__init__.py +21 -0
  12. snowflake/ml/jobs/_utils/constants.py +57 -0
  13. snowflake/ml/jobs/_utils/payload_utils.py +438 -0
  14. snowflake/ml/jobs/_utils/spec_utils.py +296 -0
  15. snowflake/ml/jobs/_utils/types.py +39 -0
  16. snowflake/ml/jobs/decorators.py +71 -0
  17. snowflake/ml/jobs/job.py +113 -0
  18. snowflake/ml/jobs/manager.py +298 -0
  19. snowflake/ml/model/_client/ops/model_ops.py +11 -2
  20. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  21. snowflake/ml/model/_client/sql/service.py +13 -6
  22. snowflake/ml/model/_packager/model_env/model_env.py +45 -28
  23. snowflake/ml/model/_packager/model_handlers/_utils.py +19 -6
  24. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  25. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +17 -0
  26. snowflake/ml/model/_packager/model_handlers/keras.py +230 -0
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -0
  28. snowflake/ml/model/_packager/model_handlers/sklearn.py +28 -3
  29. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +74 -21
  30. snowflake/ml/model/_packager/model_handlers/tensorflow.py +27 -49
  31. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  32. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -1
  33. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +3 -0
  34. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  35. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -1
  36. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  37. snowflake/ml/model/_signatures/base_handler.py +1 -2
  38. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  39. snowflake/ml/model/_signatures/core.py +2 -2
  40. snowflake/ml/model/_signatures/numpy_handler.py +11 -12
  41. snowflake/ml/model/_signatures/pandas_handler.py +11 -9
  42. snowflake/ml/model/_signatures/pytorch_handler.py +3 -6
  43. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  44. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  45. snowflake/ml/model/model_signature.py +25 -4
  46. snowflake/ml/model/type_hints.py +15 -0
  47. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  51. snowflake/ml/modeling/cluster/birch.py +6 -3
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  53. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  55. snowflake/ml/modeling/cluster/k_means.py +6 -3
  56. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  58. snowflake/ml/modeling/cluster/optics.py +6 -3
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  62. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  69. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  70. snowflake/ml/modeling/covariance/oas.py +6 -3
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  74. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  79. snowflake/ml/modeling/decomposition/pca.py +6 -3
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  108. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  110. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  111. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  112. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  113. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  114. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  115. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  116. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  117. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  118. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  119. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  120. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  121. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  122. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  123. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  124. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  125. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  126. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  127. snowflake/ml/modeling/linear_model/lars.py +6 -3
  128. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  129. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  130. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  131. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  132. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  133. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  134. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  135. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  136. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  137. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  138. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  139. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  140. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  141. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  142. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  143. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  144. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  145. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  146. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  147. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  148. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  149. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  150. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  151. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  152. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  153. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  154. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  155. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  156. snowflake/ml/modeling/manifold/isomap.py +6 -3
  157. snowflake/ml/modeling/manifold/mds.py +6 -3
  158. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  159. snowflake/ml/modeling/manifold/tsne.py +6 -3
  160. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  161. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  162. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  163. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  164. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  165. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  166. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  167. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  168. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  169. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  170. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  171. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  172. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  173. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  174. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  175. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  176. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  177. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  178. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  179. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  180. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  181. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  182. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  183. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  184. snowflake/ml/modeling/pipeline/pipeline.py +28 -3
  185. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -5
  186. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  187. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  188. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  189. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  190. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  191. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  192. snowflake/ml/modeling/svm/svc.py +6 -3
  193. snowflake/ml/modeling/svm/svr.py +6 -3
  194. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  195. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  196. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  197. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  198. snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
  199. snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
  200. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
  201. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
  202. snowflake/ml/registry/registry.py +34 -4
  203. snowflake/ml/version.py +1 -1
  204. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/METADATA +81 -33
  205. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/RECORD +208 -196
  206. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/WHEEL +1 -1
  207. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/LICENSE.txt +0 -0
  208. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/top_level.txt +0 -0
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.linear_model".replace("s
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class SGDClassifier(BaseTransformer):
61
64
  r"""Linear classifiers (SVM, logistic regression, etc
62
65
  For more details on this class, see [sklearn.linear_model.SGDClassifier]
@@ -593,7 +596,7 @@ class SGDClassifier(BaseTransformer):
593
596
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
594
597
  expected_dtype = "array"
595
598
  else:
596
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
599
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
597
600
  # We can only infer the output types from the input types if the following two statements are true:
598
601
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
599
602
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1256,7 +1259,7 @@ class SGDClassifier(BaseTransformer):
1256
1259
 
1257
1260
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1258
1261
 
1259
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1262
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1260
1263
  outputs: List[BaseFeatureSpec] = []
1261
1264
  if hasattr(self, "predict"):
1262
1265
  # keep mypy happy
@@ -1264,7 +1267,7 @@ class SGDClassifier(BaseTransformer):
1264
1267
  # For classifier, the type of predict is the same as the type of label
1265
1268
  if self._sklearn_object._estimator_type == "classifier":
1266
1269
  # label columns is the desired type for output
1267
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1270
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1268
1271
  # rename the output columns
1269
1272
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1270
1273
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.linear_model".replace("s
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class SGDOneClassSVM(BaseTransformer):
61
64
  r"""Solves linear One-Class SVM using Stochastic Gradient Descent
62
65
  For more details on this class, see [sklearn.linear_model.SGDOneClassSVM]
@@ -497,7 +500,7 @@ class SGDOneClassSVM(BaseTransformer):
497
500
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
498
501
  expected_dtype = "array"
499
502
  else:
500
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
503
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
501
504
  # We can only infer the output types from the input types if the following two statements are true:
502
505
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
503
506
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1158,7 +1161,7 @@ class SGDOneClassSVM(BaseTransformer):
1158
1161
 
1159
1162
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1160
1163
 
1161
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1164
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1162
1165
  outputs: List[BaseFeatureSpec] = []
1163
1166
  if hasattr(self, "predict"):
1164
1167
  # keep mypy happy
@@ -1166,7 +1169,7 @@ class SGDOneClassSVM(BaseTransformer):
1166
1169
  # For classifier, the type of predict is the same as the type of label
1167
1170
  if self._sklearn_object._estimator_type == "classifier":
1168
1171
  # label columns is the desired type for output
1169
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1172
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1170
1173
  # rename the output columns
1171
1174
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1172
1175
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.linear_model".replace("s
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class SGDRegressor(BaseTransformer):
61
64
  r"""Linear model fitted by minimizing a regularized empirical loss with SGD
62
65
  For more details on this class, see [sklearn.linear_model.SGDRegressor]
@@ -568,7 +571,7 @@ class SGDRegressor(BaseTransformer):
568
571
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
569
572
  expected_dtype = "array"
570
573
  else:
571
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
574
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
572
575
  # We can only infer the output types from the input types if the following two statements are true:
573
576
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
574
577
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1225,7 +1228,7 @@ class SGDRegressor(BaseTransformer):
1225
1228
 
1226
1229
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1227
1230
 
1228
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1231
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1229
1232
  outputs: List[BaseFeatureSpec] = []
1230
1233
  if hasattr(self, "predict"):
1231
1234
  # keep mypy happy
@@ -1233,7 +1236,7 @@ class SGDRegressor(BaseTransformer):
1233
1236
  # For classifier, the type of predict is the same as the type of label
1234
1237
  if self._sklearn_object._estimator_type == "classifier":
1235
1238
  # label columns is the desired type for output
1236
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1239
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1237
1240
  # rename the output columns
1238
1241
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1239
1242
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.linear_model".replace("s
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class TheilSenRegressor(BaseTransformer):
61
64
  r"""Theil-Sen Estimator: robust multivariate regression model
62
65
  For more details on this class, see [sklearn.linear_model.TheilSenRegressor]
@@ -461,7 +464,7 @@ class TheilSenRegressor(BaseTransformer):
461
464
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
462
465
  expected_dtype = "array"
463
466
  else:
464
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
467
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
465
468
  # We can only infer the output types from the input types if the following two statements are true:
466
469
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
467
470
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1118,7 +1121,7 @@ class TheilSenRegressor(BaseTransformer):
1118
1121
 
1119
1122
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1120
1123
 
1121
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1124
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1122
1125
  outputs: List[BaseFeatureSpec] = []
1123
1126
  if hasattr(self, "predict"):
1124
1127
  # keep mypy happy
@@ -1126,7 +1129,7 @@ class TheilSenRegressor(BaseTransformer):
1126
1129
  # For classifier, the type of predict is the same as the type of label
1127
1130
  if self._sklearn_object._estimator_type == "classifier":
1128
1131
  # label columns is the desired type for output
1129
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1132
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1130
1133
  # rename the output columns
1131
1134
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1132
1135
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.linear_model".replace("s
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class TweedieRegressor(BaseTransformer):
61
64
  r"""Generalized Linear Model with a Tweedie distribution
62
65
  For more details on this class, see [sklearn.linear_model.TweedieRegressor]
@@ -487,7 +490,7 @@ class TweedieRegressor(BaseTransformer):
487
490
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
488
491
  expected_dtype = "array"
489
492
  else:
490
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
493
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
491
494
  # We can only infer the output types from the input types if the following two statements are true:
492
495
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
493
496
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1144,7 +1147,7 @@ class TweedieRegressor(BaseTransformer):
1144
1147
 
1145
1148
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1146
1149
 
1147
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1150
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1148
1151
  outputs: List[BaseFeatureSpec] = []
1149
1152
  if hasattr(self, "predict"):
1150
1153
  # keep mypy happy
@@ -1152,7 +1155,7 @@ class TweedieRegressor(BaseTransformer):
1152
1155
  # For classifier, the type of predict is the same as the type of label
1153
1156
  if self._sklearn_object._estimator_type == "classifier":
1154
1157
  # label columns is the desired type for output
1155
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1158
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1156
1159
  # rename the output columns
1157
1160
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1158
1161
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.manifold".replace("sklea
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class Isomap(BaseTransformer):
61
64
  r"""Isomap Embedding
62
65
  For more details on this class, see [sklearn.manifold.Isomap]
@@ -485,7 +488,7 @@ class Isomap(BaseTransformer):
485
488
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
486
489
  expected_dtype = "array"
487
490
  else:
488
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
491
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
489
492
  # We can only infer the output types from the input types if the following two statements are true:
490
493
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
491
494
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1142,7 +1145,7 @@ class Isomap(BaseTransformer):
1142
1145
 
1143
1146
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1144
1147
 
1145
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1148
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1146
1149
  outputs: List[BaseFeatureSpec] = []
1147
1150
  if hasattr(self, "predict"):
1148
1151
  # keep mypy happy
@@ -1150,7 +1153,7 @@ class Isomap(BaseTransformer):
1150
1153
  # For classifier, the type of predict is the same as the type of label
1151
1154
  if self._sklearn_object._estimator_type == "classifier":
1152
1155
  # label columns is the desired type for output
1153
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1156
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1154
1157
  # rename the output columns
1155
1158
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1156
1159
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.manifold".replace("sklea
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class MDS(BaseTransformer):
61
64
  r"""Multidimensional scaling
62
65
  For more details on this class, see [sklearn.manifold.MDS]
@@ -466,7 +469,7 @@ class MDS(BaseTransformer):
466
469
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
467
470
  expected_dtype = "array"
468
471
  else:
469
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
472
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
470
473
  # We can only infer the output types from the input types if the following two statements are true:
471
474
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
472
475
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1123,7 +1126,7 @@ class MDS(BaseTransformer):
1123
1126
 
1124
1127
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1125
1128
 
1126
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1129
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1127
1130
  outputs: List[BaseFeatureSpec] = []
1128
1131
  if hasattr(self, "predict"):
1129
1132
  # keep mypy happy
@@ -1131,7 +1134,7 @@ class MDS(BaseTransformer):
1131
1134
  # For classifier, the type of predict is the same as the type of label
1132
1135
  if self._sklearn_object._estimator_type == "classifier":
1133
1136
  # label columns is the desired type for output
1134
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1137
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1135
1138
  # rename the output columns
1136
1139
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1137
1140
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.manifold".replace("sklea
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class SpectralEmbedding(BaseTransformer):
61
64
  r"""Spectral embedding for non-linear dimensionality reduction
62
65
  For more details on this class, see [sklearn.manifold.SpectralEmbedding]
@@ -468,7 +471,7 @@ class SpectralEmbedding(BaseTransformer):
468
471
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
469
472
  expected_dtype = "array"
470
473
  else:
471
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
474
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
472
475
  # We can only infer the output types from the input types if the following two statements are true:
473
476
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
474
477
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1125,7 +1128,7 @@ class SpectralEmbedding(BaseTransformer):
1125
1128
 
1126
1129
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1127
1130
 
1128
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1131
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1129
1132
  outputs: List[BaseFeatureSpec] = []
1130
1133
  if hasattr(self, "predict"):
1131
1134
  # keep mypy happy
@@ -1133,7 +1136,7 @@ class SpectralEmbedding(BaseTransformer):
1133
1136
  # For classifier, the type of predict is the same as the type of label
1134
1137
  if self._sklearn_object._estimator_type == "classifier":
1135
1138
  # label columns is the desired type for output
1136
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1139
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1137
1140
  # rename the output columns
1138
1141
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1139
1142
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.manifold".replace("sklea
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class TSNE(BaseTransformer):
61
64
  r"""T-distributed Stochastic Neighbor Embedding
62
65
  For more details on this class, see [sklearn.manifold.TSNE]
@@ -533,7 +536,7 @@ class TSNE(BaseTransformer):
533
536
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
534
537
  expected_dtype = "array"
535
538
  else:
536
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
539
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
537
540
  # We can only infer the output types from the input types if the following two statements are true:
538
541
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
539
542
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1190,7 +1193,7 @@ class TSNE(BaseTransformer):
1190
1193
 
1191
1194
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1192
1195
 
1193
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1196
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1194
1197
  outputs: List[BaseFeatureSpec] = []
1195
1198
  if hasattr(self, "predict"):
1196
1199
  # keep mypy happy
@@ -1198,7 +1201,7 @@ class TSNE(BaseTransformer):
1198
1201
  # For classifier, the type of predict is the same as the type of label
1199
1202
  if self._sklearn_object._estimator_type == "classifier":
1200
1203
  # label columns is the desired type for output
1201
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1204
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1202
1205
  # rename the output columns
1203
1206
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1204
1207
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.mixture".replace("sklear
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class BayesianGaussianMixture(BaseTransformer):
61
64
  r"""Variational Bayesian estimation of a Gaussian mixture
62
65
  For more details on this class, see [sklearn.mixture.BayesianGaussianMixture]
@@ -532,7 +535,7 @@ class BayesianGaussianMixture(BaseTransformer):
532
535
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
533
536
  expected_dtype = "array"
534
537
  else:
535
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
538
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
536
539
  # We can only infer the output types from the input types if the following two statements are true:
537
540
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
538
541
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1197,7 +1200,7 @@ class BayesianGaussianMixture(BaseTransformer):
1197
1200
 
1198
1201
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1199
1202
 
1200
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1203
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1201
1204
  outputs: List[BaseFeatureSpec] = []
1202
1205
  if hasattr(self, "predict"):
1203
1206
  # keep mypy happy
@@ -1205,7 +1208,7 @@ class BayesianGaussianMixture(BaseTransformer):
1205
1208
  # For classifier, the type of predict is the same as the type of label
1206
1209
  if self._sklearn_object._estimator_type == "classifier":
1207
1210
  # label columns is the desired type for output
1208
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1211
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1209
1212
  # rename the output columns
1210
1213
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1211
1214
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.mixture".replace("sklear
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class GaussianMixture(BaseTransformer):
61
64
  r"""Gaussian Mixture
62
65
  For more details on this class, see [sklearn.mixture.GaussianMixture]
@@ -505,7 +508,7 @@ class GaussianMixture(BaseTransformer):
505
508
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
506
509
  expected_dtype = "array"
507
510
  else:
508
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
511
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
509
512
  # We can only infer the output types from the input types if the following two statements are true:
510
513
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
511
514
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1170,7 +1173,7 @@ class GaussianMixture(BaseTransformer):
1170
1173
 
1171
1174
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1172
1175
 
1173
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1176
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1174
1177
  outputs: List[BaseFeatureSpec] = []
1175
1178
  if hasattr(self, "predict"):
1176
1179
  # keep mypy happy
@@ -1178,7 +1181,7 @@ class GaussianMixture(BaseTransformer):
1178
1181
  # For classifier, the type of predict is the same as the type of label
1179
1182
  if self._sklearn_object._estimator_type == "classifier":
1180
1183
  # label columns is the desired type for output
1181
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1184
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1182
1185
  # rename the output columns
1183
1186
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1184
1187
  self._model_signature_dict["predict"] = ModelSignature(
@@ -21,6 +21,7 @@ from snowflake.ml.model.model_signature import (
21
21
  ModelSignature,
22
22
  _infer_signature,
23
23
  _rename_signature_with_snowflake_identifiers,
24
+ _truncate_data,
24
25
  )
25
26
  from snowflake.ml.modeling._internal.estimator_utils import (
26
27
  gather_dependencies,
@@ -47,6 +48,8 @@ _PROJECT = "ModelDevelopment"
47
48
  _SUBPROJECT = "ModelSelection"
48
49
  DEFAULT_UDTF_NJOBS = 3
49
50
 
51
+ INFER_SIGNATURE_MAX_ROWS = 100
52
+
50
53
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
51
54
 
52
55
 
@@ -810,7 +813,13 @@ class GridSearchCV(BaseTransformer):
810
813
 
811
814
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
812
815
 
813
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
816
+ inputs = list(
817
+ _infer_signature(
818
+ _truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS),
819
+ "input",
820
+ use_snowflake_identifiers=True,
821
+ )
822
+ )
814
823
  outputs: List[BaseFeatureSpec] = []
815
824
  if hasattr(self, "predict"):
816
825
  # keep mypy happy
@@ -818,7 +827,13 @@ class GridSearchCV(BaseTransformer):
818
827
  # For classifier, the type of predict is the same as the type of label
819
828
  if self._sklearn_object._estimator_type == "classifier":
820
829
  # label columns is the desired type for output
821
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
830
+ outputs = list(
831
+ _infer_signature(
832
+ _truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS),
833
+ "output",
834
+ use_snowflake_identifiers=True,
835
+ )
836
+ )
822
837
  # rename the output columns
823
838
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
824
839
  self._model_signature_dict["predict"] = ModelSignature(
@@ -18,6 +18,7 @@ from snowflake.ml.model.model_signature import (
18
18
  ModelSignature,
19
19
  _infer_signature,
20
20
  _rename_signature_with_snowflake_identifiers,
21
+ _truncate_data,
21
22
  )
22
23
  from snowflake.ml.modeling._internal.estimator_utils import (
23
24
  gather_dependencies,
@@ -44,6 +45,8 @@ _PROJECT = "ModelDevelopment"
44
45
  _SUBPROJECT = "ModelSelection"
45
46
  DEFAULT_UDTF_NJOBS = 3
46
47
 
48
+ INFER_SIGNATURE_MAX_ROWS = 100
49
+
47
50
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
48
51
 
49
52
 
@@ -825,7 +828,13 @@ class RandomizedSearchCV(BaseTransformer):
825
828
 
826
829
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
827
830
 
828
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
831
+ inputs = list(
832
+ _infer_signature(
833
+ _truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS),
834
+ "input",
835
+ use_snowflake_identifiers=True,
836
+ )
837
+ )
829
838
  outputs: List[BaseFeatureSpec] = []
830
839
  if hasattr(self, "predict"):
831
840
  # keep mypy happy
@@ -833,7 +842,13 @@ class RandomizedSearchCV(BaseTransformer):
833
842
  # For classifier, the type of predict is the same as the type of label
834
843
  if self._sklearn_object._estimator_type == "classifier":
835
844
  # label columns is the desired type for output
836
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
845
+ outputs = list(
846
+ _infer_signature(
847
+ _truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS),
848
+ "output",
849
+ use_snowflake_identifiers=True,
850
+ )
851
+ )
837
852
  # rename the output columns
838
853
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
839
854
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.multiclass".replace("skl
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class OneVsOneClassifier(BaseTransformer):
61
64
  r"""One-vs-one multiclass strategy
62
65
  For more details on this class, see [sklearn.multiclass.OneVsOneClassifier]
@@ -415,7 +418,7 @@ class OneVsOneClassifier(BaseTransformer):
415
418
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
416
419
  expected_dtype = "array"
417
420
  else:
418
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
421
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
419
422
  # We can only infer the output types from the input types if the following two statements are true:
420
423
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
421
424
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1074,7 +1077,7 @@ class OneVsOneClassifier(BaseTransformer):
1074
1077
 
1075
1078
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1076
1079
 
1077
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1080
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1078
1081
  outputs: List[BaseFeatureSpec] = []
1079
1082
  if hasattr(self, "predict"):
1080
1083
  # keep mypy happy
@@ -1082,7 +1085,7 @@ class OneVsOneClassifier(BaseTransformer):
1082
1085
  # For classifier, the type of predict is the same as the type of label
1083
1086
  if self._sklearn_object._estimator_type == "classifier":
1084
1087
  # label columns is the desired type for output
1085
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1088
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1086
1089
  # rename the output columns
1087
1090
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1088
1091
  self._model_signature_dict["predict"] = ModelSignature(