snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. snowflake/cortex/_complete.py +19 -0
  2. snowflake/ml/_internal/env_utils.py +64 -21
  3. snowflake/ml/_internal/platform_capabilities.py +87 -0
  4. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  5. snowflake/ml/_internal/telemetry.py +21 -0
  6. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  7. snowflake/ml/dataset/dataset.py +0 -1
  8. snowflake/ml/feature_store/feature_store.py +18 -0
  9. snowflake/ml/feature_store/feature_view.py +46 -1
  10. snowflake/ml/fileset/fileset.py +6 -0
  11. snowflake/ml/jobs/__init__.py +21 -0
  12. snowflake/ml/jobs/_utils/constants.py +57 -0
  13. snowflake/ml/jobs/_utils/payload_utils.py +438 -0
  14. snowflake/ml/jobs/_utils/spec_utils.py +296 -0
  15. snowflake/ml/jobs/_utils/types.py +39 -0
  16. snowflake/ml/jobs/decorators.py +71 -0
  17. snowflake/ml/jobs/job.py +113 -0
  18. snowflake/ml/jobs/manager.py +298 -0
  19. snowflake/ml/model/_client/ops/model_ops.py +11 -2
  20. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  21. snowflake/ml/model/_client/sql/service.py +13 -6
  22. snowflake/ml/model/_packager/model_env/model_env.py +45 -28
  23. snowflake/ml/model/_packager/model_handlers/_utils.py +19 -6
  24. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  25. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +17 -0
  26. snowflake/ml/model/_packager/model_handlers/keras.py +230 -0
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -0
  28. snowflake/ml/model/_packager/model_handlers/sklearn.py +28 -3
  29. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +74 -21
  30. snowflake/ml/model/_packager/model_handlers/tensorflow.py +27 -49
  31. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  32. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -1
  33. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +3 -0
  34. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  35. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -1
  36. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  37. snowflake/ml/model/_signatures/base_handler.py +1 -2
  38. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  39. snowflake/ml/model/_signatures/core.py +2 -2
  40. snowflake/ml/model/_signatures/numpy_handler.py +11 -12
  41. snowflake/ml/model/_signatures/pandas_handler.py +11 -9
  42. snowflake/ml/model/_signatures/pytorch_handler.py +3 -6
  43. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  44. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  45. snowflake/ml/model/model_signature.py +25 -4
  46. snowflake/ml/model/type_hints.py +15 -0
  47. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  51. snowflake/ml/modeling/cluster/birch.py +6 -3
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  53. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  55. snowflake/ml/modeling/cluster/k_means.py +6 -3
  56. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  58. snowflake/ml/modeling/cluster/optics.py +6 -3
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  62. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  69. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  70. snowflake/ml/modeling/covariance/oas.py +6 -3
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  74. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  79. snowflake/ml/modeling/decomposition/pca.py +6 -3
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  108. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  110. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  111. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  112. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  113. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  114. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  115. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  116. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  117. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  118. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  119. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  120. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  121. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  122. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  123. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  124. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  125. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  126. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  127. snowflake/ml/modeling/linear_model/lars.py +6 -3
  128. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  129. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  130. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  131. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  132. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  133. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  134. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  135. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  136. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  137. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  138. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  139. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  140. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  141. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  142. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  143. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  144. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  145. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  146. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  147. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  148. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  149. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  150. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  151. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  152. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  153. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  154. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  155. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  156. snowflake/ml/modeling/manifold/isomap.py +6 -3
  157. snowflake/ml/modeling/manifold/mds.py +6 -3
  158. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  159. snowflake/ml/modeling/manifold/tsne.py +6 -3
  160. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  161. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  162. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  163. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  164. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  165. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  166. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  167. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  168. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  169. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  170. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  171. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  172. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  173. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  174. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  175. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  176. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  177. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  178. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  179. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  180. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  181. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  182. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  183. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  184. snowflake/ml/modeling/pipeline/pipeline.py +28 -3
  185. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -5
  186. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  187. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  188. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  189. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  190. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  191. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  192. snowflake/ml/modeling/svm/svc.py +6 -3
  193. snowflake/ml/modeling/svm/svr.py +6 -3
  194. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  195. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  196. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  197. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  198. snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
  199. snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
  200. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
  201. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
  202. snowflake/ml/registry/registry.py +34 -4
  203. snowflake/ml/version.py +1 -1
  204. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/METADATA +81 -33
  205. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/RECORD +208 -196
  206. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/WHEEL +1 -1
  207. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/LICENSE.txt +0 -0
  208. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.gaussian_process".replac

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class GaussianProcessRegressor(BaseTransformer):
  r"""Gaussian process regression (GPR)
  For more details on this class, see [sklearn.gaussian_process.GaussianProcessRegressor]
@@ -487,7 +490,7 @@ class GaussianProcessRegressor(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1144,7 +1147,7 @@ class GaussianProcessRegressor(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1152,7 +1155,7 @@ class GaussianProcessRegressor(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
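Every one of the +6/-3 estimator hunks in this release follows the pattern above: `_infer_signature` now receives a row-capped view of the dataset rather than the full DataFrame, with the cap set by the new module-level constant INFER_SIGNATURE_MAX_ROWS = 100. A minimal sketch of the idea follows; the helper name `truncate_for_inference` and the head/limit split are our assumptions, since the real `_truncate_data` is imported from snowflake.ml.model.model_signature and its implementation is not part of this diff.

from typing import Union

import pandas as pd
from snowflake.snowpark import DataFrame

INFER_SIGNATURE_MAX_ROWS = 100  # the cap introduced by this release

def truncate_for_inference(
    data: Union[DataFrame, pd.DataFrame],
    max_rows: int = INFER_SIGNATURE_MAX_ROWS,
) -> Union[DataFrame, pd.DataFrame]:
    # Keep at most max_rows rows so type inference never scans a large dataset.
    if isinstance(data, pd.DataFrame):
        return data.head(max_rows)
    # Snowpark DataFrame: limit() pushes the row cap into the generated SQL.
    return data.limit(max_rows)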
snowflake/ml/modeling/impute/iterative_imputer.py
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class IterativeImputer(BaseTransformer):
  r"""Multivariate imputer that estimates each feature from all the others
  For more details on this class, see [sklearn.impute.IterativeImputer]
@@ -531,7 +534,7 @@ class IterativeImputer(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1188,7 +1191,7 @@ class IterativeImputer(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1196,7 +1199,7 @@ class IterativeImputer(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
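The comment block repeated in each expected_dtype hunk encodes a concrete fallback rule: inferred input types can stand in for output types only when all of them agree and the input and output column counts match; otherwise the code falls back to a variant/array column. A small illustrative sketch of that rule (the function name and string type labels here are ours, not the library's):

from typing import List, Optional

def output_type_from_inputs(input_types: List[str], n_output_cols: int) -> Optional[str]:
    # Rule from the diff comments: reuse the input type only if (1) all input
    # types are identical and (2) the column counts line up; otherwise return
    # None so the caller falls back to a variant/array column.
    if len(set(input_types)) == 1 and len(input_types) == n_output_cols:
        return input_types[0]
    return None

assert output_type_from_inputs(["FloatType", "FloatType", "FloatType"], 3) == "FloatType"
assert output_type_from_inputs(["FloatType", "StringType"], 2) is None  # mixed types
assert output_type_from_inputs(["FloatType", "FloatType"], 1) is None   # width mismatch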
snowflake/ml/modeling/impute/knn_imputer.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class KNNImputer(BaseTransformer):
  r"""Imputation for completing missing values using k-Nearest Neighbors
  For more details on this class, see [sklearn.impute.KNNImputer]
@@ -457,7 +460,7 @@ class KNNImputer(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1114,7 +1117,7 @@ class KNNImputer(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1122,7 +1125,7 @@ class KNNImputer(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
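The final hunk in each file shows the classifier branch of signature construction: predict's output features take their type from the label columns and are then renamed to the declared output columns. A self-contained sketch of that rename step, assuming a simplified FeatureSpec; the real FeatureSpec and model_signature_utils.rename_features come from snowflake.ml and may behave differently:

from dataclasses import dataclass
from typing import List

@dataclass
class FeatureSpec:  # simplified stand-in for snowflake.ml's FeatureSpec
    name: str
    dtype: str

def rename_features(features: List[FeatureSpec], names: List[str]) -> List[FeatureSpec]:
    # Assumed behavior: keep each feature's dtype, swap in the declared name.
    return [FeatureSpec(name=new, dtype=f.dtype) for f, new in zip(features, names)]

# For a classifier, predict's output keeps the label column's type:
labels = [FeatureSpec("TARGET", "StringType")]
outputs = rename_features(labels, ["OUTPUT_TARGET"])
assert outputs == [FeatureSpec("OUTPUT_TARGET", "StringType")]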
snowflake/ml/modeling/impute/missing_indicator.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class MissingIndicator(BaseTransformer):
  r"""Binary indicators for missing values
  For more details on this class, see [sklearn.impute.MissingIndicator]
@@ -431,7 +434,7 @@ class MissingIndicator(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1088,7 +1091,7 @@ class MissingIndicator(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1096,7 +1099,7 @@ class MissingIndicator(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class AdditiveChi2Sampler(BaseTransformer):
  r"""Approximate feature map for additive chi2 kernel
  For more details on this class, see [sklearn.kernel_approximation.AdditiveChi2Sampler]
@@ -406,7 +409,7 @@ class AdditiveChi2Sampler(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1063,7 +1066,7 @@ class AdditiveChi2Sampler(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1071,7 +1074,7 @@ class AdditiveChi2Sampler(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/nystroem.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class Nystroem(BaseTransformer):
  r"""Approximate a kernel map using a subset of the training data
  For more details on this class, see [sklearn.kernel_approximation.Nystroem]
@@ -454,7 +457,7 @@ class Nystroem(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1111,7 +1114,7 @@ class Nystroem(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1119,7 +1122,7 @@ class Nystroem(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class PolynomialCountSketch(BaseTransformer):
  r"""Polynomial kernel approximation via Tensor Sketch
  For more details on this class, see [sklearn.kernel_approximation.PolynomialCountSketch]
@@ -430,7 +433,7 @@ class PolynomialCountSketch(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1087,7 +1090,7 @@ class PolynomialCountSketch(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1095,7 +1098,7 @@ class PolynomialCountSketch(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class RBFSampler(BaseTransformer):
  r"""Approximate a RBF kernel feature map using random Fourier features
  For more details on this class, see [sklearn.kernel_approximation.RBFSampler]
@@ -417,7 +420,7 @@ class RBFSampler(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1074,7 +1077,7 @@ class RBFSampler(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1082,7 +1085,7 @@ class RBFSampler(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class SkewedChi2Sampler(BaseTransformer):
  r"""Approximate feature map for "skewed chi-squared" kernel
  For more details on this class, see [sklearn.kernel_approximation.SkewedChi2Sampler]
@@ -415,7 +418,7 @@ class SkewedChi2Sampler(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1072,7 +1075,7 @@ class SkewedChi2Sampler(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1080,7 +1083,7 @@ class SkewedChi2Sampler(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_ridge".replace("s

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class KernelRidge(BaseTransformer):
  r"""Kernel ridge regression
  For more details on this class, see [sklearn.kernel_ridge.KernelRidge]
@@ -449,7 +452,7 @@ class KernelRidge(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1106,7 +1109,7 @@ class KernelRidge(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1114,7 +1117,7 @@ class KernelRidge(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/lightgbm/lgbm_classifier.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "lightgbm".replace("sklearn.", ""

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class LGBMClassifier(BaseTransformer):
  r"""LightGBM classifier
  For more details on this class, see [lightgbm.LGBMClassifier]
@@ -436,7 +439,7 @@ class LGBMClassifier(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1097,7 +1100,7 @@ class LGBMClassifier(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1105,7 +1108,7 @@ class LGBMClassifier(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/lightgbm/lgbm_regressor.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "lightgbm".replace("sklearn.", ""

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class LGBMRegressor(BaseTransformer):
  r"""LightGBM regressor
  For more details on this class, see [lightgbm.LGBMRegressor]
@@ -436,7 +439,7 @@ class LGBMRegressor(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1093,7 +1096,7 @@ class LGBMRegressor(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1101,7 +1104,7 @@ class LGBMRegressor(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/linear_model/ard_regression.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.linear_model".replace("s

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class ARDRegression(BaseTransformer):
  r"""Bayesian ARD regression
  For more details on this class, see [sklearn.linear_model.ARDRegression]
@@ -458,7 +461,7 @@ class ARDRegression(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1115,7 +1118,7 @@ class ARDRegression(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1123,7 +1126,7 @@ class ARDRegression(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/linear_model/bayesian_ridge.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
  FeatureSpec,
  ModelSignature,
  _infer_signature,
+ _truncate_data,
  _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.linear_model".replace("s

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class BayesianRidge(BaseTransformer):
  r"""Bayesian ridge regression
  For more details on this class, see [sklearn.linear_model.BayesianRidge]
@@ -468,7 +471,7 @@ class BayesianRidge(BaseTransformer):
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
  expected_dtype = "array"
  else:
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
  # We can only infer the output types from the input types if the following two statemetns are true:
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1125,7 +1128,7 @@ class BayesianRidge(BaseTransformer):

  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
  outputs: List[BaseFeatureSpec] = []
  if hasattr(self, "predict"):
  # keep mypy happy
@@ -1133,7 +1136,7 @@ class BayesianRidge(BaseTransformer):
  # For classifier, the type of predict is the same as the type of label
  if self._sklearn_object._estimator_type == "classifier":
  # label columns is the desired type for output
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
  # rename the output columns
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
  self._model_signature_dict["predict"] = ModelSignature(