snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (187)
  1. snowflake/cortex/_complete.py +19 -0
  2. snowflake/ml/_internal/platform_capabilities.py +87 -0
  3. snowflake/ml/dataset/dataset.py +0 -1
  4. snowflake/ml/fileset/fileset.py +6 -0
  5. snowflake/ml/jobs/__init__.py +21 -0
  6. snowflake/ml/jobs/_utils/constants.py +51 -0
  7. snowflake/ml/jobs/_utils/payload_utils.py +352 -0
  8. snowflake/ml/jobs/_utils/spec_utils.py +298 -0
  9. snowflake/ml/jobs/_utils/types.py +39 -0
  10. snowflake/ml/jobs/decorators.py +91 -0
  11. snowflake/ml/jobs/job.py +113 -0
  12. snowflake/ml/jobs/manager.py +298 -0
  13. snowflake/ml/model/_client/ops/model_ops.py +11 -2
  14. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  15. snowflake/ml/model/_client/sql/service.py +13 -6
  16. snowflake/ml/model/_packager/model_handlers/_utils.py +12 -3
  17. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  18. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +1 -0
  19. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  20. snowflake/ml/model/_signatures/base_handler.py +1 -2
  21. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  22. snowflake/ml/model/_signatures/numpy_handler.py +6 -7
  23. snowflake/ml/model/_signatures/pandas_handler.py +2 -2
  24. snowflake/ml/model/_signatures/pytorch_handler.py +2 -5
  25. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  26. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  27. snowflake/ml/model/model_signature.py +17 -4
  28. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  29. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  30. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  31. snowflake/ml/modeling/cluster/birch.py +6 -3
  32. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  33. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  34. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  35. snowflake/ml/modeling/cluster/k_means.py +6 -3
  36. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  37. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  38. snowflake/ml/modeling/cluster/optics.py +6 -3
  39. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  40. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  41. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  42. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  43. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  44. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  45. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  46. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  47. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  48. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  49. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  50. snowflake/ml/modeling/covariance/oas.py +6 -3
  51. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  52. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  53. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  54. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  55. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  56. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  57. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  58. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  59. snowflake/ml/modeling/decomposition/pca.py +6 -3
  60. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  61. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  62. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  63. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  64. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  65. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  66. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  67. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  68. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  69. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  70. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  71. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  72. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  73. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  74. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  75. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  76. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  77. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  78. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  79. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  80. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  81. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  82. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  83. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  84. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  85. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  86. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  87. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  88. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  89. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  90. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  91. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  92. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  93. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  94. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  95. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  96. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  97. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  98. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  99. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  100. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  101. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  102. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  103. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  104. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  105. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  106. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  107. snowflake/ml/modeling/linear_model/lars.py +6 -3
  108. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  109. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  110. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  111. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  112. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  113. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  114. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  115. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  116. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  117. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  118. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  119. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  120. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  121. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  122. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  123. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  124. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  125. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  126. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  127. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  128. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  129. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  130. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  131. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  132. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  133. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  134. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  135. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  136. snowflake/ml/modeling/manifold/isomap.py +6 -3
  137. snowflake/ml/modeling/manifold/mds.py +6 -3
  138. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  139. snowflake/ml/modeling/manifold/tsne.py +6 -3
  140. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  141. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  142. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  143. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  144. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  145. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  146. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  147. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  148. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  149. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  150. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  151. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  152. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  153. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  154. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  155. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  156. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  157. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  158. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  159. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  160. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  161. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  162. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  163. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  164. snowflake/ml/modeling/pipeline/pipeline.py +10 -2
  165. snowflake/ml/modeling/preprocessing/polynomial_features.py +6 -3
  166. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  167. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  168. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  169. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  170. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  171. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  172. snowflake/ml/modeling/svm/svc.py +6 -3
  173. snowflake/ml/modeling/svm/svr.py +6 -3
  174. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  175. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  176. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  177. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  178. snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
  179. snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
  180. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
  181. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
  182. snowflake/ml/version.py +1 -1
  183. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/METADATA +29 -14
  184. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/RECORD +187 -178
  185. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/LICENSE.txt +0 -0
  186. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/WHEEL +0 -0
  187. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/feature_selection/select_k_best.py
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SelectKBest(BaseTransformer):
     r"""Select features according to the k highest scores
     For more details on this class, see [sklearn.feature_selection.SelectKBest]
@@ -411,7 +414,7 @@ class SelectKBest(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1068,7 +1071,7 @@ class SelectKBest(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1076,7 +1079,7 @@ class SelectKBest(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
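
Every hunk in this diff applies the same change: the generated estimators now cap the rows fed to signature inference at INFER_SIGNATURE_MAX_ROWS (100) via the newly imported _truncate_data helper, instead of scanning the whole training dataset. Below is a minimal sketch of what such a helper plausibly does; the real implementation lives in snowflake.ml.model.model_signature (+17 -4 in this diff), and everything here beyond the two names taken from the hunks is an assumption.

from typing import Union

import pandas as pd
import snowflake.snowpark as snowpark

INFER_SIGNATURE_MAX_ROWS = 100  # value taken from the hunks above

def truncate_data_sketch(
    data: Union[pd.DataFrame, snowpark.DataFrame],
    max_rows: int = INFER_SIGNATURE_MAX_ROWS,
) -> Union[pd.DataFrame, snowpark.DataFrame]:
    # Hypothetical re-creation: cap the rows used for type inference.
    if isinstance(data, pd.DataFrame):
        return data.head(max_rows)
    # Snowpark DataFrames are lazy; limit() avoids materializing the
    # full table just to look at column types.
    return data.limit(max_rows)

The remaining file sections below repeat this exact pattern for each generated estimator.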
snowflake/ml/modeling/feature_selection/select_percentile.py
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SelectPercentile(BaseTransformer):
     r"""Select features according to a percentile of the highest scores
     For more details on this class, see [sklearn.feature_selection.SelectPercentile]
@@ -410,7 +413,7 @@ class SelectPercentile(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1067,7 +1070,7 @@ class SelectPercentile(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1075,7 +1078,7 @@ class SelectPercentile(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
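
The unchanged comments in these hunks spell out when output types may be copied from input types: all inferred types must agree, and the transform must preserve the column count; otherwise the code falls back to a variant/ARRAY type. A condensed, hypothetical paraphrase of that decision, not the package's actual code:

from typing import List

def expected_output_dtype_sketch(output_types: List[str], n_input_cols: int, n_output_cols: int) -> str:
    # Copy the (single) inferred type only when every column agrees and
    # the transform keeps the column count; otherwise fall back to array.
    if len(set(output_types)) == 1 and n_input_cols == n_output_cols:
        return output_types[0]
    return "array"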
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SequentialFeatureSelector(BaseTransformer):
     r"""Transformer that performs Sequential Feature Selection
     For more details on this class, see [sklearn.feature_selection.SequentialFeatureSelector]
@@ -472,7 +475,7 @@ class SequentialFeatureSelector(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
            # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1129,7 +1132,7 @@ class SequentialFeatureSelector(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1137,7 +1140,7 @@ class SequentialFeatureSelector(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/feature_selection/variance_threshold.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class VarianceThreshold(BaseTransformer):
     r"""Feature selector that removes all low-variance features
     For more details on this class, see [sklearn.feature_selection.VarianceThreshold]
@@ -403,7 +406,7 @@ class VarianceThreshold(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1060,7 +1063,7 @@ class VarianceThreshold(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1068,7 +1071,7 @@ class VarianceThreshold(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.gaussian_process".replac
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class GaussianProcessClassifier(BaseTransformer):
     r"""Gaussian process classification (GPC) based on Laplace approximation
     For more details on this class, see [sklearn.gaussian_process.GaussianProcessClassifier]
@@ -496,7 +499,7 @@ class GaussianProcessClassifier(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1157,7 +1160,7 @@ class GaussianProcessClassifier(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1165,7 +1168,7 @@ class GaussianProcessClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.gaussian_process".replac
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class GaussianProcessRegressor(BaseTransformer):
     r"""Gaussian process regression (GPR)
     For more details on this class, see [sklearn.gaussian_process.GaussianProcessRegressor]
@@ -487,7 +490,7 @@ class GaussianProcessRegressor(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1144,7 +1147,7 @@ class GaussianProcessRegressor(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1152,7 +1155,7 @@ class GaussianProcessRegressor(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/impute/iterative_imputer.py
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class IterativeImputer(BaseTransformer):
     r"""Multivariate imputer that estimates each feature from all the others
     For more details on this class, see [sklearn.impute.IterativeImputer]
@@ -531,7 +534,7 @@ class IterativeImputer(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1188,7 +1191,7 @@ class IterativeImputer(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1196,7 +1199,7 @@ class IterativeImputer(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/impute/knn_imputer.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class KNNImputer(BaseTransformer):
     r"""Imputation for completing missing values using k-Nearest Neighbors
     For more details on this class, see [sklearn.impute.KNNImputer]
@@ -457,7 +460,7 @@ class KNNImputer(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1114,7 +1117,7 @@ class KNNImputer(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1122,7 +1125,7 @@ class KNNImputer(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/impute/missing_indicator.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class MissingIndicator(BaseTransformer):
     r"""Binary indicators for missing values
     For more details on this class, see [sklearn.impute.MissingIndicator]
@@ -431,7 +434,7 @@ class MissingIndicator(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1088,7 +1091,7 @@ class MissingIndicator(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1096,7 +1099,7 @@ class MissingIndicator(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class AdditiveChi2Sampler(BaseTransformer):
     r"""Approximate feature map for additive chi2 kernel
     For more details on this class, see [sklearn.kernel_approximation.AdditiveChi2Sampler]
@@ -406,7 +409,7 @@ class AdditiveChi2Sampler(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1063,7 +1066,7 @@ class AdditiveChi2Sampler(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1071,7 +1074,7 @@ class AdditiveChi2Sampler(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/nystroem.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class Nystroem(BaseTransformer):
     r"""Approximate a kernel map using a subset of the training data
     For more details on this class, see [sklearn.kernel_approximation.Nystroem]
@@ -454,7 +457,7 @@ class Nystroem(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1111,7 +1114,7 @@ class Nystroem(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1119,7 +1122,7 @@ class Nystroem(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class PolynomialCountSketch(BaseTransformer):
     r"""Polynomial kernel approximation via Tensor Sketch
     For more details on this class, see [sklearn.kernel_approximation.PolynomialCountSketch]
@@ -430,7 +433,7 @@ class PolynomialCountSketch(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1087,7 +1090,7 @@ class PolynomialCountSketch(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1095,7 +1098,7 @@ class PolynomialCountSketch(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class RBFSampler(BaseTransformer):
     r"""Approximate a RBF kernel feature map using random Fourier features
     For more details on this class, see [sklearn.kernel_approximation.RBFSampler]
@@ -417,7 +420,7 @@ class RBFSampler(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1074,7 +1077,7 @@ class RBFSampler(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1082,7 +1085,7 @@ class RBFSampler(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SkewedChi2Sampler(BaseTransformer):
     r"""Approximate feature map for "skewed chi-squared" kernel
     For more details on this class, see [sklearn.kernel_approximation.SkewedChi2Sampler]
@@ -415,7 +418,7 @@ class SkewedChi2Sampler(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1072,7 +1075,7 @@ class SkewedChi2Sampler(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1080,7 +1083,7 @@ class SkewedChi2Sampler(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
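
Why a 100-row cap is generally safe for type inference (a hypothetical demonstration, not code from the package): per-column dtypes inferred from a small head sample match those of the full frame whenever each column's dtype is homogeneous, and the sample is far cheaper to scan.

import numpy as np
import pandas as pd

# One million rows, three float columns.
big = pd.DataFrame(np.random.rand(1_000_000, 3), columns=["A", "B", "C"])

# Dtypes inferred from the first 100 rows agree with the full frame.
sample = big.head(100)
assert list(sample.dtypes) == list(big.dtypes)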