snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208) hide show
  1. snowflake/cortex/_complete.py +19 -0
  2. snowflake/ml/_internal/env_utils.py +64 -21
  3. snowflake/ml/_internal/platform_capabilities.py +87 -0
  4. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  5. snowflake/ml/_internal/telemetry.py +21 -0
  6. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  7. snowflake/ml/dataset/dataset.py +0 -1
  8. snowflake/ml/feature_store/feature_store.py +18 -0
  9. snowflake/ml/feature_store/feature_view.py +46 -1
  10. snowflake/ml/fileset/fileset.py +6 -0
  11. snowflake/ml/jobs/__init__.py +21 -0
  12. snowflake/ml/jobs/_utils/constants.py +57 -0
  13. snowflake/ml/jobs/_utils/payload_utils.py +438 -0
  14. snowflake/ml/jobs/_utils/spec_utils.py +296 -0
  15. snowflake/ml/jobs/_utils/types.py +39 -0
  16. snowflake/ml/jobs/decorators.py +71 -0
  17. snowflake/ml/jobs/job.py +113 -0
  18. snowflake/ml/jobs/manager.py +298 -0
  19. snowflake/ml/model/_client/ops/model_ops.py +11 -2
  20. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  21. snowflake/ml/model/_client/sql/service.py +13 -6
  22. snowflake/ml/model/_packager/model_env/model_env.py +45 -28
  23. snowflake/ml/model/_packager/model_handlers/_utils.py +19 -6
  24. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  25. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +17 -0
  26. snowflake/ml/model/_packager/model_handlers/keras.py +230 -0
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -0
  28. snowflake/ml/model/_packager/model_handlers/sklearn.py +28 -3
  29. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +74 -21
  30. snowflake/ml/model/_packager/model_handlers/tensorflow.py +27 -49
  31. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  32. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -1
  33. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +3 -0
  34. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  35. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -1
  36. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  37. snowflake/ml/model/_signatures/base_handler.py +1 -2
  38. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  39. snowflake/ml/model/_signatures/core.py +2 -2
  40. snowflake/ml/model/_signatures/numpy_handler.py +11 -12
  41. snowflake/ml/model/_signatures/pandas_handler.py +11 -9
  42. snowflake/ml/model/_signatures/pytorch_handler.py +3 -6
  43. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  44. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  45. snowflake/ml/model/model_signature.py +25 -4
  46. snowflake/ml/model/type_hints.py +15 -0
  47. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  51. snowflake/ml/modeling/cluster/birch.py +6 -3
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  53. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  55. snowflake/ml/modeling/cluster/k_means.py +6 -3
  56. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  58. snowflake/ml/modeling/cluster/optics.py +6 -3
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  62. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  69. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  70. snowflake/ml/modeling/covariance/oas.py +6 -3
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  74. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  79. snowflake/ml/modeling/decomposition/pca.py +6 -3
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  108. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  110. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  111. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  112. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  113. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  114. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  115. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  116. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  117. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  118. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  119. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  120. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  121. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  122. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  123. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  124. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  125. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  126. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  127. snowflake/ml/modeling/linear_model/lars.py +6 -3
  128. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  129. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  130. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  131. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  132. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  133. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  134. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  135. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  136. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  137. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  138. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  139. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  140. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  141. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  142. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  143. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  144. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  145. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  146. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  147. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  148. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  149. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  150. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  151. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  152. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  153. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  154. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  155. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  156. snowflake/ml/modeling/manifold/isomap.py +6 -3
  157. snowflake/ml/modeling/manifold/mds.py +6 -3
  158. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  159. snowflake/ml/modeling/manifold/tsne.py +6 -3
  160. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  161. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  162. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  163. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  164. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  165. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  166. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  167. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  168. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  169. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  170. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  171. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  172. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  173. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  174. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  175. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  176. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  177. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  178. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  179. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  180. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  181. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  182. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  183. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  184. snowflake/ml/modeling/pipeline/pipeline.py +28 -3
  185. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -5
  186. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  187. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  188. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  189. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  190. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  191. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  192. snowflake/ml/modeling/svm/svc.py +6 -3
  193. snowflake/ml/modeling/svm/svr.py +6 -3
  194. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  195. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  196. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  197. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  198. snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
  199. snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
  200. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
  201. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
  202. snowflake/ml/registry/registry.py +34 -4
  203. snowflake/ml/version.py +1 -1
  204. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/METADATA +81 -33
  205. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/RECORD +208 -196
  206. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/WHEEL +1 -1
  207. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/LICENSE.txt +0 -0
  208. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/top_level.txt +0 -0
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class RadiusNeighborsClassifier(BaseTransformer):
61
64
  r"""Classifier implementing a vote among neighbors within a given radius
62
65
  For more details on this class, see [sklearn.neighbors.RadiusNeighborsClassifier]
@@ -495,7 +498,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
495
498
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
496
499
  expected_dtype = "array"
497
500
  else:
498
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
501
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
499
502
  # We can only infer the output types from the input types if the following two statemetns are true:
500
503
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
501
504
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1156,7 +1159,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
1156
1159
 
1157
1160
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1158
1161
 
1159
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1162
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1160
1163
  outputs: List[BaseFeatureSpec] = []
1161
1164
  if hasattr(self, "predict"):
1162
1165
  # keep mypy happy
@@ -1164,7 +1167,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
1164
1167
  # For classifier, the type of predict is the same as the type of label
1165
1168
  if self._sklearn_object._estimator_type == "classifier":
1166
1169
  # label columns is the desired type for output
1167
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1170
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1168
1171
  # rename the output columns
1169
1172
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1170
1173
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class RadiusNeighborsRegressor(BaseTransformer):
61
64
  r"""Regression based on neighbors within a fixed radius
62
65
  For more details on this class, see [sklearn.neighbors.RadiusNeighborsRegressor]
@@ -480,7 +483,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
480
483
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
481
484
  expected_dtype = "array"
482
485
  else:
483
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
486
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
484
487
  # We can only infer the output types from the input types if the following two statemetns are true:
485
488
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
486
489
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1137,7 +1140,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
1137
1140
 
1138
1141
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1139
1142
 
1140
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1143
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1141
1144
  outputs: List[BaseFeatureSpec] = []
1142
1145
  if hasattr(self, "predict"):
1143
1146
  # keep mypy happy
@@ -1145,7 +1148,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
1145
1148
  # For classifier, the type of predict is the same as the type of label
1146
1149
  if self._sklearn_object._estimator_type == "classifier":
1147
1150
  # label columns is the desired type for output
1148
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1151
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1149
1152
  # rename the output columns
1150
1153
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1151
1154
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neural_network".replace(
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class BernoulliRBM(BaseTransformer):
61
64
  r"""Bernoulli Restricted Boltzmann Machine (RBM)
62
65
  For more details on this class, see [sklearn.neural_network.BernoulliRBM]
@@ -439,7 +442,7 @@ class BernoulliRBM(BaseTransformer):
439
442
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
440
443
  expected_dtype = "array"
441
444
  else:
442
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
445
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
443
446
  # We can only infer the output types from the input types if the following two statemetns are true:
444
447
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
445
448
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1098,7 +1101,7 @@ class BernoulliRBM(BaseTransformer):
1098
1101
 
1099
1102
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1100
1103
 
1101
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1104
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1102
1105
  outputs: List[BaseFeatureSpec] = []
1103
1106
  if hasattr(self, "predict"):
1104
1107
  # keep mypy happy
@@ -1106,7 +1109,7 @@ class BernoulliRBM(BaseTransformer):
1106
1109
  # For classifier, the type of predict is the same as the type of label
1107
1110
  if self._sklearn_object._estimator_type == "classifier":
1108
1111
  # label columns is the desired type for output
1109
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1112
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1110
1113
  # rename the output columns
1111
1114
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1112
1115
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neural_network".replace(
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class MLPClassifier(BaseTransformer):
61
64
  r"""Multi-layer Perceptron classifier
62
65
  For more details on this class, see [sklearn.neural_network.MLPClassifier]
@@ -598,7 +601,7 @@ class MLPClassifier(BaseTransformer):
598
601
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
599
602
  expected_dtype = "array"
600
603
  else:
601
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
604
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
602
605
  # We can only infer the output types from the input types if the following two statemetns are true:
603
606
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
604
607
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1259,7 +1262,7 @@ class MLPClassifier(BaseTransformer):
1259
1262
 
1260
1263
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1261
1264
 
1262
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1265
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1263
1266
  outputs: List[BaseFeatureSpec] = []
1264
1267
  if hasattr(self, "predict"):
1265
1268
  # keep mypy happy
@@ -1267,7 +1270,7 @@ class MLPClassifier(BaseTransformer):
1267
1270
  # For classifier, the type of predict is the same as the type of label
1268
1271
  if self._sklearn_object._estimator_type == "classifier":
1269
1272
  # label columns is the desired type for output
1270
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1273
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1271
1274
  # rename the output columns
1272
1275
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1273
1276
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neural_network".replace(
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class MLPRegressor(BaseTransformer):
61
64
  r"""Multi-layer Perceptron regressor
62
65
  For more details on this class, see [sklearn.neural_network.MLPRegressor]
@@ -591,7 +594,7 @@ class MLPRegressor(BaseTransformer):
591
594
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
592
595
  expected_dtype = "array"
593
596
  else:
594
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
597
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
595
598
  # We can only infer the output types from the input types if the following two statemetns are true:
596
599
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
597
600
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1248,7 +1251,7 @@ class MLPRegressor(BaseTransformer):
1248
1251
 
1249
1252
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1250
1253
 
1251
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1254
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1252
1255
  outputs: List[BaseFeatureSpec] = []
1253
1256
  if hasattr(self, "predict"):
1254
1257
  # keep mypy happy
@@ -1256,7 +1259,7 @@ class MLPRegressor(BaseTransformer):
1256
1259
  # For classifier, the type of predict is the same as the type of label
1257
1260
  if self._sklearn_object._estimator_type == "classifier":
1258
1261
  # label columns is the desired type for output
1259
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1262
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1260
1263
  # rename the output columns
1261
1264
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1262
1265
  self._model_signature_dict["predict"] = ModelSignature(
@@ -20,7 +20,11 @@ from snowflake.ml._internal.exceptions import error_codes, exceptions
20
20
  from snowflake.ml._internal.lineage import lineage_utils
21
21
  from snowflake.ml._internal.utils import snowpark_dataframe_utils, temp_file_utils
22
22
  from snowflake.ml.data import data_source
23
- from snowflake.ml.model.model_signature import ModelSignature, _infer_signature
23
+ from snowflake.ml.model.model_signature import (
24
+ ModelSignature,
25
+ _infer_signature,
26
+ _truncate_data,
27
+ )
24
28
  from snowflake.ml.modeling._internal.model_transformer_builder import (
25
29
  ModelTransformerBuilder,
26
30
  )
@@ -31,6 +35,8 @@ from snowflake.snowpark._internal import utils as snowpark_utils
31
35
  _PROJECT = "ModelDevelopment"
32
36
  _SUBPROJECT = "Framework"
33
37
 
38
+ INFER_SIGNATURE_MAX_ROWS = 100
39
+
34
40
 
35
41
  def _final_step_has(attr: str) -> Callable[..., bool]:
36
42
  """Check that final_estimator has `attr`. Used together with `available_if` in `Pipeline`."""
@@ -848,6 +854,7 @@ class Pipeline(base.BaseTransformer):
848
854
  # Create a fitted sklearn pipeline object by translating each non-estimator step in pipeline with with
849
855
  # a fitted column transformer.
850
856
  sksteps = []
857
+ i = 0
851
858
  for i, (name, trans) in enumerate(self._get_transformers()):
852
859
  if isinstance(trans, base.BaseTransformer):
853
860
  trans = self._construct_fitted_column_transformer_object(
@@ -885,13 +892,31 @@ class Pipeline(base.BaseTransformer):
885
892
  self._model_signature_dict = dict()
886
893
 
887
894
  input_columns = self._get_sanitized_list_of_columns(dataset.columns)
888
- inputs_signature = _infer_signature(dataset[input_columns], "input", use_snowflake_identifiers=True)
895
+ inputs_signature = _infer_signature(
896
+ _truncate_data(dataset[input_columns], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True
897
+ )
889
898
 
890
899
  estimator_step = self._get_estimator()
891
900
  if estimator_step:
892
901
  estimator_signatures = estimator_step[1].model_signatures
893
902
  for method, signature in estimator_signatures.items():
894
- self._model_signature_dict[method] = ModelSignature(inputs=inputs_signature, outputs=signature.outputs)
903
+ # Add the inferred input signature to the model signature dictionary for each method
904
+ self._model_signature_dict[method] = ModelSignature(
905
+ inputs=inputs_signature,
906
+ outputs=(
907
+ # If _drop_input_cols is True, do not include any input columns in the output signature
908
+ []
909
+ if self._drop_input_cols
910
+ else [
911
+ # Include input columns in the output signature if they are not already present
912
+ # Those already present means they are overwritten by the output of the estimator
913
+ spec
914
+ for spec in inputs_signature
915
+ if spec.name not in [_spec.name for _spec in signature.outputs]
916
+ ]
917
+ )
918
+ + signature.outputs, # Append the existing output signature
919
+ )
895
920
 
896
921
  @property
897
922
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.preprocessing".replace("
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class PolynomialFeatures(BaseTransformer):
61
64
  r"""Generate polynomial and interaction features
62
65
  For more details on this class, see [sklearn.preprocessing.PolynomialFeatures]
@@ -334,7 +337,7 @@ class PolynomialFeatures(BaseTransformer):
334
337
  transform_kwargs: BatchInferenceKwargsTypedDict = dict()
335
338
 
336
339
  if isinstance(dataset, DataFrame):
337
- expected_type_inferred = ""
340
+ expected_type_inferred = "float"
338
341
  # when it is classifier, infer the datatype from label columns
339
342
  if expected_type_inferred == "" and 'predict' in self.model_signatures:
340
343
  # Batch inference takes a single expected output column type. Use the first columns type for now.
@@ -412,7 +415,7 @@ class PolynomialFeatures(BaseTransformer):
412
415
  # are specific to the type of dataset used.
413
416
  transform_kwargs: BatchInferenceKwargsTypedDict = dict()
414
417
  if isinstance(dataset, DataFrame):
415
- expected_dtype = ""
418
+ expected_dtype = "float"
416
419
  if False: # is child of _BaseHeterogeneousEnsemble
417
420
  # transform() method of HeterogeneousEnsemble estimators return responses of varying shapes
418
421
  # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
@@ -429,7 +432,7 @@ class PolynomialFeatures(BaseTransformer):
429
432
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
430
433
  expected_dtype = "array"
431
434
  else:
432
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
435
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
433
436
  # We can only infer the output types from the input types if the following two statements are true:
434
437
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
435
438
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1086,7 +1089,7 @@ class PolynomialFeatures(BaseTransformer):
1086
1089
 
1087
1090
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1088
1091
 
1089
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1092
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1090
1093
  outputs: List[BaseFeatureSpec] = []
1091
1094
  if hasattr(self, "predict"):
1092
1095
  # keep mypy happy
@@ -1094,7 +1097,7 @@ class PolynomialFeatures(BaseTransformer):
1094
1097
  # For classifier, the type of predict is the same as the type of label
1095
1098
  if self._sklearn_object._estimator_type == "classifier":
1096
1099
  # label columns is the desired type for output
1097
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1100
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1098
1101
  # rename the output columns
1099
1102
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1100
1103
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.semi_supervised".replace
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class LabelPropagation(BaseTransformer):
61
64
  r"""Label Propagation classifier
62
65
  For more details on this class, see [sklearn.semi_supervised.LabelPropagation]
@@ -433,7 +436,7 @@ class LabelPropagation(BaseTransformer):
433
436
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
434
437
  expected_dtype = "array"
435
438
  else:
436
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
439
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
437
440
  # We can only infer the output types from the input types if the following two statements are true:
438
441
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
439
442
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1094,7 +1097,7 @@ class LabelPropagation(BaseTransformer):
1094
1097
 
1095
1098
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1096
1099
 
1097
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1100
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1098
1101
  outputs: List[BaseFeatureSpec] = []
1099
1102
  if hasattr(self, "predict"):
1100
1103
  # keep mypy happy
@@ -1102,7 +1105,7 @@ class LabelPropagation(BaseTransformer):
1102
1105
  # For classifier, the type of predict is the same as the type of label
1103
1106
  if self._sklearn_object._estimator_type == "classifier":
1104
1107
  # label columns is the desired type for output
1105
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1108
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1106
1109
  # rename the output columns
1107
1110
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1108
1111
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.semi_supervised".replace
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class LabelSpreading(BaseTransformer):
61
64
  r"""LabelSpreading model for semi-supervised learning
62
65
  For more details on this class, see [sklearn.semi_supervised.LabelSpreading]
@@ -442,7 +445,7 @@ class LabelSpreading(BaseTransformer):
442
445
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
443
446
  expected_dtype = "array"
444
447
  else:
445
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
448
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
446
449
  # We can only infer the output types from the input types if the following two statements are true:
447
450
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
448
451
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1103,7 +1106,7 @@ class LabelSpreading(BaseTransformer):
1103
1106
 
1104
1107
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1105
1108
 
1106
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1109
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1107
1110
  outputs: List[BaseFeatureSpec] = []
1108
1111
  if hasattr(self, "predict"):
1109
1112
  # keep mypy happy
@@ -1111,7 +1114,7 @@ class LabelSpreading(BaseTransformer):
1111
1114
  # For classifier, the type of predict is the same as the type of label
1112
1115
  if self._sklearn_object._estimator_type == "classifier":
1113
1116
  # label columns is the desired type for output
1114
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1117
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1115
1118
  # rename the output columns
1116
1119
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1117
1120
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class LinearSVC(BaseTransformer):
61
64
  r"""Linear Support Vector Classification
62
65
  For more details on this class, see [sklearn.svm.LinearSVC]
@@ -507,7 +510,7 @@ class LinearSVC(BaseTransformer):
507
510
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
508
511
  expected_dtype = "array"
509
512
  else:
510
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
513
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
511
514
  # We can only infer the output types from the input types if the following two statements are true:
512
515
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
513
516
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1166,7 +1169,7 @@ class LinearSVC(BaseTransformer):
1166
1169
 
1167
1170
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1168
1171
 
1169
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1172
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1170
1173
  outputs: List[BaseFeatureSpec] = []
1171
1174
  if hasattr(self, "predict"):
1172
1175
  # keep mypy happy
@@ -1174,7 +1177,7 @@ class LinearSVC(BaseTransformer):
1174
1177
  # For classifier, the type of predict is the same as the type of label
1175
1178
  if self._sklearn_object._estimator_type == "classifier":
1176
1179
  # label columns is the desired type for output
1177
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1180
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1178
1181
  # rename the output columns
1179
1182
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1180
1183
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class LinearSVR(BaseTransformer):
61
64
  r"""Linear Support Vector Regression
62
65
  For more details on this class, see [sklearn.svm.LinearSVR]
@@ -476,7 +479,7 @@ class LinearSVR(BaseTransformer):
476
479
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
477
480
  expected_dtype = "array"
478
481
  else:
479
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
482
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
480
483
  # We can only infer the output types from the input types if the following two statements are true:
481
484
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
482
485
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1133,7 +1136,7 @@ class LinearSVR(BaseTransformer):
1133
1136
 
1134
1137
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1135
1138
 
1136
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1139
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1137
1140
  outputs: List[BaseFeatureSpec] = []
1138
1141
  if hasattr(self, "predict"):
1139
1142
  # keep mypy happy
@@ -1141,7 +1144,7 @@ class LinearSVR(BaseTransformer):
1141
1144
  # For classifier, the type of predict is the same as the type of label
1142
1145
  if self._sklearn_object._estimator_type == "classifier":
1143
1146
  # label columns is the desired type for output
1144
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1147
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1145
1148
  # rename the output columns
1146
1149
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1147
1150
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class NuSVC(BaseTransformer):
61
64
  r"""Nu-Support Vector Classification
62
65
  For more details on this class, see [sklearn.svm.NuSVC]
@@ -506,7 +509,7 @@ class NuSVC(BaseTransformer):
506
509
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
507
510
  expected_dtype = "array"
508
511
  else:
509
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
512
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
510
513
  # We can only infer the output types from the input types if the following two statements are true:
511
514
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
512
515
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1169,7 +1172,7 @@ class NuSVC(BaseTransformer):
1169
1172
 
1170
1173
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1171
1174
 
1172
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1175
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1173
1176
  outputs: List[BaseFeatureSpec] = []
1174
1177
  if hasattr(self, "predict"):
1175
1178
  # keep mypy happy
@@ -1177,7 +1180,7 @@ class NuSVC(BaseTransformer):
1177
1180
  # For classifier, the type of predict is the same as the type of label
1178
1181
  if self._sklearn_object._estimator_type == "classifier":
1179
1182
  # label columns is the desired type for output
1180
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1183
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1181
1184
  # rename the output columns
1182
1185
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1183
1186
  self._model_signature_dict["predict"] = ModelSignature(
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
37
37
  FeatureSpec,
38
38
  ModelSignature,
39
39
  _infer_signature,
40
+ _truncate_data,
40
41
  _rename_signature_with_snowflake_identifiers,
41
42
  )
42
43
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",
57
58
 
58
59
  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
59
60
 
61
+ INFER_SIGNATURE_MAX_ROWS = 100
62
+
60
63
  class NuSVR(BaseTransformer):
61
64
  r"""Nu Support Vector Regression
62
65
  For more details on this class, see [sklearn.svm.NuSVR]
@@ -467,7 +470,7 @@ class NuSVR(BaseTransformer):
467
470
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
468
471
  expected_dtype = "array"
469
472
  else:
470
- output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
473
+ output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
471
474
  # We can only infer the output types from the input types if the following two statements are true:
472
475
  # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
473
476
  # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1124,7 +1127,7 @@ class NuSVR(BaseTransformer):
1124
1127
 
1125
1128
  PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1126
1129
 
1127
- inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
1130
+ inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
1128
1131
  outputs: List[BaseFeatureSpec] = []
1129
1132
  if hasattr(self, "predict"):
1130
1133
  # keep mypy happy
@@ -1132,7 +1135,7 @@ class NuSVR(BaseTransformer):
1132
1135
  # For classifier, the type of predict is the same as the type of label
1133
1136
  if self._sklearn_object._estimator_type == "classifier":
1134
1137
  # label columns is the desired type for output
1135
- outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
1138
+ outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
1136
1139
  # rename the output columns
1137
1140
  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
1138
1141
  self._model_signature_dict["predict"] = ModelSignature(