snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. snowflake/cortex/_complete.py +19 -0
  2. snowflake/ml/_internal/platform_capabilities.py +87 -0
  3. snowflake/ml/dataset/dataset.py +0 -1
  4. snowflake/ml/fileset/fileset.py +6 -0
  5. snowflake/ml/jobs/__init__.py +21 -0
  6. snowflake/ml/jobs/_utils/constants.py +51 -0
  7. snowflake/ml/jobs/_utils/payload_utils.py +352 -0
  8. snowflake/ml/jobs/_utils/spec_utils.py +298 -0
  9. snowflake/ml/jobs/_utils/types.py +39 -0
  10. snowflake/ml/jobs/decorators.py +91 -0
  11. snowflake/ml/jobs/job.py +113 -0
  12. snowflake/ml/jobs/manager.py +298 -0
  13. snowflake/ml/model/_client/ops/model_ops.py +11 -2
  14. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  15. snowflake/ml/model/_client/sql/service.py +13 -6
  16. snowflake/ml/model/_packager/model_handlers/_utils.py +12 -3
  17. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  18. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +1 -0
  19. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  20. snowflake/ml/model/_signatures/base_handler.py +1 -2
  21. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  22. snowflake/ml/model/_signatures/numpy_handler.py +6 -7
  23. snowflake/ml/model/_signatures/pandas_handler.py +2 -2
  24. snowflake/ml/model/_signatures/pytorch_handler.py +2 -5
  25. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  26. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  27. snowflake/ml/model/model_signature.py +17 -4
  28. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  29. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  30. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  31. snowflake/ml/modeling/cluster/birch.py +6 -3
  32. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  33. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  34. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  35. snowflake/ml/modeling/cluster/k_means.py +6 -3
  36. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  37. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  38. snowflake/ml/modeling/cluster/optics.py +6 -3
  39. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  40. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  41. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  42. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  43. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  44. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  45. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  46. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  47. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  48. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  49. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  50. snowflake/ml/modeling/covariance/oas.py +6 -3
  51. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  52. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  53. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  54. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  55. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  56. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  57. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  58. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  59. snowflake/ml/modeling/decomposition/pca.py +6 -3
  60. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  61. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  62. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  63. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  64. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  65. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  66. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  67. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  68. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  69. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  70. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  71. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  72. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  73. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  74. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  75. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  76. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  77. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  78. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  79. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  80. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  81. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  82. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  83. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  84. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  85. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  86. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  87. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  88. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  89. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  90. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  91. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  92. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  93. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  94. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  95. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  96. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  97. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  98. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  99. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  100. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  101. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  102. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  103. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  104. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  105. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  106. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  107. snowflake/ml/modeling/linear_model/lars.py +6 -3
  108. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  109. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  110. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  111. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  112. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  113. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  114. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  115. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  116. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  117. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  118. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  119. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  120. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  121. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  122. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  123. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  124. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  125. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  126. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  127. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  128. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  129. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  130. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  131. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  132. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  133. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  134. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  135. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  136. snowflake/ml/modeling/manifold/isomap.py +6 -3
  137. snowflake/ml/modeling/manifold/mds.py +6 -3
  138. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  139. snowflake/ml/modeling/manifold/tsne.py +6 -3
  140. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  141. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  142. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  143. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  144. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  145. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  146. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  147. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  148. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  149. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  150. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  151. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  152. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  153. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  154. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  155. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  156. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  157. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  158. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  159. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  160. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  161. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  162. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  163. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  164. snowflake/ml/modeling/pipeline/pipeline.py +10 -2
  165. snowflake/ml/modeling/preprocessing/polynomial_features.py +6 -3
  166. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  167. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  168. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  169. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  170. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  171. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  172. snowflake/ml/modeling/svm/svc.py +6 -3
  173. snowflake/ml/modeling/svm/svr.py +6 -3
  174. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  175. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  176. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  177. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  178. snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
  179. snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
  180. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
  181. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
  182. snowflake/ml/version.py +1 -1
  183. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/METADATA +29 -14
  184. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/RECORD +187 -178
  185. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/LICENSE.txt +0 -0
  186. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/WHEEL +0 -0
  187. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/top_level.txt +0 -0
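
The headline addition in 1.7.4 is the new snowflake/ml/jobs package (items 5-12 above); its hunks are not reproduced on this page. As rough orientation only, the module layout (decorators.py, manager.py, job.py, payload_utils.py) suggests a decorator-driven entry point along the following lines; every name in this sketch (remote, compute_pool, stage_name, wait) is an assumption inferred from the file names, not something this diff confirms:

    # Hypothetical sketch of the preview jobs API implied by the new
    # snowflake/ml/jobs/* files above; all names and parameters are assumptions.
    from snowflake.ml import jobs

    @jobs.remote(compute_pool="MY_POOL", stage_name="PAYLOAD_STAGE")
    def train(table_name: str) -> None:
        # The function payload would be packaged by payload_utils.py and run
        # remotely; spec_utils.py presumably builds the service spec for it.
        ...

    job = train("MY_TRAINING_DATA")  # a job handle (see jobs/job.py)
    job.wait()                       # hypothetical blocking call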
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.calibration".replace("sk
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class CalibratedClassifierCV(BaseTransformer):
     r"""Probability calibration with isotonic regression or logistic regression
     For more details on this class, see [sklearn.calibration.CalibratedClassifierCV]
@@ -465,7 +468,7 @@ class CalibratedClassifierCV(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1126,7 +1129,7 @@ class CalibratedClassifierCV(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1134,7 +1137,7 @@ class CalibratedClassifierCV(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
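
Every modeling hunk on this page follows the template shown in full above: import _truncate_data from snowflake.ml.model.model_signature, define a module-level INFER_SIGNATURE_MAX_ROWS = 100, and wrap each _infer_signature(...) input so that signature inference samples at most 100 rows instead of scanning the whole dataset. The helper itself is defined in snowflake/ml/model/model_signature.py (item 27, +17 -4) and is not shown here; a minimal sketch of the behavior these call sites rely on, assuming Snowpark inputs are capped with limit() and pandas inputs with head(), would be:

    # Minimal sketch, NOT the actual definition from model_signature.py;
    # the real helper may warn on truncation or support more input types.
    import pandas as pd
    import snowflake.snowpark as snowpark

    def _truncate_data(data, max_rows: int):
        """Cap the rows handed to signature inference."""
        if isinstance(data, snowpark.DataFrame):
            return data.limit(max_rows)  # lazy row cap, avoids a full scan
        if isinstance(data, pd.DataFrame):
            return data.head(max_rows)   # in-memory head
        return data                      # anything else passes through

The remaining thirteen diffs below apply the identical change to the cluster estimators; only the class names and hunk offsets differ.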
snowflake/ml/modeling/cluster/affinity_propagation.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class AffinityPropagation(BaseTransformer):
     r"""Perform Affinity Propagation Clustering of data
     For more details on this class, see [sklearn.cluster.AffinityPropagation]
@@ -449,7 +452,7 @@ class AffinityPropagation(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1106,7 +1109,7 @@ class AffinityPropagation(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1114,7 +1117,7 @@ class AffinityPropagation(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/agglomerative_clustering.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class AgglomerativeClustering(BaseTransformer):
     r"""Agglomerative Clustering
     For more details on this class, see [sklearn.cluster.AgglomerativeClustering]
@@ -478,7 +481,7 @@ class AgglomerativeClustering(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1135,7 +1138,7 @@ class AgglomerativeClustering(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1143,7 +1146,7 @@ class AgglomerativeClustering(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/birch.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class Birch(BaseTransformer):
     r"""Implements the BIRCH clustering algorithm
     For more details on this class, see [sklearn.cluster.Birch]
@@ -442,7 +445,7 @@ class Birch(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1101,7 +1104,7 @@ class Birch(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1109,7 +1112,7 @@ class Birch(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/bisecting_k_means.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class BisectingKMeans(BaseTransformer):
     r"""Bisecting K-Means clustering
     For more details on this class, see [sklearn.cluster.BisectingKMeans]
@@ -491,7 +494,7 @@ class BisectingKMeans(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1152,7 +1155,7 @@ class BisectingKMeans(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1160,7 +1163,7 @@ class BisectingKMeans(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/dbscan.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class DBSCAN(BaseTransformer):
     r"""Perform DBSCAN clustering from vector array or distance matrix
     For more details on this class, see [sklearn.cluster.DBSCAN]
@@ -458,7 +461,7 @@ class DBSCAN(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1115,7 +1118,7 @@ class DBSCAN(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1123,7 +1126,7 @@ class DBSCAN(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/feature_agglomeration.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class FeatureAgglomeration(BaseTransformer):
     r"""Agglomerate features
     For more details on this class, see [sklearn.cluster.FeatureAgglomeration]
@@ -477,7 +480,7 @@ class FeatureAgglomeration(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1136,7 +1139,7 @@ class FeatureAgglomeration(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1144,7 +1147,7 @@ class FeatureAgglomeration(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/k_means.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class KMeans(BaseTransformer):
     r"""K-Means clustering
     For more details on this class, see [sklearn.cluster.KMeans]
@@ -481,7 +484,7 @@ class KMeans(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1142,7 +1145,7 @@ class KMeans(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1150,7 +1153,7 @@ class KMeans(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/mean_shift.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class MeanShift(BaseTransformer):
     r"""Mean shift clustering using a flat kernel
     For more details on this class, see [sklearn.cluster.MeanShift]
@@ -460,7 +463,7 @@ class MeanShift(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1117,7 +1120,7 @@ class MeanShift(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1125,7 +1128,7 @@ class MeanShift(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/mini_batch_k_means.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class MiniBatchKMeans(BaseTransformer):
     r"""Mini-Batch K-Means clustering
     For more details on this class, see [sklearn.cluster.MiniBatchKMeans]
@@ -512,7 +515,7 @@ class MiniBatchKMeans(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1173,7 +1176,7 @@ class MiniBatchKMeans(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1181,7 +1184,7 @@ class MiniBatchKMeans(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/optics.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class OPTICS(BaseTransformer):
     r"""Estimate clustering structure from vector array
     For more details on this class, see [sklearn.cluster.OPTICS]
@@ -528,7 +531,7 @@ class OPTICS(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1185,7 +1188,7 @@ class OPTICS(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1193,7 +1196,7 @@ class OPTICS(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/spectral_biclustering.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SpectralBiclustering(BaseTransformer):
     r"""Spectral biclustering (Kluger, 2003)
     For more details on this class, see [sklearn.cluster.SpectralBiclustering]
@@ -466,7 +469,7 @@ class SpectralBiclustering(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1121,7 +1124,7 @@ class SpectralBiclustering(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1129,7 +1132,7 @@ class SpectralBiclustering(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/spectral_clustering.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SpectralClustering(BaseTransformer):
     r"""Apply clustering to a projection of the normalized Laplacian
     For more details on this class, see [sklearn.cluster.SpectralClustering]
@@ -525,7 +528,7 @@ class SpectralClustering(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1182,7 +1185,7 @@ class SpectralClustering(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1190,7 +1193,7 @@ class SpectralClustering(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/cluster/spectral_coclustering.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SpectralCoclustering(BaseTransformer):
     r"""Spectral Co-Clustering algorithm (Dhillon, 2001)
     For more details on this class, see [sklearn.cluster.SpectralCoclustering]
@@ -445,7 +448,7 @@ class SpectralCoclustering(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1100,7 +1103,7 @@ class SpectralCoclustering(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1108,7 +1111,7 @@ class SpectralCoclustering(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
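
The practical effect of the change repeated across the modeling wrappers (items 28-181): fitting against a large Snowpark DataFrame no longer materializes the full input just to infer column types. A minimal usage illustration, assuming an existing Snowpark session object and a hypothetical FEATURES table:

    from snowflake.ml.modeling.cluster import KMeans

    df = session.table("FEATURES")  # arbitrarily large Snowpark DataFrame
    km = KMeans(input_cols=["X1", "X2"], output_cols=["CLUSTER"], n_clusters=8)
    km.fit(df)  # type inference behind fit() now reads at most 100 rows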