snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. snowflake/cortex/_complete.py +19 -0
  2. snowflake/ml/_internal/env_utils.py +64 -21
  3. snowflake/ml/_internal/platform_capabilities.py +87 -0
  4. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  5. snowflake/ml/_internal/telemetry.py +21 -0
  6. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  7. snowflake/ml/dataset/dataset.py +0 -1
  8. snowflake/ml/feature_store/feature_store.py +18 -0
  9. snowflake/ml/feature_store/feature_view.py +46 -1
  10. snowflake/ml/fileset/fileset.py +6 -0
  11. snowflake/ml/jobs/__init__.py +21 -0
  12. snowflake/ml/jobs/_utils/constants.py +57 -0
  13. snowflake/ml/jobs/_utils/payload_utils.py +438 -0
  14. snowflake/ml/jobs/_utils/spec_utils.py +296 -0
  15. snowflake/ml/jobs/_utils/types.py +39 -0
  16. snowflake/ml/jobs/decorators.py +71 -0
  17. snowflake/ml/jobs/job.py +113 -0
  18. snowflake/ml/jobs/manager.py +298 -0
  19. snowflake/ml/model/_client/ops/model_ops.py +11 -2
  20. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  21. snowflake/ml/model/_client/sql/service.py +13 -6
  22. snowflake/ml/model/_packager/model_env/model_env.py +45 -28
  23. snowflake/ml/model/_packager/model_handlers/_utils.py +19 -6
  24. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  25. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +17 -0
  26. snowflake/ml/model/_packager/model_handlers/keras.py +230 -0
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -0
  28. snowflake/ml/model/_packager/model_handlers/sklearn.py +28 -3
  29. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +74 -21
  30. snowflake/ml/model/_packager/model_handlers/tensorflow.py +27 -49
  31. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  32. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -1
  33. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +3 -0
  34. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  35. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -1
  36. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  37. snowflake/ml/model/_signatures/base_handler.py +1 -2
  38. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  39. snowflake/ml/model/_signatures/core.py +2 -2
  40. snowflake/ml/model/_signatures/numpy_handler.py +11 -12
  41. snowflake/ml/model/_signatures/pandas_handler.py +11 -9
  42. snowflake/ml/model/_signatures/pytorch_handler.py +3 -6
  43. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  44. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  45. snowflake/ml/model/model_signature.py +25 -4
  46. snowflake/ml/model/type_hints.py +15 -0
  47. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  51. snowflake/ml/modeling/cluster/birch.py +6 -3
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  53. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  55. snowflake/ml/modeling/cluster/k_means.py +6 -3
  56. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  58. snowflake/ml/modeling/cluster/optics.py +6 -3
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  62. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  69. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  70. snowflake/ml/modeling/covariance/oas.py +6 -3
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  74. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  79. snowflake/ml/modeling/decomposition/pca.py +6 -3
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  108. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  110. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  111. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  112. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  113. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  114. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  115. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  116. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  117. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  118. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  119. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  120. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  121. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  122. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  123. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  124. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  125. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  126. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  127. snowflake/ml/modeling/linear_model/lars.py +6 -3
  128. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  129. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  130. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  131. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  132. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  133. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  134. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  135. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  136. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  137. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  138. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  139. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  140. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  141. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  142. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  143. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  144. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  145. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  146. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  147. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  148. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  149. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  150. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  151. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  152. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  153. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  154. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  155. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  156. snowflake/ml/modeling/manifold/isomap.py +6 -3
  157. snowflake/ml/modeling/manifold/mds.py +6 -3
  158. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  159. snowflake/ml/modeling/manifold/tsne.py +6 -3
  160. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  161. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  162. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  163. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  164. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  165. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  166. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  167. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  168. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  169. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  170. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  171. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  172. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  173. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  174. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  175. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  176. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  177. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  178. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  179. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  180. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  181. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  182. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  183. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  184. snowflake/ml/modeling/pipeline/pipeline.py +28 -3
  185. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -5
  186. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  187. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  188. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  189. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  190. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  191. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  192. snowflake/ml/modeling/svm/svc.py +6 -3
  193. snowflake/ml/modeling/svm/svr.py +6 -3
  194. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  195. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  196. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  197. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  198. snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
  199. snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
  200. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
  201. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
  202. snowflake/ml/registry/registry.py +34 -4
  203. snowflake/ml/version.py +1 -1
  204. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/METADATA +81 -33
  205. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/RECORD +208 -196
  206. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/WHEEL +1 -1
  207. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/LICENSE.txt +0 -0
  208. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/top_level.txt +0 -0
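
The hunks below all make the same change across the generated modeling estimators: each _infer_signature call now wraps its dataset in _truncate_data(..., INFER_SIGNATURE_MAX_ROWS), with the cap set to 100 rows in every module. Signature inference only needs column names and dtypes, so a bounded sample avoids scanning the whole table (for a Snowpark DataFrame, a full-table query) at fit time. The diff does not show _truncate_data itself; the import lines indicate it is defined in snowflake/ml/model/model_signature.py (file 45 above). The sketch below is only a plausible reading of what such a helper does, not the package's implementation; the name truncate_data and its body are illustrative assumptions.

    # Illustrative sketch only: the real _truncate_data lives in
    # snowflake/ml/model/model_signature.py and is not shown in this diff.
    from typing import Union

    import pandas as pd
    import snowflake.snowpark as snowpark

    def truncate_data(
        data: Union[pd.DataFrame, snowpark.DataFrame],
        max_rows: int = 100,
    ) -> Union[pd.DataFrame, snowpark.DataFrame]:
        """Cap the rows fed to signature inference; only dtypes matter here."""
        if isinstance(data, pd.DataFrame):
            return data.head(max_rows)
        # Snowpark DataFrame: limit() pushes the cap into the generated SQL,
        # so only max_rows rows ever leave the warehouse.
        return data.limit(max_rows)

Truncating before inference keeps fit-time signature generation constant in table size instead of linear.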

snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.multiclass".replace("skl
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class OneVsRestClassifier(BaseTransformer):
     r"""One-vs-the-rest (OvR) multiclass strategy
     For more details on this class, see [sklearn.multiclass.OneVsRestClassifier]
@@ -424,7 +427,7 @@ class OneVsRestClassifier(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1087,7 +1090,7 @@ class OneVsRestClassifier(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1095,7 +1098,7 @@ class OneVsRestClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/multiclass/output_code_classifier.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.multiclass".replace("skl
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class OutputCodeClassifier(BaseTransformer):
     r"""(Error-Correcting) Output-Code multiclass strategy
     For more details on this class, see [sklearn.multiclass.OutputCodeClassifier]
@@ -427,7 +430,7 @@ class OutputCodeClassifier(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1084,7 +1087,7 @@ class OutputCodeClassifier(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1092,7 +1095,7 @@ class OutputCodeClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/naive_bayes/bernoulli_nb.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.naive_bayes".replace("sk
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class BernoulliNB(BaseTransformer):
     r"""Naive Bayes classifier for multivariate Bernoulli models
     For more details on this class, see [sklearn.naive_bayes.BernoulliNB]
@@ -427,7 +430,7 @@ class BernoulliNB(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1088,7 +1091,7 @@ class BernoulliNB(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1096,7 +1099,7 @@ class BernoulliNB(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/naive_bayes/categorical_nb.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.naive_bayes".replace("sk
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class CategoricalNB(BaseTransformer):
     r"""Naive Bayes classifier for categorical features
     For more details on this class, see [sklearn.naive_bayes.CategoricalNB]
@@ -433,7 +436,7 @@ class CategoricalNB(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1094,7 +1097,7 @@ class CategoricalNB(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1102,7 +1105,7 @@ class CategoricalNB(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/naive_bayes/complement_nb.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.naive_bayes".replace("sk
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class ComplementNB(BaseTransformer):
     r"""The Complement Naive Bayes classifier described in Rennie et al
     For more details on this class, see [sklearn.naive_bayes.ComplementNB]
@@ -427,7 +430,7 @@ class ComplementNB(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1088,7 +1091,7 @@ class ComplementNB(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1096,7 +1099,7 @@ class ComplementNB(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/naive_bayes/gaussian_nb.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.naive_bayes".replace("sk
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class GaussianNB(BaseTransformer):
     r"""Gaussian Naive Bayes (GaussianNB)
     For more details on this class, see [sklearn.naive_bayes.GaussianNB]
@@ -408,7 +411,7 @@ class GaussianNB(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1069,7 +1072,7 @@ class GaussianNB(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1077,7 +1080,7 @@ class GaussianNB(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/naive_bayes/multinomial_nb.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.naive_bayes".replace("sk
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class MultinomialNB(BaseTransformer):
     r"""Naive Bayes classifier for multinomial models
     For more details on this class, see [sklearn.naive_bayes.MultinomialNB]
@@ -421,7 +424,7 @@ class MultinomialNB(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1082,7 +1085,7 @@ class MultinomialNB(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1090,7 +1093,7 @@ class MultinomialNB(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/neighbors/k_neighbors_classifier.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class KNeighborsClassifier(BaseTransformer):
     r"""Classifier implementing the k-nearest neighbors vote
     For more details on this class, see [sklearn.neighbors.KNeighborsClassifier]
@@ -484,7 +487,7 @@ class KNeighborsClassifier(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1147,7 +1150,7 @@ class KNeighborsClassifier(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1155,7 +1158,7 @@ class KNeighborsClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/neighbors/k_neighbors_regressor.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class KNeighborsRegressor(BaseTransformer):
     r"""Regression based on k-nearest neighbors
     For more details on this class, see [sklearn.neighbors.KNeighborsRegressor]
@@ -483,7 +486,7 @@ class KNeighborsRegressor(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1142,7 +1145,7 @@ class KNeighborsRegressor(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1150,7 +1153,7 @@ class KNeighborsRegressor(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/neighbors/kernel_density.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class KernelDensity(BaseTransformer):
     r"""Kernel Density Estimation
     For more details on this class, see [sklearn.neighbors.KernelDensity]
@@ -457,7 +460,7 @@ class KernelDensity(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1116,7 +1119,7 @@ class KernelDensity(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1124,7 +1127,7 @@ class KernelDensity(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/neighbors/local_outlier_factor.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class LocalOutlierFactor(BaseTransformer):
     r"""Unsupervised Outlier Detection using the Local Outlier Factor (LOF)
     For more details on this class, see [sklearn.neighbors.LocalOutlierFactor]
@@ -487,7 +490,7 @@ class LocalOutlierFactor(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1150,7 +1153,7 @@ class LocalOutlierFactor(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1158,7 +1161,7 @@ class LocalOutlierFactor(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/neighbors/nearest_centroid.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class NearestCentroid(BaseTransformer):
     r"""Nearest centroid classifier
     For more details on this class, see [sklearn.neighbors.NearestCentroid]
@@ -411,7 +414,7 @@ class NearestCentroid(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1068,7 +1071,7 @@ class NearestCentroid(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1076,7 +1079,7 @@ class NearestCentroid(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/neighbors/nearest_neighbors.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class NearestNeighbors(BaseTransformer):
     r"""Unsupervised learner for implementing neighbor searches
     For more details on this class, see [sklearn.neighbors.NearestNeighbors]
@@ -468,7 +471,7 @@ class NearestNeighbors(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1125,7 +1128,7 @@ class NearestNeighbors(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1133,7 +1136,7 @@ class NearestNeighbors(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.neighbors".replace("skle
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class NeighborhoodComponentsAnalysis(BaseTransformer):
     r"""Neighborhood Components Analysis
     For more details on this class, see [sklearn.neighbors.NeighborhoodComponentsAnalysis]
@@ -489,7 +492,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1146,7 +1149,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1154,7 +1157,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
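
As a self-contained check of the premise behind the 100-row cap (my illustration, not a test from the package): for columns with a uniform dtype, a small head sample carries the same schema information as the full frame.

    # Premise check: a 100-row sample infers the same dtypes as the full
    # frame when column types are uniform.
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "x": np.random.rand(1_000_000),                 # float64
        "y": np.random.randint(0, 10, size=1_000_000),  # int64
    })
    assert df.head(100).dtypes.equals(df.dtypes)

The trade-off is that a column whose first 100 rows are unrepresentative (for example, an object column that only later contains strings) could infer a narrower type than the full data would justify; the cap bets that such columns are rare relative to the cost of scanning every row.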