snowflake-ml-python 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. snowflake/ml/_internal/env_utils.py +16 -13
  2. snowflake/ml/_internal/exceptions/modeling_error_messages.py +5 -1
  3. snowflake/ml/_internal/telemetry.py +19 -0
  4. snowflake/ml/feature_store/__init__.py +9 -0
  5. snowflake/ml/feature_store/entity.py +73 -0
  6. snowflake/ml/feature_store/feature_store.py +1657 -0
  7. snowflake/ml/feature_store/feature_view.py +459 -0
  8. snowflake/ml/model/_client/ops/model_ops.py +16 -38
  9. snowflake/ml/model/_client/sql/model.py +1 -7
  10. snowflake/ml/model/_client/sql/model_version.py +20 -15
  11. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +9 -1
  12. snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +12 -2
  14. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +7 -3
  15. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +1 -6
  16. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +0 -2
  17. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +10 -1
  18. snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -2
  19. snowflake/ml/model/_packager/model_meta/_core_requirements.py +11 -1
  20. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +3 -0
  21. snowflake/ml/model/_packager/model_meta/model_meta.py +17 -3
  22. snowflake/ml/model/model_signature.py +72 -16
  23. snowflake/ml/model/type_hints.py +12 -0
  24. snowflake/ml/modeling/_internal/estimator_protocols.py +1 -41
  25. snowflake/ml/modeling/_internal/model_trainer_builder.py +13 -9
  26. snowflake/ml/modeling/_internal/{distributed_hpo_trainer.py → snowpark_implementations/distributed_hpo_trainer.py} +66 -96
  27. snowflake/ml/modeling/_internal/{snowpark_handlers.py → snowpark_implementations/snowpark_handlers.py} +9 -6
  28. snowflake/ml/modeling/_internal/{xgboost_external_memory_trainer.py → snowpark_implementations/xgboost_external_memory_trainer.py} +3 -1
  29. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +19 -3
  30. snowflake/ml/modeling/cluster/affinity_propagation.py +19 -3
  31. snowflake/ml/modeling/cluster/agglomerative_clustering.py +19 -3
  32. snowflake/ml/modeling/cluster/birch.py +19 -3
  33. snowflake/ml/modeling/cluster/bisecting_k_means.py +19 -3
  34. snowflake/ml/modeling/cluster/dbscan.py +19 -3
  35. snowflake/ml/modeling/cluster/feature_agglomeration.py +19 -3
  36. snowflake/ml/modeling/cluster/k_means.py +19 -3
  37. snowflake/ml/modeling/cluster/mean_shift.py +19 -3
  38. snowflake/ml/modeling/cluster/mini_batch_k_means.py +19 -3
  39. snowflake/ml/modeling/cluster/optics.py +19 -3
  40. snowflake/ml/modeling/cluster/spectral_biclustering.py +19 -3
  41. snowflake/ml/modeling/cluster/spectral_clustering.py +19 -3
  42. snowflake/ml/modeling/cluster/spectral_coclustering.py +19 -3
  43. snowflake/ml/modeling/compose/column_transformer.py +19 -3
  44. snowflake/ml/modeling/compose/transformed_target_regressor.py +19 -3
  45. snowflake/ml/modeling/covariance/elliptic_envelope.py +19 -3
  46. snowflake/ml/modeling/covariance/empirical_covariance.py +19 -3
  47. snowflake/ml/modeling/covariance/graphical_lasso.py +19 -3
  48. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +19 -3
  49. snowflake/ml/modeling/covariance/ledoit_wolf.py +19 -3
  50. snowflake/ml/modeling/covariance/min_cov_det.py +19 -3
  51. snowflake/ml/modeling/covariance/oas.py +19 -3
  52. snowflake/ml/modeling/covariance/shrunk_covariance.py +19 -3
  53. snowflake/ml/modeling/decomposition/dictionary_learning.py +19 -3
  54. snowflake/ml/modeling/decomposition/factor_analysis.py +19 -3
  55. snowflake/ml/modeling/decomposition/fast_ica.py +19 -3
  56. snowflake/ml/modeling/decomposition/incremental_pca.py +19 -3
  57. snowflake/ml/modeling/decomposition/kernel_pca.py +19 -3
  58. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +19 -3
  59. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +19 -3
  60. snowflake/ml/modeling/decomposition/pca.py +19 -3
  61. snowflake/ml/modeling/decomposition/sparse_pca.py +19 -3
  62. snowflake/ml/modeling/decomposition/truncated_svd.py +19 -3
  63. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +19 -3
  64. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +19 -3
  65. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +19 -3
  66. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +19 -3
  67. snowflake/ml/modeling/ensemble/bagging_classifier.py +19 -3
  68. snowflake/ml/modeling/ensemble/bagging_regressor.py +19 -3
  69. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +19 -3
  70. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +19 -3
  71. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +19 -3
  72. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +19 -3
  73. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +19 -3
  74. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +19 -3
  75. snowflake/ml/modeling/ensemble/isolation_forest.py +19 -3
  76. snowflake/ml/modeling/ensemble/random_forest_classifier.py +19 -3
  77. snowflake/ml/modeling/ensemble/random_forest_regressor.py +19 -3
  78. snowflake/ml/modeling/ensemble/stacking_regressor.py +19 -3
  79. snowflake/ml/modeling/ensemble/voting_classifier.py +19 -3
  80. snowflake/ml/modeling/ensemble/voting_regressor.py +19 -3
  81. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +19 -3
  82. snowflake/ml/modeling/feature_selection/select_fdr.py +19 -3
  83. snowflake/ml/modeling/feature_selection/select_fpr.py +19 -3
  84. snowflake/ml/modeling/feature_selection/select_fwe.py +19 -3
  85. snowflake/ml/modeling/feature_selection/select_k_best.py +19 -3
  86. snowflake/ml/modeling/feature_selection/select_percentile.py +19 -3
  87. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +19 -3
  88. snowflake/ml/modeling/feature_selection/variance_threshold.py +19 -3
  89. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +19 -3
  90. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +19 -3
  91. snowflake/ml/modeling/impute/iterative_imputer.py +19 -3
  92. snowflake/ml/modeling/impute/knn_imputer.py +19 -3
  93. snowflake/ml/modeling/impute/missing_indicator.py +19 -3
  94. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +19 -3
  95. snowflake/ml/modeling/kernel_approximation/nystroem.py +19 -3
  96. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +19 -3
  97. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +19 -3
  98. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +19 -3
  99. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +19 -3
  100. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +19 -3
  101. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +19 -3
  102. snowflake/ml/modeling/linear_model/ard_regression.py +19 -3
  103. snowflake/ml/modeling/linear_model/bayesian_ridge.py +19 -3
  104. snowflake/ml/modeling/linear_model/elastic_net.py +19 -3
  105. snowflake/ml/modeling/linear_model/elastic_net_cv.py +19 -3
  106. snowflake/ml/modeling/linear_model/gamma_regressor.py +19 -3
  107. snowflake/ml/modeling/linear_model/huber_regressor.py +19 -3
  108. snowflake/ml/modeling/linear_model/lars.py +19 -3
  109. snowflake/ml/modeling/linear_model/lars_cv.py +19 -3
  110. snowflake/ml/modeling/linear_model/lasso.py +19 -3
  111. snowflake/ml/modeling/linear_model/lasso_cv.py +19 -3
  112. snowflake/ml/modeling/linear_model/lasso_lars.py +19 -3
  113. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +19 -3
  114. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +19 -3
  115. snowflake/ml/modeling/linear_model/linear_regression.py +19 -3
  116. snowflake/ml/modeling/linear_model/logistic_regression.py +19 -3
  117. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +19 -3
  118. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +19 -3
  119. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +19 -3
  120. snowflake/ml/modeling/linear_model/multi_task_lasso.py +19 -3
  121. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +19 -3
  122. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +19 -3
  123. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +19 -3
  124. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +19 -3
  125. snowflake/ml/modeling/linear_model/perceptron.py +19 -3
  126. snowflake/ml/modeling/linear_model/poisson_regressor.py +19 -3
  127. snowflake/ml/modeling/linear_model/ransac_regressor.py +19 -3
  128. snowflake/ml/modeling/linear_model/ridge.py +19 -3
  129. snowflake/ml/modeling/linear_model/ridge_classifier.py +19 -3
  130. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +19 -3
  131. snowflake/ml/modeling/linear_model/ridge_cv.py +19 -3
  132. snowflake/ml/modeling/linear_model/sgd_classifier.py +19 -3
  133. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +19 -3
  134. snowflake/ml/modeling/linear_model/sgd_regressor.py +19 -3
  135. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +19 -3
  136. snowflake/ml/modeling/linear_model/tweedie_regressor.py +19 -3
  137. snowflake/ml/modeling/manifold/isomap.py +19 -3
  138. snowflake/ml/modeling/manifold/mds.py +19 -3
  139. snowflake/ml/modeling/manifold/spectral_embedding.py +19 -3
  140. snowflake/ml/modeling/manifold/tsne.py +19 -3
  141. snowflake/ml/modeling/metrics/classification.py +5 -6
  142. snowflake/ml/modeling/metrics/metrics_utils.py +5 -3
  143. snowflake/ml/modeling/metrics/ranking.py +7 -3
  144. snowflake/ml/modeling/metrics/regression.py +6 -3
  145. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +19 -3
  146. snowflake/ml/modeling/mixture/gaussian_mixture.py +19 -3
  147. snowflake/ml/modeling/model_selection/grid_search_cv.py +3 -13
  148. snowflake/ml/modeling/model_selection/randomized_search_cv.py +3 -13
  149. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +19 -3
  150. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +19 -3
  151. snowflake/ml/modeling/multiclass/output_code_classifier.py +19 -3
  152. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +19 -3
  153. snowflake/ml/modeling/naive_bayes/categorical_nb.py +19 -3
  154. snowflake/ml/modeling/naive_bayes/complement_nb.py +19 -3
  155. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +19 -3
  156. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +19 -3
  157. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +19 -3
  158. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +19 -3
  159. snowflake/ml/modeling/neighbors/kernel_density.py +19 -3
  160. snowflake/ml/modeling/neighbors/local_outlier_factor.py +19 -3
  161. snowflake/ml/modeling/neighbors/nearest_centroid.py +19 -3
  162. snowflake/ml/modeling/neighbors/nearest_neighbors.py +19 -3
  163. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +19 -3
  164. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +19 -3
  165. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +19 -3
  166. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +19 -3
  167. snowflake/ml/modeling/neural_network/mlp_classifier.py +19 -3
  168. snowflake/ml/modeling/neural_network/mlp_regressor.py +19 -3
  169. snowflake/ml/modeling/preprocessing/polynomial_features.py +19 -3
  170. snowflake/ml/modeling/semi_supervised/label_propagation.py +19 -3
  171. snowflake/ml/modeling/semi_supervised/label_spreading.py +19 -3
  172. snowflake/ml/modeling/svm/linear_svc.py +19 -3
  173. snowflake/ml/modeling/svm/linear_svr.py +19 -3
  174. snowflake/ml/modeling/svm/nu_svc.py +19 -3
  175. snowflake/ml/modeling/svm/nu_svr.py +19 -3
  176. snowflake/ml/modeling/svm/svc.py +19 -3
  177. snowflake/ml/modeling/svm/svr.py +19 -3
  178. snowflake/ml/modeling/tree/decision_tree_classifier.py +19 -3
  179. snowflake/ml/modeling/tree/decision_tree_regressor.py +19 -3
  180. snowflake/ml/modeling/tree/extra_tree_classifier.py +19 -3
  181. snowflake/ml/modeling/tree/extra_tree_regressor.py +19 -3
  182. snowflake/ml/modeling/xgboost/xgb_classifier.py +19 -3
  183. snowflake/ml/modeling/xgboost/xgb_regressor.py +19 -3
  184. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +19 -3
  185. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +19 -3
  186. snowflake/ml/registry/registry.py +2 -0
  187. snowflake/ml/version.py +1 -1
  188. snowflake_ml_python-1.2.2.dist-info/LICENSE.txt +202 -0
  189. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.2.dist-info}/METADATA +276 -50
  190. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.2.dist-info}/RECORD +204 -197
  191. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.2.dist-info}/WHEEL +2 -1
  192. snowflake_ml_python-1.2.2.dist-info/top_level.txt +1 -0
  193. /snowflake/ml/modeling/_internal/{pandas_trainer.py → local_implementations/pandas_trainer.py} +0 -0
  194. /snowflake/ml/modeling/_internal/{snowpark_trainer.py → snowpark_implementations/snowpark_trainer.py} +0 -0
snowflake/ml/modeling/linear_model/ridge_cv.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -231,7 +231,7 @@ class RidgeCV(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=RidgeCV.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=RidgeCV.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -591,6 +591,22 @@ class RidgeCV(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
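Note: the expected_dtype fallback added in the last hunk above is repeated verbatim in every autogenerated modeling class in this release (the remaining modeling files below carry the identical change). As a rough, self-contained sketch of that logic under simplified assumptions: infer_expected_dtype and the plain string type names are illustrative stand-ins, not the library's API, and the real code goes through _infer_signature and convert_sp_to_sf_type.

# Hypothetical, simplified sketch of the fallback dtype inference shown above.
def infer_expected_dtype(estimator, input_types, output_cols):
    # Clustering transformers: when the number of output columns differs from
    # n_clusters, each output row holds a list of values, so expect ARRAY.
    if hasattr(estimator, "n_clusters") and estimator.n_clusters != len(output_cols):
        return "ARRAY"
    # Decomposition transformers: same reasoning with n_components.
    if hasattr(estimator, "n_components") and estimator.n_components != len(output_cols):
        return "ARRAY"
    # Otherwise, reuse the single input type only if all inputs share one type
    # and the input/output column counts match; else leave the type undecided.
    if len(set(input_types)) == 1 and len(input_types) == len(output_cols):
        return input_types[0]
    return ""  # caller keeps falling back to a variant-style column

class FakeKMeans:
    n_clusters = 8

print(infer_expected_dtype(FakeKMeans(), ["FloatType"], ["OUTPUT_0"]))            # ARRAY
print(infer_expected_dtype(object(), ["FloatType", "FloatType"], ["O1", "O2"]))   # FloatType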
snowflake/ml/modeling/linear_model/sgd_classifier.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -350,7 +350,7 @@ class SGDClassifier(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SGDClassifier.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SGDClassifier.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -710,6 +710,22 @@ class SGDClassifier(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -248,7 +248,7 @@ class SGDOneClassSVM(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SGDOneClassSVM.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SGDOneClassSVM.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -608,6 +608,22 @@ class SGDOneClassSVM(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/sgd_regressor.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -316,7 +316,7 @@ class SGDRegressor(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SGDRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SGDRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -676,6 +676,22 @@ class SGDRegressor(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/theil_sen_regressor.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -218,7 +218,7 @@ class TheilSenRegressor(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=TheilSenRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=TheilSenRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -578,6 +578,22 @@ class TheilSenRegressor(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/tweedie_regressor.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -244,7 +244,7 @@ class TweedieRegressor(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=TweedieRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=TweedieRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -604,6 +604,22 @@ class TweedieRegressor(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/isomap.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -240,7 +240,7 @@ class Isomap(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=Isomap.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=Isomap.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -600,6 +600,22 @@ class Isomap(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/mds.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -223,7 +223,7 @@ class MDS(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=MDS.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=MDS.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -581,6 +581,22 @@ class MDS(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/spectral_embedding.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -225,7 +225,7 @@ class SpectralEmbedding(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SpectralEmbedding.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SpectralEmbedding.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -583,6 +583,22 @@ class SpectralEmbedding(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/tsne.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -284,7 +284,7 @@ class TSNE(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=TSNE.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=TSNE.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -642,6 +642,22 @@ class TSNE(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/metrics/classification.py
@@ -228,16 +228,15 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
     Returns:
         Name of the UDTF.
     """
+    batch_size = metrics_utils.BATCH_SIZE
 
     class ConfusionMatrixComputer:
-        BATCH_SIZE = 1000
-
         def __init__(self) -> None:
             self._initialized = False
             self._confusion_matrix = np.zeros((1, 1))
-            # 2d array containing a batch of input rows. A batch contains self.BATCH_SIZE rows.
+            # 2d array containing a batch of input rows. A batch contains metrics_utils.BATCH_SIZE rows.
             # [sample_weight, y_true, y_pred]
-            self._batched_rows = np.zeros((self.BATCH_SIZE, 1))
+            self._batched_rows = np.zeros((batch_size, 1))
             # Number of columns in the dataset.
             self._n_cols = -1
             # Running count of number of rows added to self._batched_rows.
@@ -255,7 +254,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
             # 1. Initialize variables.
             if not self._initialized:
                 self._n_cols = len(input_row)
-                self._batched_rows = np.zeros((self.BATCH_SIZE, self._n_cols))
+                self._batched_rows = np.zeros((batch_size, self._n_cols))
                 self._n_label = n_label
                 self._confusion_matrix = np.zeros((self._n_label, self._n_label))
                 self._initialized = True
@@ -264,7 +263,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
             self._cur_count += 1
 
             # 2. Compute incremental confusion matrix for the batch.
-            if self._cur_count >= self.BATCH_SIZE:
+            if self._cur_count >= batch_size:
                 self.update_confusion_matrix()
                 self._cur_count = 0
 
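The classification.py change above swaps the class-level BATCH_SIZE for the shared metrics_utils.BATCH_SIZE constant but keeps the same accumulation pattern: buffer (sample_weight, y_true, y_pred) rows and fold them into a running confusion matrix whenever the buffer fills. A minimal, self-contained sketch of that pattern follows; the class and method names are illustrative, not the actual UDTF handler.

import numpy as np

BATCH_SIZE = 1000  # mirrors metrics_utils.BATCH_SIZE

class BatchedConfusionMatrix:
    """Illustrative sketch of the batched accumulation used by the UDTF above."""

    def __init__(self, n_labels, batch_size=BATCH_SIZE):
        self._cm = np.zeros((n_labels, n_labels))
        self._rows = np.zeros((batch_size, 3))  # [sample_weight, y_true, y_pred]
        self._count = 0
        self._batch_size = batch_size

    def add(self, weight, y_true, y_pred):
        self._rows[self._count] = (weight, y_true, y_pred)
        self._count += 1
        if self._count >= self._batch_size:  # flush once the buffer is full
            self._flush()

    def _flush(self):
        batch = self._rows[: self._count]
        # Accumulate each row's weight into the (y_true, y_pred) cell in one pass.
        np.add.at(self._cm, (batch[:, 1].astype(int), batch[:, 2].astype(int)), batch[:, 0])
        self._count = 0

    def result(self):
        self._flush()  # fold in any partially filled batch
        return self._cm

cm = BatchedConfusionMatrix(n_labels=2, batch_size=4)
for w, t, p in [(1, 0, 0), (1, 0, 1), (2, 1, 1), (1, 1, 1), (1, 1, 0)]:
    cm.add(w, t, p)
print(cm.result())  # [[1. 1.], [1. 3.]]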
snowflake/ml/modeling/metrics/metrics_utils.py
@@ -15,6 +15,7 @@ from snowflake.snowpark import Session, functions as F, types as T
 
 LABEL = "LABEL"
 INDEX = "INDEX"
+BATCH_SIZE = 1000
 
 
 def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, Any]) -> str:
@@ -82,7 +83,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
         """This class is registered as a UDTF and computes the sum and dot product
         of columns for each partition of rows. The computations across all the partitions happens
         in parallel using the nodes in the warehouse. In order to avoid keeping the entire partition
-        in memory, we batch the rows (size is 1000) and maintain a running sum and dot prod in self._sum_by_count,
+        in memory, we batch the rows and maintain a running sum and dot prod in self._sum_by_count,
         self._sum_by_countd and self._dot_prod respectively. We return these at the end of the partition.
         """
 
@@ -95,7 +96,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
            # delta degree of freedom
            self._ddof = 0
            # Setting the batch size to 1000 based on experimentation. Can be fine tuned later.
-           self._batch_size = 1000
+           self._batch_size = BATCH_SIZE
            # 2d array containing a batch of input rows. A batch contains self._batch_size rows.
            self._batched_rows = np.zeros((self._batch_size, 1))
            # 1d array of length = # of cols. Contains sum(col/count) for each column.
@@ -224,7 +225,7 @@ def check_label_columns(
        TypeError: `y_true_col_names` and `y_pred_col_names` are of different types.
        ValueError: Multilabel `y_true_col_names` and `y_pred_col_names` are of different lengths.
    """
-    if type(y_true_col_names) != type(y_pred_col_names):
+    if type(y_true_col_names) is not type(y_pred_col_names):
        raise TypeError(
            "Label columns should be of the same type."
            f"Got y_true_col_names={type(y_true_col_names)} vs y_pred_col_names={type(y_pred_col_names)}."
@@ -300,6 +301,7 @@ def validate_average_pos_label(average: Optional[str] = None, pos_label: Union[s
            "average != 'binary' (got %r). You may use "
            "labels=[pos_label] to specify a single positive class." % (pos_label, average),
            UserWarning,
+           stacklevel=2,
        )
 
 
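The final metrics_utils.py hunk adds stacklevel=2 so that the warning is attributed to the code that called the validation helper rather than to the helper itself. A quick standalone illustration, using a simplified stand-in function rather than the library's actual signature:

import warnings

def validate_pos_label(pos_label):  # simplified stand-in for the real helper
    # stacklevel=2 makes the warning point at the line that called this
    # function instead of at the warnings.warn() call below.
    warnings.warn(
        f"Note that pos_label (set to {pos_label!r}) is ignored when average != 'binary'.",
        UserWarning,
        stacklevel=2,
    )

validate_pos_label(0)  # the reported file/line is this call site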
snowflake/ml/modeling/metrics/ranking.py
@@ -122,7 +122,8 @@ def precision_recall_curve(
         result_module = cloudpickle.loads(pickled_result_module)
         return result_module.serialize(session, (precision, recall, thresholds)) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(precision_recall_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session, **kwargs))
     res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
     return res
 
@@ -271,7 +272,8 @@ def roc_auc_score(
         result_module = cloudpickle.loads(pickled_result_module)
         return result_module.serialize(session, auc) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, roc_auc_score_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_auc_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_auc_score_anon_sproc(session, **kwargs))
     auc: Union[float, npt.NDArray[np.float_]] = result_object
     return auc
 
@@ -372,7 +374,9 @@ def roc_curve(
         result_module = cloudpickle.loads(pickled_result_module)
         return result_module.serialize(session, (fpr, tpr, thresholds)) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, roc_curve_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs))
+
     res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
 
     return res
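All three ranking.py hunks apply the same fix: the statement parameters used for telemetry attribution are now forwarded to the anonymous stored procedure call instead of being dropped. The sketch below only guesses at what telemetry.get_sproc_statement_params_kwargs does, based purely on these call sites; the helper shown is hypothetical and the real implementation may differ.

import inspect
from typing import Any, Callable, Dict

def sproc_statement_params_kwargs(sproc: Callable[..., Any], statement_params: Dict[str, Any]) -> Dict[str, Any]:
    # Forward statement_params only if the generated sproc wrapper accepts
    # that keyword, so wrappers without it keep working unchanged.
    if "statement_params" in inspect.signature(sproc).parameters:
        return {"statement_params": statement_params}
    return {}

# Call-site pattern from the hunks above (names as in ranking.py):
#   kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params)
#   result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs))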