snowflake-ml-python 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. snowflake/ml/_internal/telemetry.py +19 -0
  2. snowflake/ml/model/_client/ops/model_ops.py +16 -38
  3. snowflake/ml/model/_client/sql/model.py +1 -7
  4. snowflake/ml/model/_client/sql/model_version.py +20 -15
  5. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +1 -6
  6. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +0 -2
  7. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +10 -1
  8. snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -2
  9. snowflake/ml/model/_packager/model_meta/_core_requirements.py +11 -1
  10. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +3 -0
  11. snowflake/ml/model/_packager/model_meta/model_meta.py +17 -3
  12. snowflake/ml/model/type_hints.py +3 -0
  13. snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +63 -95
  14. snowflake/ml/modeling/_internal/snowpark_handlers.py +9 -6
  15. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +16 -0
  16. snowflake/ml/modeling/cluster/affinity_propagation.py +16 -0
  17. snowflake/ml/modeling/cluster/agglomerative_clustering.py +16 -0
  18. snowflake/ml/modeling/cluster/birch.py +16 -0
  19. snowflake/ml/modeling/cluster/bisecting_k_means.py +16 -0
  20. snowflake/ml/modeling/cluster/dbscan.py +16 -0
  21. snowflake/ml/modeling/cluster/feature_agglomeration.py +16 -0
  22. snowflake/ml/modeling/cluster/k_means.py +16 -0
  23. snowflake/ml/modeling/cluster/mean_shift.py +16 -0
  24. snowflake/ml/modeling/cluster/mini_batch_k_means.py +16 -0
  25. snowflake/ml/modeling/cluster/optics.py +16 -0
  26. snowflake/ml/modeling/cluster/spectral_biclustering.py +16 -0
  27. snowflake/ml/modeling/cluster/spectral_clustering.py +16 -0
  28. snowflake/ml/modeling/cluster/spectral_coclustering.py +16 -0
  29. snowflake/ml/modeling/compose/column_transformer.py +16 -0
  30. snowflake/ml/modeling/compose/transformed_target_regressor.py +16 -0
  31. snowflake/ml/modeling/covariance/elliptic_envelope.py +16 -0
  32. snowflake/ml/modeling/covariance/empirical_covariance.py +16 -0
  33. snowflake/ml/modeling/covariance/graphical_lasso.py +16 -0
  34. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +16 -0
  35. snowflake/ml/modeling/covariance/ledoit_wolf.py +16 -0
  36. snowflake/ml/modeling/covariance/min_cov_det.py +16 -0
  37. snowflake/ml/modeling/covariance/oas.py +16 -0
  38. snowflake/ml/modeling/covariance/shrunk_covariance.py +16 -0
  39. snowflake/ml/modeling/decomposition/dictionary_learning.py +16 -0
  40. snowflake/ml/modeling/decomposition/factor_analysis.py +16 -0
  41. snowflake/ml/modeling/decomposition/fast_ica.py +16 -0
  42. snowflake/ml/modeling/decomposition/incremental_pca.py +16 -0
  43. snowflake/ml/modeling/decomposition/kernel_pca.py +16 -0
  44. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +16 -0
  45. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +16 -0
  46. snowflake/ml/modeling/decomposition/pca.py +16 -0
  47. snowflake/ml/modeling/decomposition/sparse_pca.py +16 -0
  48. snowflake/ml/modeling/decomposition/truncated_svd.py +16 -0
  49. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +16 -0
  50. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +16 -0
  51. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +16 -0
  52. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +16 -0
  53. snowflake/ml/modeling/ensemble/bagging_classifier.py +16 -0
  54. snowflake/ml/modeling/ensemble/bagging_regressor.py +16 -0
  55. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +16 -0
  56. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +16 -0
  57. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +16 -0
  58. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +16 -0
  59. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +16 -0
  60. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +16 -0
  61. snowflake/ml/modeling/ensemble/isolation_forest.py +16 -0
  62. snowflake/ml/modeling/ensemble/random_forest_classifier.py +16 -0
  63. snowflake/ml/modeling/ensemble/random_forest_regressor.py +16 -0
  64. snowflake/ml/modeling/ensemble/stacking_regressor.py +16 -0
  65. snowflake/ml/modeling/ensemble/voting_classifier.py +16 -0
  66. snowflake/ml/modeling/ensemble/voting_regressor.py +16 -0
  67. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +16 -0
  68. snowflake/ml/modeling/feature_selection/select_fdr.py +16 -0
  69. snowflake/ml/modeling/feature_selection/select_fpr.py +16 -0
  70. snowflake/ml/modeling/feature_selection/select_fwe.py +16 -0
  71. snowflake/ml/modeling/feature_selection/select_k_best.py +16 -0
  72. snowflake/ml/modeling/feature_selection/select_percentile.py +16 -0
  73. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +16 -0
  74. snowflake/ml/modeling/feature_selection/variance_threshold.py +16 -0
  75. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +16 -0
  76. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +16 -0
  77. snowflake/ml/modeling/impute/iterative_imputer.py +16 -0
  78. snowflake/ml/modeling/impute/knn_imputer.py +16 -0
  79. snowflake/ml/modeling/impute/missing_indicator.py +16 -0
  80. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +16 -0
  81. snowflake/ml/modeling/kernel_approximation/nystroem.py +16 -0
  82. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +16 -0
  83. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +16 -0
  84. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +16 -0
  85. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +16 -0
  86. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +16 -0
  87. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +16 -0
  88. snowflake/ml/modeling/linear_model/ard_regression.py +16 -0
  89. snowflake/ml/modeling/linear_model/bayesian_ridge.py +16 -0
  90. snowflake/ml/modeling/linear_model/elastic_net.py +16 -0
  91. snowflake/ml/modeling/linear_model/elastic_net_cv.py +16 -0
  92. snowflake/ml/modeling/linear_model/gamma_regressor.py +16 -0
  93. snowflake/ml/modeling/linear_model/huber_regressor.py +16 -0
  94. snowflake/ml/modeling/linear_model/lars.py +16 -0
  95. snowflake/ml/modeling/linear_model/lars_cv.py +16 -0
  96. snowflake/ml/modeling/linear_model/lasso.py +16 -0
  97. snowflake/ml/modeling/linear_model/lasso_cv.py +16 -0
  98. snowflake/ml/modeling/linear_model/lasso_lars.py +16 -0
  99. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +16 -0
  100. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +16 -0
  101. snowflake/ml/modeling/linear_model/linear_regression.py +16 -0
  102. snowflake/ml/modeling/linear_model/logistic_regression.py +16 -0
  103. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +16 -0
  104. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +16 -0
  105. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +16 -0
  106. snowflake/ml/modeling/linear_model/multi_task_lasso.py +16 -0
  107. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +16 -0
  108. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +16 -0
  109. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +16 -0
  110. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +16 -0
  111. snowflake/ml/modeling/linear_model/perceptron.py +16 -0
  112. snowflake/ml/modeling/linear_model/poisson_regressor.py +16 -0
  113. snowflake/ml/modeling/linear_model/ransac_regressor.py +16 -0
  114. snowflake/ml/modeling/linear_model/ridge.py +16 -0
  115. snowflake/ml/modeling/linear_model/ridge_classifier.py +16 -0
  116. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +16 -0
  117. snowflake/ml/modeling/linear_model/ridge_cv.py +16 -0
  118. snowflake/ml/modeling/linear_model/sgd_classifier.py +16 -0
  119. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +16 -0
  120. snowflake/ml/modeling/linear_model/sgd_regressor.py +16 -0
  121. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +16 -0
  122. snowflake/ml/modeling/linear_model/tweedie_regressor.py +16 -0
  123. snowflake/ml/modeling/manifold/isomap.py +16 -0
  124. snowflake/ml/modeling/manifold/mds.py +16 -0
  125. snowflake/ml/modeling/manifold/spectral_embedding.py +16 -0
  126. snowflake/ml/modeling/manifold/tsne.py +16 -0
  127. snowflake/ml/modeling/metrics/classification.py +5 -6
  128. snowflake/ml/modeling/metrics/metrics_utils.py +5 -3
  129. snowflake/ml/modeling/metrics/ranking.py +7 -3
  130. snowflake/ml/modeling/metrics/regression.py +6 -3
  131. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +16 -0
  132. snowflake/ml/modeling/mixture/gaussian_mixture.py +16 -0
  133. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +16 -0
  134. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +16 -0
  135. snowflake/ml/modeling/multiclass/output_code_classifier.py +16 -0
  136. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +16 -0
  137. snowflake/ml/modeling/naive_bayes/categorical_nb.py +16 -0
  138. snowflake/ml/modeling/naive_bayes/complement_nb.py +16 -0
  139. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +16 -0
  140. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +16 -0
  141. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +16 -0
  142. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +16 -0
  143. snowflake/ml/modeling/neighbors/kernel_density.py +16 -0
  144. snowflake/ml/modeling/neighbors/local_outlier_factor.py +16 -0
  145. snowflake/ml/modeling/neighbors/nearest_centroid.py +16 -0
  146. snowflake/ml/modeling/neighbors/nearest_neighbors.py +16 -0
  147. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +16 -0
  148. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +16 -0
  149. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +16 -0
  150. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +16 -0
  151. snowflake/ml/modeling/neural_network/mlp_classifier.py +16 -0
  152. snowflake/ml/modeling/neural_network/mlp_regressor.py +16 -0
  153. snowflake/ml/modeling/preprocessing/polynomial_features.py +16 -0
  154. snowflake/ml/modeling/semi_supervised/label_propagation.py +16 -0
  155. snowflake/ml/modeling/semi_supervised/label_spreading.py +16 -0
  156. snowflake/ml/modeling/svm/linear_svc.py +16 -0
  157. snowflake/ml/modeling/svm/linear_svr.py +16 -0
  158. snowflake/ml/modeling/svm/nu_svc.py +16 -0
  159. snowflake/ml/modeling/svm/nu_svr.py +16 -0
  160. snowflake/ml/modeling/svm/svc.py +16 -0
  161. snowflake/ml/modeling/svm/svr.py +16 -0
  162. snowflake/ml/modeling/tree/decision_tree_classifier.py +16 -0
  163. snowflake/ml/modeling/tree/decision_tree_regressor.py +16 -0
  164. snowflake/ml/modeling/tree/extra_tree_classifier.py +16 -0
  165. snowflake/ml/modeling/tree/extra_tree_regressor.py +16 -0
  166. snowflake/ml/modeling/xgboost/xgb_classifier.py +16 -0
  167. snowflake/ml/modeling/xgboost/xgb_regressor.py +16 -0
  168. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +16 -0
  169. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +16 -0
  170. snowflake/ml/registry/registry.py +2 -0
  171. snowflake/ml/version.py +1 -1
  172. snowflake_ml_python-1.2.1.dist-info/LICENSE.txt +202 -0
  173. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.1.dist-info}/METADATA +261 -50
  174. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.1.dist-info}/RECORD +189 -186
  175. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.1.dist-info}/WHEEL +2 -1
  176. snowflake_ml_python-1.2.1.dist-info/top_level.txt +1 -0
@@ -623,6 +623,22 @@ class PassiveAggressiveClassifier(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
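Every estimator wrapper listed above with a +16 -0 delta gains this same fallback: when the factory could not pin down an output type, the wrapper now infers one at transform time. As a rough, self-contained sketch of that decision (the function name, the plain-string input_types argument, and the toy classes are illustrative only, not part of the package API):

from typing import Sequence


def infer_transform_output_dtype(sklearn_obj: object, output_cols: Sequence[str], input_types: Sequence[str]) -> str:
    """Hypothetical, simplified mirror of the fallback added to each wrapper's transform()."""
    # Clustering estimators: when the declared output columns cannot map 1:1 onto
    # n_clusters, the result has to be packed into an ARRAY column.
    n_clusters = getattr(sklearn_obj, "n_clusters", None)
    if n_clusters is not None and n_clusters != len(output_cols):
        return "ARRAY"
    # Decomposition estimators: same reasoning, keyed on n_components.
    n_components = getattr(sklearn_obj, "n_components", None)
    if n_components is not None and n_components != len(output_cols):
        return "ARRAY"
    # Otherwise reuse the single shared input type, but only when all inputs agree on one
    # type and the input/output column counts line up; anything else stays unresolved ("")
    # and the caller falls back to a variant-style column.
    if input_types and all(t == input_types[0] for t in input_types) and len(input_types) == len(output_cols):
        return input_types[0]
    return ""


class _FakePCA:
    n_components = 2  # toy stand-in for a decomposition estimator


print(infer_transform_output_dtype(_FakePCA(), ["OUT1"], ["FLOAT", "FLOAT", "FLOAT"]))  # ARRAY
print(infer_transform_output_dtype(object(), ["OUT1", "OUT2"], ["FLOAT", "FLOAT"]))     # FLOAT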
@@ -609,6 +609,22 @@ class PassiveAggressiveRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -622,6 +622,22 @@ class Perceptron(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -571,6 +571,22 @@ class PoissonRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -627,6 +627,22 @@ class RANSACRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -619,6 +619,22 @@ class Ridge(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -619,6 +619,22 @@ class RidgeClassifier(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -570,6 +570,22 @@ class RidgeClassifierCV(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -591,6 +591,22 @@ class RidgeCV(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -710,6 +710,22 @@ class SGDClassifier(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -608,6 +608,22 @@ class SGDOneClassSVM(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -676,6 +676,22 @@ class SGDRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -578,6 +578,22 @@ class TheilSenRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -604,6 +604,22 @@ class TweedieRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -600,6 +600,22 @@ class Isomap(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -581,6 +581,22 @@ class MDS(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -583,6 +583,22 @@ class SpectralEmbedding(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -642,6 +642,22 @@ class TSNE(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -228,16 +228,15 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
     Returns:
         Name of the UDTF.
     """
+    batch_size = metrics_utils.BATCH_SIZE
 
     class ConfusionMatrixComputer:
-        BATCH_SIZE = 1000
-
         def __init__(self) -> None:
            self._initialized = False
            self._confusion_matrix = np.zeros((1, 1))
-            # 2d array containing a batch of input rows. A batch contains self.BATCH_SIZE rows.
+            # 2d array containing a batch of input rows. A batch contains metrics_utils.BATCH_SIZE rows.
            # [sample_weight, y_true, y_pred]
-            self._batched_rows = np.zeros((self.BATCH_SIZE, 1))
+            self._batched_rows = np.zeros((batch_size, 1))
            # Number of columns in the dataset.
            self._n_cols = -1
            # Running count of number of rows added to self._batched_rows.
@@ -255,7 +254,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
            # 1. Initialize variables.
            if not self._initialized:
                self._n_cols = len(input_row)
-                self._batched_rows = np.zeros((self.BATCH_SIZE, self._n_cols))
+                self._batched_rows = np.zeros((batch_size, self._n_cols))
                self._n_label = n_label
                self._confusion_matrix = np.zeros((self._n_label, self._n_label))
                self._initialized = True
@@ -264,7 +263,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
            self._cur_count += 1
 
            # 2. Compute incremental confusion matrix for the batch.
-            if self._cur_count >= self.BATCH_SIZE:
+            if self._cur_count >= batch_size:
                self.update_confusion_matrix()
                self._cur_count = 0
 
@@ -15,6 +15,7 @@ from snowflake.snowpark import Session, functions as F, types as T
 
 LABEL = "LABEL"
 INDEX = "INDEX"
+BATCH_SIZE = 1000
 
 
 def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, Any]) -> str:
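The classification.py hunks above drop the per-class BATCH_SIZE constant in favor of this shared metrics_utils.BATCH_SIZE; the dot-product/sum UDTF below is switched to the same constant. As a minimal, self-contained sketch of the batching pattern those UDTF handlers follow (the class and its methods are made up for the example, not part of the package):

import numpy as np

BATCH_SIZE = 1000  # shared module-level constant, as introduced in metrics_utils.py


class BatchedConfusionMatrix:
    """Buffers (sample_weight, y_true, y_pred) rows and folds them into the matrix in batches."""

    def __init__(self, n_labels: int) -> None:
        self.matrix = np.zeros((n_labels, n_labels))
        self._buffer = np.zeros((BATCH_SIZE, 3))
        self._count = 0

    def add(self, sample_weight: float, y_true: int, y_pred: int) -> None:
        self._buffer[self._count] = (sample_weight, y_true, y_pred)
        self._count += 1
        if self._count >= BATCH_SIZE:
            self._flush()

    def _flush(self) -> None:
        rows = self._buffer[: self._count]
        # Accumulate weights at (true, pred) index pairs; np.add.at handles repeated indices.
        np.add.at(self.matrix, (rows[:, 1].astype(int), rows[:, 2].astype(int)), rows[:, 0])
        self._count = 0

    def result(self) -> np.ndarray:
        self._flush()
        return self.matrix


cm = BatchedConfusionMatrix(n_labels=2)
for w, yt, yp in [(1.0, 0, 0), (1.0, 0, 1), (2.0, 1, 1)]:
    cm.add(w, yt, yp)
print(cm.result())  # [[1. 1.] [0. 2.]]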
@@ -82,7 +83,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
    """This class is registered as a UDTF and computes the sum and dot product
    of columns for each partition of rows. The computations across all the partitions happens
    in parallel using the nodes in the warehouse. In order to avoid keeping the entire partition
-    in memory, we batch the rows (size is 1000) and maintain a running sum and dot prod in self._sum_by_count,
+    in memory, we batch the rows and maintain a running sum and dot prod in self._sum_by_count,
    self._sum_by_countd and self._dot_prod respectively. We return these at the end of the partition.
    """
 
@@ -95,7 +96,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
            # delta degree of freedom
            self._ddof = 0
            # Setting the batch size to 1000 based on experimentation. Can be fine tuned later.
-            self._batch_size = 1000
+            self._batch_size = BATCH_SIZE
            # 2d array containing a batch of input rows. A batch contains self._batch_size rows.
            self._batched_rows = np.zeros((self._batch_size, 1))
            # 1d array of length = # of cols. Contains sum(col/count) for each column.
@@ -224,7 +225,7 @@ def check_label_columns(
        TypeError: `y_true_col_names` and `y_pred_col_names` are of different types.
        ValueError: Multilabel `y_true_col_names` and `y_pred_col_names` are of different lengths.
    """
-    if type(y_true_col_names) != type(y_pred_col_names):
+    if type(y_true_col_names) is not type(y_pred_col_names):
        raise TypeError(
            "Label columns should be of the same type."
            f"Got y_true_col_names={type(y_true_col_names)} vs y_pred_col_names={type(y_pred_col_names)}."
@@ -300,6 +301,7 @@ def validate_average_pos_label(average: Optional[str] = None, pos_label: Union[s
            "average != 'binary' (got %r). You may use "
            "labels=[pos_label] to specify a single positive class." % (pos_label, average),
            UserWarning,
+            stacklevel=2,
        )
 
 
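The only change here is the added stacklevel=2, which makes the warning report the caller of validate_average_pos_label rather than the warn() line itself. A small standalone illustration with a hypothetical function name and a shortened message:

import warnings


def _warn_pos_label_ignored(pos_label: str) -> None:
    # stacklevel=2 attributes the warning to *this function's caller*, mirroring the
    # argument added to warnings.warn() in validate_average_pos_label.
    warnings.warn(
        "pos_label (set to %r) is ignored when average != 'binary'." % pos_label,
        UserWarning,
        stacklevel=2,
    )


_warn_pos_label_ignored("1")  # the reported file/line now points at this call site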
@@ -122,7 +122,8 @@ def precision_recall_curve(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, (precision, recall, thresholds)) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(precision_recall_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session, **kwargs))
    res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
    return res
 
@@ -271,7 +272,8 @@ def roc_auc_score(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, auc) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, roc_auc_score_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_auc_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_auc_score_anon_sproc(session, **kwargs))
    auc: Union[float, npt.NDArray[np.float_]] = result_object
    return auc
 
@@ -372,7 +374,9 @@ def roc_curve(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, (fpr, tpr, thresholds)) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, roc_curve_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs))
+
    res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
 
    return res
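precision_recall_curve, roc_auc_score, and roc_curve above (and the regression metrics below) all gain the same two lines: the statement parameters are now threaded into the anonymous stored-procedure call rather than dropped. The helper itself lives in snowflake/ml/_internal/telemetry.py, which this release also touches; the sketch below is only a guess at the general shape of that pattern, with a made-up helper name rather than the package's actual implementation:

import inspect
from typing import Any, Callable, Dict, Optional


def sproc_statement_params_kwargs(sproc: Callable[..., Any], statement_params: Dict[str, Any]) -> Dict[str, Any]:
    """Forward statement_params only when the stored-procedure handle accepts that keyword."""
    params = inspect.signature(sproc).parameters
    accepts_kw = "statement_params" in params or any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )
    return {"statement_params": statement_params} if accepts_kw else {}


def fake_anon_sproc(session: object, statement_params: Optional[Dict[str, Any]] = None) -> str:
    # Stand-in for something like roc_curve_anon_sproc; a real sproc runs server side.
    return f"ran with statement_params={statement_params}"


kwargs = sproc_statement_params_kwargs(fake_anon_sproc, {"QUERY_TAG": "metrics"})
print(fake_anon_sproc(None, **kwargs))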
@@ -108,7 +108,8 @@ def d2_absolute_error_score(
        result_module = cloudpickle.loads(pickled_snowflake_result)
        return result_module.serialize(session, score) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, d2_absolute_error_score_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(d2_absolute_error_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, d2_absolute_error_score_anon_sproc(session, **kwargs))
    score: Union[float, npt.NDArray[np.float_]] = result_object
    return score
 
@@ -205,7 +206,8 @@ def d2_pinball_score(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, score) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, d2_pinball_score_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(d2_pinball_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, d2_pinball_score_anon_sproc(session, **kwargs))
 
    score: Union[float, npt.NDArray[np.float_]] = result_object
    return score
@@ -319,7 +321,8 @@ def explained_variance_score(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, score) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, explained_variance_score_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(explained_variance_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, explained_variance_score_anon_sproc(session, **kwargs))
    score: Union[float, npt.NDArray[np.float_]] = result_object
    return score
 
@@ -647,6 +647,22 @@ class BayesianGaussianMixture(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",