snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
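
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py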
@@ -15,16 +15,16 @@ from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
     get_temp_file_path,
 )
-from snowflake.snowpark import DataFrame, Session
+from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.snowpark import DataFrame, Session, functions as F, types as T
 from snowflake.snowpark._internal.utils import (
     TempObjectType,
     random_name_for_temp_object,
 )
-from snowflake.snowpark.functions import pandas_udf, sproc
-from snowflake.snowpark.types import PandasSeries
 
 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
+cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
 
 _PROJECT = "ModelDevelopment"
 
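
The cp.register_pickle_by_value(...) calls above tell cloudpickle to serialize those modules by value, so the helper source (here, handle_inference_result) travels inside the pickled UDF and sproc payloads instead of being re-imported on the server, where the package's internal modules are not installed. A minimal sketch of that mechanism, assuming cloudpickle >= 2.0; my_helpers is a hypothetical local module, not part of snowflake-ml-python:

    import inspect
    import pickle

    import cloudpickle as cp

    import my_helpers  # hypothetical local module used inside a UDF body

    # Embed the module's code in the pickle instead of emitting "import my_helpers".
    cp.register_pickle_by_value(inspect.getmodule(my_helpers.transform_row))

    payload = cp.dumps(my_helpers.transform_row)  # helper source rides along
    restored = pickle.loads(payload)              # loadable where my_helpers is absent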
@@ -67,9 +67,9 @@ class SnowparkTransformHandlers:
         inference_method: str,
         input_cols: List[str],
         expected_output_cols: List[str],
-        pass_through_cols: List[str],
         session: Session,
         dependencies: List[str],
+        drop_input_cols: Optional[bool] = False,
         expected_output_cols_type: Optional[str] = "",
         *args: Any,
         **kwargs: Any,
@@ -81,8 +81,8 @@ class SnowparkTransformHandlers:
             dependencies: List of dependencies for the transformer.
             inference_method: the name of the method used by `estimator` to run inference.
             input_cols: List of feature columns for inference.
-            pass_through_cols: columns in the dataset not used in inference.
             expected_output_cols: column names (in order) of the output dataset.
+            drop_input_cols: Boolean to determine whether to drop the input columns from the output dataset.
             expected_output_cols_type: Expected type of the output columns.
             args: additional positional arguments.
             kwargs: additional keyword args.
@@ -95,141 +95,94 @@ class SnowparkTransformHandlers:
         estimator = self.estimator
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
-        snowpark_cols = dataset.select(input_cols).columns
+
         dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
+        # Align the input_cols with snowpark dataframe's column name
+        # This step also makes sure that the every col in input_cols exists in the current dataset
+        snowpark_cols = dataset.select(input_cols).columns
+
+        # Infer the datatype from input dataset's schema for batch inference
+        # This is required before registering the UDTF
+        fields = dataset.select(input_cols).schema.fields
+        input_datatypes = []
+        for field in fields:
+            input_datatypes.append(field.datatype)
 
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=self._subproject,
             function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-            api_calls=[pandas_udf],
+            api_calls=[F.pandas_udf],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
 
-        @pandas_udf(  # type: ignore[arg-type, misc]
+        @F.pandas_udf(  # type: ignore[arg-type, misc]
             is_permanent=False,
             name=batch_inference_udf_name,
             packages=dependencies,  # type: ignore[arg-type]
             replace=True,
             session=session,
             statement_params=statement_params,
+            input_types=[T.PandasDataFrameType(input_datatypes)],
         )
-        def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]:  # type: ignore[type-arg]
-            import numbers
-
-            import numpy as np
+        def vec_batch_infer(input_df: pd.DataFrame) -> T.PandasSeries[dict]:  # type: ignore[type-arg]
+            import numpy as np  # noqa: F401
             import pandas as pd
 
-            input_df = pd.json_normalize(ds)
-
-            # pd.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
-            # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
-            # or quoted input column names saved in internal state if trained using pandas_df.
-            # Model expects exact same columns names in the input df for predict call.
-
-            input_df = input_df[input_cols]  # Select input columns with quoted column names.
-            if hasattr(estimator, "feature_names_in_"):
-                missing_features = []
-                for i, f in enumerate(getattr(estimator, "feature_names_in_", {})):
-                    if i >= len(input_cols) or (input_cols[i] != f and snowpark_cols[i] != f):
-                        missing_features.append(f)
-
-                if len(missing_features) > 0:
-                    raise ValueError(
-                        "The feature names should match with those that were passed during fit.\n"
-                        f"Features seen during fit call but not present in the input: {missing_features}\n"
-                        f"Features in the input dataframe : {input_cols}\n"
-                    )
-                input_df.columns = getattr(estimator, "feature_names_in_", {})
-            else:
-                # Just rename the column names to unquoted identifiers.
-                input_df.columns = snowpark_cols  # Replace the quoted columns identifier with unquoted column ids.
+            input_df.columns = snowpark_cols
+
+            if hasattr(estimator, "n_jobs"):
+                # Vectorized UDF cannot handle joblib multiprocessing right now, deactivate the n_jobs
+                estimator.n_jobs = 1
             inference_res = getattr(estimator, inference_method)(input_df, *args, **kwargs)
-            if isinstance(inference_res, list) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray):
-                # In case of multioutput estimators, predict_proba, decision_function etc., functions return a list of
-                # ndarrays. We need to concatenate them.
-                transformed_numpy_array = np.concatenate(inference_res, axis=1)
-            elif (
-                isinstance(inference_res, tuple) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray)
-            ):
-                # In case of kneighbors, functions return a tuple of ndarrays.
-                transformed_numpy_array = np.stack(inference_res, axis=1)
-            elif isinstance(inference_res, numbers.Number):
-                # In case of BernoulliRBM, functions return a float
-                transformed_numpy_array = np.array([inference_res])
-            else:
-                transformed_numpy_array = inference_res
 
-            if (len(transformed_numpy_array.shape) == 3) and inference_method != "kneighbors":
-                # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
-                # when voting = "soft" and flatten_transform = False. We can't handle unflatten transforms,
-                # so we ignore flatten_transform flag and flatten the results.
-                transformed_numpy_array = np.hstack(transformed_numpy_array)  # type: ignore[call-overload]
+            transformed_numpy_array, output_cols = handle_inference_result(
+                inference_res=inference_res,
+                output_cols=expected_output_cols,
+                inference_method=inference_method,
+                within_udf=True,
+            )
 
             if len(transformed_numpy_array.shape) > 1:
-                if transformed_numpy_array.shape[1] != len(expected_output_cols):
-                    # HeterogeneousEnsemble's transform method produce results with variying shapes
-                    # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes).
-                    # It is hard to predict the response shape without using fragile introspection logic.
-                    # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with
-                    # each element being a list.
-                    if len(expected_output_cols) != 1:
-                        raise TypeError(
-                            "expected_output_cols must be same length as transformed array or " "should be of length 1"
-                        )
+                if transformed_numpy_array.shape[1] != len(output_cols):
                     series = pd.Series(transformed_numpy_array.tolist())
-                    transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols)
+                    transformed_pandas_df = pd.DataFrame(series, columns=output_cols)
                 else:
-                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=expected_output_cols)
+                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=output_cols)
             else:
-                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols)
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=output_cols)
 
             return transformed_pandas_df.to_dict("records")  # type: ignore[no-any-return]
 
-        batch_inference_table_name = f"SNOWML_BATCH_INFERENCE_INPUT_TABLE_{_get_rand_id()}"
-
-        # Run Transform
-        query_from_df = str(dataset.queries["queries"][0])
-
-        outer_select_list = pass_through_cols[:]
-        inner_select_list = pass_through_cols[:]
-
-        outer_select_list.extend(
-            [
-                "{object_name}:{column_name}{udf_datatype} as {column_name}".format(
-                    object_name=batch_inference_udf_name,
-                    column_name=identifier.get_inferred_name(c),
-                    udf_datatype=(f"::{expected_output_cols_type}" if expected_output_cols_type else ""),
-                )
-                for c in expected_output_cols
-            ]
-        )
-
-        inner_select_list.extend(
-            [
-                "{udf_name}(object_construct_keep_null({input_cols_dict})) AS {udf_name}".format(
-                    udf_name=batch_inference_udf_name,
-                    input_cols_dict=", ".join([f"'{c}', {c}" for c in input_cols]),
-                )
-            ]
-        )
-
-        sql = """WITH {input_table_name} AS ({query})
-                     SELECT
-                         {outer_select_stmt}
-                     FROM (
-                         SELECT
-                             {inner_select_stmt}
-                         FROM {input_table_name}
-                     )
-              """.format(
-            input_table_name=batch_inference_table_name,
-            query=query_from_df,
-            outer_select_stmt=", ".join(outer_select_list),
-            inner_select_stmt=", ".join(inner_select_list),
-        )
-
-        return session.sql(sql)
+        # Run Transform and get intermediate result
+        INTERMEDIATE_OBJ_NAME = "tmp_result"
+        # Use snowpark_cols can make sure the name ordering of the input dataframe
+        # and only select those columns to put into vectorized udf
+        output_obj = F.call_udf(batch_inference_udf_name, [F.col(col_name) for col_name in snowpark_cols])
+        df_res: DataFrame = dataset.with_column(INTERMEDIATE_OBJ_NAME, output_obj)
+
+        # Prepare the output
+        output_cols = []
+        output_col_names = []
+        # When there is no expected_output_cols_type, default set it as StringType
+        # snowpark cannot handle empty string, so this step give "string" value
+        if expected_output_cols_type == "":
+            expected_output_cols_type = "string"
+        assert expected_output_cols_type is not None
+        for output_feature in expected_output_cols:
+            output_cols.append(F.col(INTERMEDIATE_OBJ_NAME)[output_feature].astype(expected_output_cols_type))
+            output_col_names.append(identifier.get_inferred_name(output_feature))
+
+        # Extract output from INTERMEDIATE_OBJ_NAME and drop that column
+        df_res = df_res.with_columns(
+            output_col_names,
+            output_cols,
+        ).drop(INTERMEDIATE_OBJ_NAME)
+
+        if drop_input_cols:
+            df_res = df_res.drop(*input_cols)
+
+        return df_res
 
     def score(
         self,
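
The hunk above replaces the hand-assembled OBJECT_CONSTRUCT SQL with a typed vectorized UDF: column types are read from the dataframe schema, declared via T.PandasDataFrameType, and the UDF is invoked with F.call_udf, after which typed fields are cast out of the intermediate dict column. A minimal sketch of that Snowpark pattern, assuming an active session; the UDF name, demo data, and columns are illustrative, not taken from the package:

    import pandas as pd
    from snowflake.snowpark import Session, functions as F, types as T

    def demo_vectorized_inference(session: Session) -> None:
        df = session.create_dataframe([[1.0, 2.0], [3.0, 4.0]], schema=["A", "B"])
        # Infer input types from the schema, as the new handler does.
        input_datatypes = [field.datatype for field in df.schema.fields]

        @F.pandas_udf(
            name="DEMO_VEC_INFER",
            is_permanent=False,
            replace=True,
            session=session,
            input_types=[T.PandasDataFrameType(input_datatypes)],
        )
        def vec_infer(batch: pd.DataFrame) -> T.PandasSeries[dict]:
            # Vectorized UDFs label input columns positionally (0, 1, ...).
            return pd.Series([{"OUT": a + b} for a, b in zip(batch[0], batch[1])])

        res = df.with_column("tmp_result", F.call_udf("DEMO_VEC_INFER", F.col("A"), F.col("B")))
        # Pull a typed field out of the dict column, then drop the intermediate.
        res = res.with_column("OUT", F.col("tmp_result")["OUT"].astype("float")).drop("tmp_result")
        res.show()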
@@ -287,7 +240,7 @@ class SnowparkTransformHandlers:
             function_name=telemetry.get_statement_params_full_func_name(
                 inspect.currentframe(), self.__class__.__name__
             ),
-            api_calls=[sproc],
+            api_calls=[F.sproc],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
         # Put locally serialized score on stage.
@@ -299,7 +252,7 @@
             statement_params=statement_params,
         )
 
-        @sproc(  # type: ignore[misc]
+        @F.sproc(  # type: ignore[misc]
             is_permanent=False,
             name=score_sproc_name,
             packages=dependencies,  # type: ignore[arg-type]
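
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py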
@@ -279,7 +279,7 @@ class SnowparkModelTrainer:
     def _build_fit_predict_wrapper_sproc(
         self,
         model_spec: ModelSpecifications,
-    ) -> Callable[[Session, List[str], str, str, List[str], Dict[str, str], List[str], List[str], str], str]:
+    ) -> Callable[[Session, List[str], str, str, List[str], Dict[str, str], bool, List[str], str], str]:
         """
         Constructs and returns a python stored procedure function to be used for training model.
 
@@ -299,7 +299,7 @@
             stage_result_file_name: str,
             input_cols: List[str],
             statement_params: Dict[str, str],
-            pass_through_columns: List[str],
+            drop_input_cols: bool,
             expected_output_cols_list: List[str],
             fit_predict_result_name: str,
         ) -> str:
@@ -345,12 +345,12 @@
             )
 
             # store the predict output
-            if len(pass_through_columns) != 0:
-                df = df.copy()
+            if drop_input_cols:
                 fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
-                fit_predict_result_pd = pd.concat([df, fit_predict_result_pd], axis=1)
             else:
+                df = df.copy()
                 fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
+                fit_predict_result_pd = pd.concat([df, fit_predict_result_pd], axis=1)
 
             # write into a temp table in sproc and load the table from outside
             session.write_pandas(
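
In isolation the new branch reads: with drop_input_cols the wrapper returns only the prediction columns, otherwise it appends them to a copy of the scored frame. A pure-pandas sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    fit_predict_result = [[0], [1]]  # e.g. cluster labels from fit_predict
    expected_output_cols_list = ["OUTPUT_0"]

    for drop_input_cols in (True, False):
        if drop_input_cols:
            out = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
        else:
            out = pd.concat(
                [df.copy(), pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)],
                axis=1,
            )
        print(drop_input_cols, list(out.columns))  # True: ['OUTPUT_0']; False: ['A', 'B', 'OUTPUT_0']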
@@ -463,18 +463,18 @@
 
     def train_fit_predict(
         self,
-        pass_through_columns: List[str],
         expected_output_cols_list: List[str],
+        drop_input_cols: Optional[bool] = False,
     ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
         """Trains the model by pushing down the compute into Snowflake using stored procedures.
         This API is different from fit itself because it would also provide the predict
         output.
 
         Args:
-            pass_through_columns (List[str]): The column names that would
-                display in the returned dataset.
             expected_output_cols_list (List[str]): The output columns
                 name as a list. Defaults to None.
+            drop_input_cols (Optional[bool]): Boolean to determine drop
+                the input columns from the output dataset or not
 
         Returns:
             Tuple[Union[DataFrame, pd.DataFrame], object]: [predicted dataset, estimator]
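
For callers the change is mechanical: the derived pass-through column list disappears in favor of the flag the transformer already holds. A hedged migration sketch; the trainer object and column names are illustrative:

    from typing import List, Optional

    def run_train_fit_predict(trainer, output_cols: List[str], drop_input_cols: Optional[bool] = False):
        # 1.3.0: trainer.train_fit_predict(pass_through_columns=[...], expected_output_cols_list=output_cols)
        # 1.4.0: a single boolean replaces the computed pass-through column list.
        return trainer.train_fit_predict(
            expected_output_cols_list=output_cols,
            drop_input_cols=drop_input_cols,
        )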
@@ -508,7 +508,7 @@
             stage_result_file_name,
             self.input_cols,
             statement_params,
-            pass_through_columns,
+            drop_input_cols,
             expected_output_cols_list,
             fit_predict_result_name,
         )
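
snowflake/ml/modeling/_internal/transformer_protocols.py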
@@ -107,9 +107,9 @@ class RemoteModelTransformHandlers(Protocol):
         inference_method: str,
         input_cols: List[str],
         expected_output_cols: List[str],
-        pass_through_cols: List[str],
         session: snowpark.Session,
         dependencies: List[str],
+        drop_input_cols: Optional[bool] = False,
         expected_output_cols_type: Optional[str] = "",
         *args: Any,
         **kwargs: Any,
@@ -121,9 +121,9 @@ class RemoteModelTransformHandlers(Protocol):
             dependencies: List of dependencies for the transformer.
             inference_method: the name of the method used by `estimator` to run inference.
             input_cols: List of feature columns for inference.
-            pass_through_cols: columns in the dataset not used in inference.
             expected_output_cols: column names (in order) of the output dataset.
             expected_output_cols_type: Expected type of the output columns.
+            drop_input_cols: Boolean to determine drop the input columns from the output dataset or not
             args: additional positional arguments.
             kwargs: additional keyword args.
 
@@ -175,7 +175,6 @@ class BatchInferenceKwargsTypedDict(TypedDict, total=False):
 
     snowpark_input_cols: Optional[List[str]]
    drop_input_cols: Optional[bool]
-    pass_through_cols: List[str]
     session: snowpark.Session
     dependencies: List[str]
     expected_output_cols_type: str
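
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py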
@@ -328,18 +328,24 @@ class CalibratedClassifierCV(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
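
As the expanded docstring states, the validator now doubles as the dependency resolver: callers cache its return value on self._deps and prepend it to the package list shipped with the UDF or sproc (the score hunk below shows the generated call site). A schematic sketch of that pattern; the class body and returned pins are illustrative stand-ins, not the generated code:

    from typing import Any, List

    class EstimatorSketch:
        _is_fitted = True
        _deps: List[str] = []

        def _batch_inference_validate_snowpark(self, dataset: Any, inference_method: str) -> List[str]:
            if not self._is_fitted:
                # the real code raises SnowflakeMLException
                raise RuntimeError(f"estimator must be fitted before calling {inference_method}")
            # ...session checks and Anaconda-channel package resolution happen here...
            return ["scikit-learn==1.3.2", "numpy==1.24.3"]  # illustrative resolved pins

        def score(self, dataset: Any) -> List[str]:
            self._deps = self._batch_inference_validate_snowpark(dataset, "score")
            return ["snowflake-snowpark-python"] + self._deps

    print(EstimatorSketch().score(dataset=None))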
@@ -413,7 +419,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -473,16 +479,16 @@ class CalibratedClassifierCV(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -500,7 +506,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -551,7 +557,7 @@ class CalibratedClassifierCV(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -569,44 +575,6 @@ class CalibratedClassifierCV(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-            Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -648,7 +616,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -715,7 +683,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -776,7 +744,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -841,7 +809,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -897,13 +865,17 @@ class CalibratedClassifierCV(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -977,9 +949,9 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):