snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (284) hide show
  1. snowflake/cortex/__init__.py +4 -0
  2. snowflake/cortex/_classify_text.py +2 -2
  3. snowflake/cortex/_embed_text_1024.py +37 -0
  4. snowflake/cortex/_embed_text_768.py +37 -0
  5. snowflake/cortex/_extract_answer.py +2 -2
  6. snowflake/cortex/_sentiment.py +2 -2
  7. snowflake/cortex/_summarize.py +2 -2
  8. snowflake/cortex/_translate.py +2 -2
  9. snowflake/cortex/_util.py +4 -4
  10. snowflake/ml/_internal/env_utils.py +5 -5
  11. snowflake/ml/_internal/exceptions/error_codes.py +2 -0
  12. snowflake/ml/_internal/telemetry.py +142 -20
  13. snowflake/ml/_internal/utils/db_utils.py +50 -0
  14. snowflake/ml/_internal/utils/identifier.py +48 -11
  15. snowflake/ml/_internal/utils/service_logger.py +63 -0
  16. snowflake/ml/_internal/utils/snowflake_env.py +23 -13
  17. snowflake/ml/_internal/utils/sql_identifier.py +26 -2
  18. snowflake/ml/_internal/utils/table_manager.py +19 -1
  19. snowflake/ml/data/_internal/arrow_ingestor.py +1 -11
  20. snowflake/ml/data/data_connector.py +33 -7
  21. snowflake/ml/data/ingestor_utils.py +20 -10
  22. snowflake/ml/data/torch_utils.py +68 -0
  23. snowflake/ml/dataset/dataset.py +1 -3
  24. snowflake/ml/feature_store/access_manager.py +3 -3
  25. snowflake/ml/feature_store/feature_store.py +60 -19
  26. snowflake/ml/feature_store/feature_view.py +84 -30
  27. snowflake/ml/fileset/embedded_stage_fs.py +1 -1
  28. snowflake/ml/fileset/fileset.py +1 -1
  29. snowflake/ml/fileset/sfcfs.py +9 -3
  30. snowflake/ml/fileset/stage_fs.py +2 -1
  31. snowflake/ml/lineage/lineage_node.py +7 -2
  32. snowflake/ml/model/__init__.py +1 -2
  33. snowflake/ml/model/_client/model/model_version_impl.py +96 -12
  34. snowflake/ml/model/_client/ops/model_ops.py +124 -6
  35. snowflake/ml/model/_client/ops/service_ops.py +309 -9
  36. snowflake/ml/model/_client/service/model_deployment_spec.py +8 -5
  37. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +2 -2
  38. snowflake/ml/model/_client/sql/_base.py +5 -0
  39. snowflake/ml/model/_client/sql/model.py +1 -0
  40. snowflake/ml/model/_client/sql/model_version.py +9 -5
  41. snowflake/ml/model/_client/sql/service.py +121 -20
  42. snowflake/ml/model/_model_composer/model_composer.py +11 -39
  43. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -11
  44. snowflake/ml/model/_packager/model_env/model_env.py +4 -38
  45. snowflake/ml/model/_packager/model_handlers/_utils.py +134 -28
  46. snowflake/ml/model/_packager/model_handlers/catboost.py +31 -30
  47. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +26 -18
  48. snowflake/ml/model/_packager/model_handlers/lightgbm.py +31 -58
  49. snowflake/ml/model/_packager/model_handlers/mlflow.py +3 -5
  50. snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +169 -0
  51. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +15 -8
  52. snowflake/ml/model/_packager/model_handlers/sklearn.py +56 -60
  53. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +141 -9
  54. snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
  55. snowflake/ml/model/_packager/model_handlers/xgboost.py +63 -48
  56. snowflake/ml/model/_packager/model_meta/model_meta.py +16 -42
  57. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -14
  58. snowflake/ml/model/_packager/model_packager.py +14 -8
  59. snowflake/ml/model/_packager/model_runtime/model_runtime.py +11 -0
  60. snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
  61. snowflake/ml/model/_signatures/snowpark_handler.py +3 -2
  62. snowflake/ml/model/_signatures/utils.py +9 -0
  63. snowflake/ml/model/type_hints.py +12 -145
  64. snowflake/ml/modeling/_internal/constants.py +1 -0
  65. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
  66. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
  67. snowflake/ml/modeling/_internal/model_specifications.py +2 -0
  68. snowflake/ml/modeling/_internal/model_trainer.py +1 -0
  69. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -4
  70. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
  71. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +130 -166
  72. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +0 -1
  73. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +61 -21
  74. snowflake/ml/modeling/cluster/affinity_propagation.py +61 -21
  75. snowflake/ml/modeling/cluster/agglomerative_clustering.py +61 -21
  76. snowflake/ml/modeling/cluster/birch.py +61 -21
  77. snowflake/ml/modeling/cluster/bisecting_k_means.py +61 -21
  78. snowflake/ml/modeling/cluster/dbscan.py +61 -21
  79. snowflake/ml/modeling/cluster/feature_agglomeration.py +61 -21
  80. snowflake/ml/modeling/cluster/k_means.py +61 -21
  81. snowflake/ml/modeling/cluster/mean_shift.py +61 -21
  82. snowflake/ml/modeling/cluster/mini_batch_k_means.py +61 -21
  83. snowflake/ml/modeling/cluster/optics.py +61 -21
  84. snowflake/ml/modeling/cluster/spectral_biclustering.py +61 -21
  85. snowflake/ml/modeling/cluster/spectral_clustering.py +61 -21
  86. snowflake/ml/modeling/cluster/spectral_coclustering.py +61 -21
  87. snowflake/ml/modeling/compose/column_transformer.py +61 -21
  88. snowflake/ml/modeling/compose/transformed_target_regressor.py +61 -21
  89. snowflake/ml/modeling/covariance/elliptic_envelope.py +61 -21
  90. snowflake/ml/modeling/covariance/empirical_covariance.py +61 -21
  91. snowflake/ml/modeling/covariance/graphical_lasso.py +61 -21
  92. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +61 -21
  93. snowflake/ml/modeling/covariance/ledoit_wolf.py +61 -21
  94. snowflake/ml/modeling/covariance/min_cov_det.py +61 -21
  95. snowflake/ml/modeling/covariance/oas.py +61 -21
  96. snowflake/ml/modeling/covariance/shrunk_covariance.py +61 -21
  97. snowflake/ml/modeling/decomposition/dictionary_learning.py +61 -21
  98. snowflake/ml/modeling/decomposition/factor_analysis.py +61 -21
  99. snowflake/ml/modeling/decomposition/fast_ica.py +61 -21
  100. snowflake/ml/modeling/decomposition/incremental_pca.py +61 -21
  101. snowflake/ml/modeling/decomposition/kernel_pca.py +61 -21
  102. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +61 -21
  103. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +61 -21
  104. snowflake/ml/modeling/decomposition/pca.py +61 -21
  105. snowflake/ml/modeling/decomposition/sparse_pca.py +61 -21
  106. snowflake/ml/modeling/decomposition/truncated_svd.py +61 -21
  107. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +61 -21
  108. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +61 -21
  109. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +61 -21
  110. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +61 -21
  111. snowflake/ml/modeling/ensemble/bagging_classifier.py +61 -21
  112. snowflake/ml/modeling/ensemble/bagging_regressor.py +61 -21
  113. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +61 -21
  114. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +61 -21
  115. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +61 -21
  116. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +61 -21
  117. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +61 -21
  118. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +61 -21
  119. snowflake/ml/modeling/ensemble/isolation_forest.py +61 -21
  120. snowflake/ml/modeling/ensemble/random_forest_classifier.py +61 -21
  121. snowflake/ml/modeling/ensemble/random_forest_regressor.py +61 -21
  122. snowflake/ml/modeling/ensemble/stacking_regressor.py +61 -21
  123. snowflake/ml/modeling/ensemble/voting_classifier.py +61 -21
  124. snowflake/ml/modeling/ensemble/voting_regressor.py +61 -21
  125. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +61 -21
  126. snowflake/ml/modeling/feature_selection/select_fdr.py +61 -21
  127. snowflake/ml/modeling/feature_selection/select_fpr.py +61 -21
  128. snowflake/ml/modeling/feature_selection/select_fwe.py +61 -21
  129. snowflake/ml/modeling/feature_selection/select_k_best.py +61 -21
  130. snowflake/ml/modeling/feature_selection/select_percentile.py +61 -21
  131. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +61 -21
  132. snowflake/ml/modeling/feature_selection/variance_threshold.py +61 -21
  133. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +61 -21
  134. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +61 -21
  135. snowflake/ml/modeling/impute/iterative_imputer.py +61 -21
  136. snowflake/ml/modeling/impute/knn_imputer.py +61 -21
  137. snowflake/ml/modeling/impute/missing_indicator.py +61 -21
  138. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +61 -21
  139. snowflake/ml/modeling/kernel_approximation/nystroem.py +61 -21
  140. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +61 -21
  141. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +61 -21
  142. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +61 -21
  143. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +61 -21
  144. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +61 -21
  145. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +61 -21
  146. snowflake/ml/modeling/linear_model/ard_regression.py +61 -21
  147. snowflake/ml/modeling/linear_model/bayesian_ridge.py +61 -21
  148. snowflake/ml/modeling/linear_model/elastic_net.py +61 -21
  149. snowflake/ml/modeling/linear_model/elastic_net_cv.py +61 -21
  150. snowflake/ml/modeling/linear_model/gamma_regressor.py +61 -21
  151. snowflake/ml/modeling/linear_model/huber_regressor.py +61 -21
  152. snowflake/ml/modeling/linear_model/lars.py +61 -21
  153. snowflake/ml/modeling/linear_model/lars_cv.py +61 -21
  154. snowflake/ml/modeling/linear_model/lasso.py +61 -21
  155. snowflake/ml/modeling/linear_model/lasso_cv.py +61 -21
  156. snowflake/ml/modeling/linear_model/lasso_lars.py +61 -21
  157. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +61 -21
  158. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +61 -21
  159. snowflake/ml/modeling/linear_model/linear_regression.py +61 -21
  160. snowflake/ml/modeling/linear_model/logistic_regression.py +61 -21
  161. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +61 -21
  162. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +61 -21
  163. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +61 -21
  164. snowflake/ml/modeling/linear_model/multi_task_lasso.py +61 -21
  165. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +61 -21
  166. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +61 -21
  167. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +61 -21
  168. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +61 -21
  169. snowflake/ml/modeling/linear_model/perceptron.py +61 -21
  170. snowflake/ml/modeling/linear_model/poisson_regressor.py +61 -21
  171. snowflake/ml/modeling/linear_model/ransac_regressor.py +61 -21
  172. snowflake/ml/modeling/linear_model/ridge.py +61 -21
  173. snowflake/ml/modeling/linear_model/ridge_classifier.py +61 -21
  174. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +61 -21
  175. snowflake/ml/modeling/linear_model/ridge_cv.py +61 -21
  176. snowflake/ml/modeling/linear_model/sgd_classifier.py +61 -21
  177. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +61 -21
  178. snowflake/ml/modeling/linear_model/sgd_regressor.py +61 -21
  179. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +61 -21
  180. snowflake/ml/modeling/linear_model/tweedie_regressor.py +61 -21
  181. snowflake/ml/modeling/manifold/isomap.py +61 -21
  182. snowflake/ml/modeling/manifold/mds.py +61 -21
  183. snowflake/ml/modeling/manifold/spectral_embedding.py +61 -21
  184. snowflake/ml/modeling/manifold/tsne.py +61 -21
  185. snowflake/ml/modeling/metrics/metrics_utils.py +2 -2
  186. snowflake/ml/modeling/metrics/ranking.py +0 -3
  187. snowflake/ml/modeling/metrics/regression.py +0 -3
  188. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +61 -21
  189. snowflake/ml/modeling/mixture/gaussian_mixture.py +61 -21
  190. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +61 -21
  191. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +61 -21
  192. snowflake/ml/modeling/multiclass/output_code_classifier.py +61 -21
  193. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +61 -21
  194. snowflake/ml/modeling/naive_bayes/categorical_nb.py +61 -21
  195. snowflake/ml/modeling/naive_bayes/complement_nb.py +61 -21
  196. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +61 -21
  197. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +61 -21
  198. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +61 -21
  199. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +61 -21
  200. snowflake/ml/modeling/neighbors/kernel_density.py +61 -21
  201. snowflake/ml/modeling/neighbors/local_outlier_factor.py +61 -21
  202. snowflake/ml/modeling/neighbors/nearest_centroid.py +61 -21
  203. snowflake/ml/modeling/neighbors/nearest_neighbors.py +61 -21
  204. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +61 -21
  205. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +61 -21
  206. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +61 -21
  207. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +61 -21
  208. snowflake/ml/modeling/neural_network/mlp_classifier.py +61 -21
  209. snowflake/ml/modeling/neural_network/mlp_regressor.py +61 -21
  210. snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
  211. snowflake/ml/modeling/pipeline/pipeline.py +1 -13
  212. snowflake/ml/modeling/preprocessing/polynomial_features.py +61 -21
  213. snowflake/ml/modeling/semi_supervised/label_propagation.py +61 -21
  214. snowflake/ml/modeling/semi_supervised/label_spreading.py +61 -21
  215. snowflake/ml/modeling/svm/linear_svc.py +61 -21
  216. snowflake/ml/modeling/svm/linear_svr.py +61 -21
  217. snowflake/ml/modeling/svm/nu_svc.py +61 -21
  218. snowflake/ml/modeling/svm/nu_svr.py +61 -21
  219. snowflake/ml/modeling/svm/svc.py +61 -21
  220. snowflake/ml/modeling/svm/svr.py +61 -21
  221. snowflake/ml/modeling/tree/decision_tree_classifier.py +61 -21
  222. snowflake/ml/modeling/tree/decision_tree_regressor.py +61 -21
  223. snowflake/ml/modeling/tree/extra_tree_classifier.py +61 -21
  224. snowflake/ml/modeling/tree/extra_tree_regressor.py +61 -21
  225. snowflake/ml/modeling/xgboost/xgb_classifier.py +64 -23
  226. snowflake/ml/modeling/xgboost/xgb_regressor.py +64 -23
  227. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +64 -23
  228. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +64 -23
  229. snowflake/ml/monitoring/_client/model_monitor.py +126 -0
  230. snowflake/ml/monitoring/_client/model_monitor_manager.py +361 -0
  231. snowflake/ml/monitoring/_client/model_monitor_version.py +1 -0
  232. snowflake/ml/monitoring/_client/monitor_sql_client.py +1335 -0
  233. snowflake/ml/monitoring/_client/queries/record_count.ssql +14 -0
  234. snowflake/ml/monitoring/_client/queries/rmse.ssql +28 -0
  235. snowflake/ml/monitoring/entities/model_monitor_config.py +28 -0
  236. snowflake/ml/monitoring/entities/model_monitor_interval.py +46 -0
  237. snowflake/ml/monitoring/entities/output_score_type.py +90 -0
  238. snowflake/ml/registry/_manager/model_manager.py +4 -0
  239. snowflake/ml/registry/registry.py +166 -8
  240. snowflake/ml/version.py +1 -1
  241. {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/METADATA +43 -9
  242. snowflake_ml_python-1.6.3.dist-info/RECORD +400 -0
  243. {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/WHEEL +1 -1
  244. snowflake/ml/_internal/container_services/image_registry/credential.py +0 -84
  245. snowflake/ml/_internal/container_services/image_registry/http_client.py +0 -127
  246. snowflake/ml/_internal/container_services/image_registry/imagelib.py +0 -400
  247. snowflake/ml/_internal/container_services/image_registry/registry_client.py +0 -212
  248. snowflake/ml/_internal/utils/log_stream_processor.py +0 -30
  249. snowflake/ml/_internal/utils/session_token_manager.py +0 -46
  250. snowflake/ml/_internal/utils/spcs_attribution_utils.py +0 -122
  251. snowflake/ml/_internal/utils/uri.py +0 -77
  252. snowflake/ml/data/torch_dataset.py +0 -33
  253. snowflake/ml/model/_api.py +0 -568
  254. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +0 -12
  255. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +0 -249
  256. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +0 -130
  257. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +0 -36
  258. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +0 -268
  259. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +0 -215
  260. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +0 -53
  261. snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +0 -38
  262. snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +0 -105
  263. snowflake/ml/model/_deploy_client/snowservice/deploy.py +0 -611
  264. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +0 -116
  265. snowflake/ml/model/_deploy_client/snowservice/instance_types.py +0 -10
  266. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +0 -28
  267. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model +0 -21
  268. snowflake/ml/model/_deploy_client/utils/constants.py +0 -48
  269. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +0 -280
  270. snowflake/ml/model/_deploy_client/warehouse/deploy.py +0 -202
  271. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +0 -99
  272. snowflake/ml/model/_packager/model_handlers/llm.py +0 -267
  273. snowflake/ml/model/_packager/model_meta/_core_requirements.py +0 -11
  274. snowflake/ml/model/deploy_platforms.py +0 -6
  275. snowflake/ml/model/models/llm.py +0 -104
  276. snowflake/ml/monitoring/monitor.py +0 -203
  277. snowflake/ml/registry/_initial_schema.py +0 -142
  278. snowflake/ml/registry/_schema.py +0 -82
  279. snowflake/ml/registry/_schema_upgrade_plans.py +0 -116
  280. snowflake/ml/registry/_schema_version_manager.py +0 -163
  281. snowflake/ml/registry/model_registry.py +0 -2048
  282. snowflake_ml_python-1.6.1.dist-info/RECORD +0 -422
  283. {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/LICENSE.txt +0 -0
  284. {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
4
4
  #
5
5
  import inspect
6
6
  import os
7
- import posixpath
8
- from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
9
- from typing_extensions import TypeGuard
7
+ from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
10
8
  from uuid import uuid4
11
9
 
12
10
  import cloudpickle as cp
13
- import pandas as pd
14
11
  import numpy as np
12
+ import pandas as pd
15
13
  from numpy import typing as npt
16
14
 
17
15
 
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
24
22
  from snowflake.ml._internal import telemetry
25
23
  from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
26
24
  from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
27
- from snowflake.ml._internal.utils import pkg_version_utils, identifier
25
+ from snowflake.ml._internal.utils import identifier
28
26
  from snowflake.snowpark import DataFrame, Session
29
27
  from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
28
  from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
31
29
  from snowflake.ml.modeling._internal.transformer_protocols import (
32
- ModelTransformHandlers,
33
30
  BatchInferenceKwargsTypedDict,
34
31
  ScoreKwargsTypedDict
35
32
  )
@@ -589,12 +586,23 @@ class DictionaryLearning(BaseTransformer):
589
586
  autogenerated=self._autogenerated,
590
587
  subproject=_SUBPROJECT,
591
588
  )
592
- output_result, fitted_estimator = model_trainer.train_fit_predict(
593
- drop_input_cols=self._drop_input_cols,
594
- expected_output_cols_list=(
595
- self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
596
- ),
589
+ expected_output_cols = (
590
+ self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
597
591
  )
592
+ if isinstance(dataset, DataFrame):
593
+ expected_output_cols, example_output_pd_df = self._align_expected_output(
594
+ "fit_predict", dataset, expected_output_cols, output_cols_prefix
595
+ )
596
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
597
+ drop_input_cols=self._drop_input_cols,
598
+ expected_output_cols_list=expected_output_cols,
599
+ example_output_pd_df=example_output_pd_df,
600
+ )
601
+ else:
602
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
603
+ drop_input_cols=self._drop_input_cols,
604
+ expected_output_cols_list=expected_output_cols,
605
+ )
598
606
  self._sklearn_object = fitted_estimator
599
607
  self._is_fitted = True
600
608
  return output_result
@@ -619,6 +627,7 @@ class DictionaryLearning(BaseTransformer):
619
627
  """
620
628
  self._infer_input_output_cols(dataset)
621
629
  super()._check_dataset_type(dataset)
630
+
622
631
  model_trainer = ModelTrainerBuilder.build_fit_transform(
623
632
  estimator=self._sklearn_object,
624
633
  dataset=dataset,
@@ -675,12 +684,41 @@ class DictionaryLearning(BaseTransformer):
675
684
 
676
685
  return rv
677
686
 
678
- def _align_expected_output_names(
679
- self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
680
- ) -> List[str]:
687
+ def _align_expected_output(
688
+ self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
689
+ ) -> Tuple[List[str], pd.DataFrame]:
690
+ """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
691
+ and output dataframe with 1 line.
692
+ If the method is fit_predict, run 2 lines of data.
693
+ """
681
694
  # in case the inferred output column names dimension is different
682
695
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
683
- sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
696
+
697
+ # For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
698
+ # so change the minimum of number of rows to 2
699
+ num_examples = 2
700
+ statement_params = telemetry.get_function_usage_statement_params(
701
+ project=_PROJECT,
702
+ subproject=_SUBPROJECT,
703
+ function_name=telemetry.get_statement_params_full_func_name(
704
+ inspect.currentframe(), DictionaryLearning.__class__.__name__
705
+ ),
706
+ api_calls=[Session.call],
707
+ custom_tags={"autogen": True} if self._autogenerated else None,
708
+ )
709
+ if output_cols_prefix == "fit_predict_":
710
+ if hasattr(self._sklearn_object, "n_clusters"):
711
+ # cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
712
+ num_examples = self._sklearn_object.n_clusters
713
+ elif hasattr(self._sklearn_object, "min_samples"):
714
+ # OPTICS default min_samples 5, which requires at least 5 lines of data
715
+ num_examples = self._sklearn_object.min_samples
716
+ elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
717
+ # LocalOutlierFactor expects n_neighbors <= n_samples
718
+ num_examples = self._sklearn_object.n_neighbors
719
+ sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
720
+ else:
721
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
684
722
 
685
723
  # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
686
724
  # seen during the fit.
@@ -692,12 +730,14 @@ class DictionaryLearning(BaseTransformer):
692
730
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
693
731
  if self.sample_weight_col:
694
732
  output_df_columns_set -= set(self.sample_weight_col)
733
+
695
734
  # if the dimension of inferred output column names is correct; use it
696
735
  if len(expected_output_cols_list) == len(output_df_columns_set):
697
- return expected_output_cols_list
736
+ return expected_output_cols_list, output_df_pd
698
737
  # otherwise, use the sklearn estimator's output
699
738
  else:
700
- return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
739
+ expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
740
+ return expected_output_cols_list, output_df_pd[expected_output_cols_list]
701
741
 
702
742
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
703
743
  @telemetry.send_api_usage_telemetry(
@@ -743,7 +783,7 @@ class DictionaryLearning(BaseTransformer):
743
783
  drop_input_cols=self._drop_input_cols,
744
784
  expected_output_cols_type="float",
745
785
  )
746
- expected_output_cols = self._align_expected_output_names(
786
+ expected_output_cols, _ = self._align_expected_output(
747
787
  inference_method, dataset, expected_output_cols, output_cols_prefix
748
788
  )
749
789
 
@@ -809,7 +849,7 @@ class DictionaryLearning(BaseTransformer):
809
849
  drop_input_cols=self._drop_input_cols,
810
850
  expected_output_cols_type="float",
811
851
  )
812
- expected_output_cols = self._align_expected_output_names(
852
+ expected_output_cols, _ = self._align_expected_output(
813
853
  inference_method, dataset, expected_output_cols, output_cols_prefix
814
854
  )
815
855
  elif isinstance(dataset, pd.DataFrame):
@@ -872,7 +912,7 @@ class DictionaryLearning(BaseTransformer):
872
912
  drop_input_cols=self._drop_input_cols,
873
913
  expected_output_cols_type="float",
874
914
  )
875
- expected_output_cols = self._align_expected_output_names(
915
+ expected_output_cols, _ = self._align_expected_output(
876
916
  inference_method, dataset, expected_output_cols, output_cols_prefix
877
917
  )
878
918
 
@@ -937,7 +977,7 @@ class DictionaryLearning(BaseTransformer):
937
977
  drop_input_cols = self._drop_input_cols,
938
978
  expected_output_cols_type="float",
939
979
  )
940
- expected_output_cols = self._align_expected_output_names(
980
+ expected_output_cols, _ = self._align_expected_output(
941
981
  inference_method, dataset, expected_output_cols, output_cols_prefix
942
982
  )
943
983
 
@@ -4,14 +4,12 @@
4
4
  #
5
5
  import inspect
6
6
  import os
7
- import posixpath
8
- from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
9
- from typing_extensions import TypeGuard
7
+ from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
10
8
  from uuid import uuid4
11
9
 
12
10
  import cloudpickle as cp
13
- import pandas as pd
14
11
  import numpy as np
12
+ import pandas as pd
15
13
  from numpy import typing as npt
16
14
 
17
15
 
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
24
22
  from snowflake.ml._internal import telemetry
25
23
  from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
26
24
  from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
27
- from snowflake.ml._internal.utils import pkg_version_utils, identifier
25
+ from snowflake.ml._internal.utils import identifier
28
26
  from snowflake.snowpark import DataFrame, Session
29
27
  from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
28
  from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
31
29
  from snowflake.ml.modeling._internal.transformer_protocols import (
32
- ModelTransformHandlers,
33
30
  BatchInferenceKwargsTypedDict,
34
31
  ScoreKwargsTypedDict
35
32
  )
@@ -526,12 +523,23 @@ class FactorAnalysis(BaseTransformer):
526
523
  autogenerated=self._autogenerated,
527
524
  subproject=_SUBPROJECT,
528
525
  )
529
- output_result, fitted_estimator = model_trainer.train_fit_predict(
530
- drop_input_cols=self._drop_input_cols,
531
- expected_output_cols_list=(
532
- self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
533
- ),
526
+ expected_output_cols = (
527
+ self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
534
528
  )
529
+ if isinstance(dataset, DataFrame):
530
+ expected_output_cols, example_output_pd_df = self._align_expected_output(
531
+ "fit_predict", dataset, expected_output_cols, output_cols_prefix
532
+ )
533
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
534
+ drop_input_cols=self._drop_input_cols,
535
+ expected_output_cols_list=expected_output_cols,
536
+ example_output_pd_df=example_output_pd_df,
537
+ )
538
+ else:
539
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
540
+ drop_input_cols=self._drop_input_cols,
541
+ expected_output_cols_list=expected_output_cols,
542
+ )
535
543
  self._sklearn_object = fitted_estimator
536
544
  self._is_fitted = True
537
545
  return output_result
@@ -556,6 +564,7 @@ class FactorAnalysis(BaseTransformer):
556
564
  """
557
565
  self._infer_input_output_cols(dataset)
558
566
  super()._check_dataset_type(dataset)
567
+
559
568
  model_trainer = ModelTrainerBuilder.build_fit_transform(
560
569
  estimator=self._sklearn_object,
561
570
  dataset=dataset,
@@ -612,12 +621,41 @@ class FactorAnalysis(BaseTransformer):
612
621
 
613
622
  return rv
614
623
 
615
- def _align_expected_output_names(
616
- self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
617
- ) -> List[str]:
624
+ def _align_expected_output(
625
+ self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
626
+ ) -> Tuple[List[str], pd.DataFrame]:
627
+ """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
628
+ and output dataframe with 1 line.
629
+ If the method is fit_predict, run 2 lines of data.
630
+ """
618
631
  # in case the inferred output column names dimension is different
619
632
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
620
- sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
633
+
634
+ # For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
635
+ # so change the minimum of number of rows to 2
636
+ num_examples = 2
637
+ statement_params = telemetry.get_function_usage_statement_params(
638
+ project=_PROJECT,
639
+ subproject=_SUBPROJECT,
640
+ function_name=telemetry.get_statement_params_full_func_name(
641
+ inspect.currentframe(), FactorAnalysis.__class__.__name__
642
+ ),
643
+ api_calls=[Session.call],
644
+ custom_tags={"autogen": True} if self._autogenerated else None,
645
+ )
646
+ if output_cols_prefix == "fit_predict_":
647
+ if hasattr(self._sklearn_object, "n_clusters"):
648
+ # cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
649
+ num_examples = self._sklearn_object.n_clusters
650
+ elif hasattr(self._sklearn_object, "min_samples"):
651
+ # OPTICS default min_samples 5, which requires at least 5 lines of data
652
+ num_examples = self._sklearn_object.min_samples
653
+ elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
654
+ # LocalOutlierFactor expects n_neighbors <= n_samples
655
+ num_examples = self._sklearn_object.n_neighbors
656
+ sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
657
+ else:
658
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
621
659
 
622
660
  # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
623
661
  # seen during the fit.
@@ -629,12 +667,14 @@ class FactorAnalysis(BaseTransformer):
629
667
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
630
668
  if self.sample_weight_col:
631
669
  output_df_columns_set -= set(self.sample_weight_col)
670
+
632
671
  # if the dimension of inferred output column names is correct; use it
633
672
  if len(expected_output_cols_list) == len(output_df_columns_set):
634
- return expected_output_cols_list
673
+ return expected_output_cols_list, output_df_pd
635
674
  # otherwise, use the sklearn estimator's output
636
675
  else:
637
- return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
676
+ expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
677
+ return expected_output_cols_list, output_df_pd[expected_output_cols_list]
638
678
 
639
679
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
640
680
  @telemetry.send_api_usage_telemetry(
@@ -680,7 +720,7 @@ class FactorAnalysis(BaseTransformer):
680
720
  drop_input_cols=self._drop_input_cols,
681
721
  expected_output_cols_type="float",
682
722
  )
683
- expected_output_cols = self._align_expected_output_names(
723
+ expected_output_cols, _ = self._align_expected_output(
684
724
  inference_method, dataset, expected_output_cols, output_cols_prefix
685
725
  )
686
726
 
@@ -746,7 +786,7 @@ class FactorAnalysis(BaseTransformer):
746
786
  drop_input_cols=self._drop_input_cols,
747
787
  expected_output_cols_type="float",
748
788
  )
749
- expected_output_cols = self._align_expected_output_names(
789
+ expected_output_cols, _ = self._align_expected_output(
750
790
  inference_method, dataset, expected_output_cols, output_cols_prefix
751
791
  )
752
792
  elif isinstance(dataset, pd.DataFrame):
@@ -809,7 +849,7 @@ class FactorAnalysis(BaseTransformer):
809
849
  drop_input_cols=self._drop_input_cols,
810
850
  expected_output_cols_type="float",
811
851
  )
812
- expected_output_cols = self._align_expected_output_names(
852
+ expected_output_cols, _ = self._align_expected_output(
813
853
  inference_method, dataset, expected_output_cols, output_cols_prefix
814
854
  )
815
855
 
@@ -876,7 +916,7 @@ class FactorAnalysis(BaseTransformer):
876
916
  drop_input_cols = self._drop_input_cols,
877
917
  expected_output_cols_type="float",
878
918
  )
879
- expected_output_cols = self._align_expected_output_names(
919
+ expected_output_cols, _ = self._align_expected_output(
880
920
  inference_method, dataset, expected_output_cols, output_cols_prefix
881
921
  )
882
922
 
@@ -4,14 +4,12 @@
4
4
  #
5
5
  import inspect
6
6
  import os
7
- import posixpath
8
- from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
9
- from typing_extensions import TypeGuard
7
+ from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
10
8
  from uuid import uuid4
11
9
 
12
10
  import cloudpickle as cp
13
- import pandas as pd
14
11
  import numpy as np
12
+ import pandas as pd
15
13
  from numpy import typing as npt
16
14
 
17
15
 
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
24
22
  from snowflake.ml._internal import telemetry
25
23
  from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
26
24
  from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
27
- from snowflake.ml._internal.utils import pkg_version_utils, identifier
25
+ from snowflake.ml._internal.utils import identifier
28
26
  from snowflake.snowpark import DataFrame, Session
29
27
  from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
28
  from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
31
29
  from snowflake.ml.modeling._internal.transformer_protocols import (
32
- ModelTransformHandlers,
33
30
  BatchInferenceKwargsTypedDict,
34
31
  ScoreKwargsTypedDict
35
32
  )
@@ -544,12 +541,23 @@ class FastICA(BaseTransformer):
544
541
  autogenerated=self._autogenerated,
545
542
  subproject=_SUBPROJECT,
546
543
  )
547
- output_result, fitted_estimator = model_trainer.train_fit_predict(
548
- drop_input_cols=self._drop_input_cols,
549
- expected_output_cols_list=(
550
- self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
551
- ),
544
+ expected_output_cols = (
545
+ self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
552
546
  )
547
+ if isinstance(dataset, DataFrame):
548
+ expected_output_cols, example_output_pd_df = self._align_expected_output(
549
+ "fit_predict", dataset, expected_output_cols, output_cols_prefix
550
+ )
551
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
552
+ drop_input_cols=self._drop_input_cols,
553
+ expected_output_cols_list=expected_output_cols,
554
+ example_output_pd_df=example_output_pd_df,
555
+ )
556
+ else:
557
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
558
+ drop_input_cols=self._drop_input_cols,
559
+ expected_output_cols_list=expected_output_cols,
560
+ )
553
561
  self._sklearn_object = fitted_estimator
554
562
  self._is_fitted = True
555
563
  return output_result
@@ -574,6 +582,7 @@ class FastICA(BaseTransformer):
574
582
  """
575
583
  self._infer_input_output_cols(dataset)
576
584
  super()._check_dataset_type(dataset)
585
+
577
586
  model_trainer = ModelTrainerBuilder.build_fit_transform(
578
587
  estimator=self._sklearn_object,
579
588
  dataset=dataset,
@@ -630,12 +639,41 @@ class FastICA(BaseTransformer):
630
639
 
631
640
  return rv
632
641
 
633
- def _align_expected_output_names(
634
- self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
635
- ) -> List[str]:
642
+ def _align_expected_output(
643
+ self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
644
+ ) -> Tuple[List[str], pd.DataFrame]:
645
+ """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
646
+ and output dataframe with 1 line.
647
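+ For fit_predict output (prefix fit_predict_), 2 lines are sampled by default, or a different count when the estimator's n_clusters, min_samples or n_neighbors calls for it.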
648
+ """
636
649
  # in case the inferred output column names dimension is different
637
650
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
638
- sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
651
+
652
+ # For fit_predict, some estimators (e.g. MinCovDet, BayesianGaussianMixture) need at least 2 rows,
653
+ # so the default number of sampled rows is 2
654
+ num_examples = 2
655
+ statement_params = telemetry.get_function_usage_statement_params(
656
+ project=_PROJECT,
657
+ subproject=_SUBPROJECT,
658
+ function_name=telemetry.get_statement_params_full_func_name(
659
+ inspect.currentframe(), FastICA.__class__.__name__
660
+ ),
661
+ api_calls=[Session.call],
662
+ custom_tags={"autogen": True} if self._autogenerated else None,
663
+ )
664
+ if output_cols_prefix == "fit_predict_":
665
+ if hasattr(self._sklearn_object, "n_clusters"):
666
+ # cluster classes such as BisectingKMeans require the number of examples to be >= n_clusters
667
+ num_examples = self._sklearn_object.n_clusters
668
+ elif hasattr(self._sklearn_object, "min_samples"):
669
+ # OPTICS default min_samples 5, which requires at least 5 lines of data
670
+ num_examples = self._sklearn_object.min_samples
671
+ elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
672
+ # LocalOutlierFactor expects n_neighbors <= n_samples
673
+ num_examples = self._sklearn_object.n_neighbors
674
+ sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
675
+ else:
676
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
639
677
 
640
678
  # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
641
679
  # seen during the fit.
@@ -647,12 +685,14 @@ class FastICA(BaseTransformer):
647
685
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
648
686
  if self.sample_weight_col:
649
687
  output_df_columns_set -= set(self.sample_weight_col)
688
+
650
689
  # if the dimension of inferred output column names is correct; use it
651
690
  if len(expected_output_cols_list) == len(output_df_columns_set):
652
- return expected_output_cols_list
691
+ return expected_output_cols_list, output_df_pd
653
692
  # otherwise, use the sklearn estimator's output
654
693
  else:
655
- return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
694
+ expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
695
+ return expected_output_cols_list, output_df_pd[expected_output_cols_list]
656
696
 
657
697
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
658
698
  @telemetry.send_api_usage_telemetry(
@@ -698,7 +738,7 @@ class FastICA(BaseTransformer):
698
738
  drop_input_cols=self._drop_input_cols,
699
739
  expected_output_cols_type="float",
700
740
  )
701
- expected_output_cols = self._align_expected_output_names(
741
+ expected_output_cols, _ = self._align_expected_output(
702
742
  inference_method, dataset, expected_output_cols, output_cols_prefix
703
743
  )
704
744
 
@@ -764,7 +804,7 @@ class FastICA(BaseTransformer):
764
804
  drop_input_cols=self._drop_input_cols,
765
805
  expected_output_cols_type="float",
766
806
  )
767
- expected_output_cols = self._align_expected_output_names(
807
+ expected_output_cols, _ = self._align_expected_output(
768
808
  inference_method, dataset, expected_output_cols, output_cols_prefix
769
809
  )
770
810
  elif isinstance(dataset, pd.DataFrame):
@@ -827,7 +867,7 @@ class FastICA(BaseTransformer):
827
867
  drop_input_cols=self._drop_input_cols,
828
868
  expected_output_cols_type="float",
829
869
  )
830
- expected_output_cols = self._align_expected_output_names(
870
+ expected_output_cols, _ = self._align_expected_output(
831
871
  inference_method, dataset, expected_output_cols, output_cols_prefix
832
872
  )
833
873
 
@@ -892,7 +932,7 @@ class FastICA(BaseTransformer):
892
932
  drop_input_cols = self._drop_input_cols,
893
933
  expected_output_cols_type="float",
894
934
  )
895
- expected_output_cols = self._align_expected_output_names(
935
+ expected_output_cols, _ = self._align_expected_output(
896
936
  inference_method, dataset, expected_output_cols, output_cols_prefix
897
937
  )
898
938
 
@@ -4,14 +4,12 @@
4
4
  #
5
5
  import inspect
6
6
  import os
7
- import posixpath
8
- from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
9
- from typing_extensions import TypeGuard
7
+ from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
10
8
  from uuid import uuid4
11
9
 
12
10
  import cloudpickle as cp
13
- import pandas as pd
14
11
  import numpy as np
12
+ import pandas as pd
15
13
  from numpy import typing as npt
16
14
 
17
15
 
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
24
22
  from snowflake.ml._internal import telemetry
25
23
  from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
26
24
  from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
27
- from snowflake.ml._internal.utils import pkg_version_utils, identifier
25
+ from snowflake.ml._internal.utils import identifier
28
26
  from snowflake.snowpark import DataFrame, Session
29
27
  from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
28
  from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
31
29
  from snowflake.ml.modeling._internal.transformer_protocols import (
32
- ModelTransformHandlers,
33
30
  BatchInferenceKwargsTypedDict,
34
31
  ScoreKwargsTypedDict
35
32
  )
@@ -496,12 +493,23 @@ class IncrementalPCA(BaseTransformer):
496
493
  autogenerated=self._autogenerated,
497
494
  subproject=_SUBPROJECT,
498
495
  )
499
- output_result, fitted_estimator = model_trainer.train_fit_predict(
500
- drop_input_cols=self._drop_input_cols,
501
- expected_output_cols_list=(
502
- self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
503
- ),
496
+ expected_output_cols = (
497
+ self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
504
498
  )
499
+ if isinstance(dataset, DataFrame):
500
+ expected_output_cols, example_output_pd_df = self._align_expected_output(
501
+ "fit_predict", dataset, expected_output_cols, output_cols_prefix
502
+ )
503
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
504
+ drop_input_cols=self._drop_input_cols,
505
+ expected_output_cols_list=expected_output_cols,
506
+ example_output_pd_df=example_output_pd_df,
507
+ )
508
+ else:
509
+ output_result, fitted_estimator = model_trainer.train_fit_predict(
510
+ drop_input_cols=self._drop_input_cols,
511
+ expected_output_cols_list=expected_output_cols,
512
+ )
505
513
  self._sklearn_object = fitted_estimator
506
514
  self._is_fitted = True
507
515
  return output_result
@@ -526,6 +534,7 @@ class IncrementalPCA(BaseTransformer):
526
534
  """
527
535
  self._infer_input_output_cols(dataset)
528
536
  super()._check_dataset_type(dataset)
537
+
529
538
  model_trainer = ModelTrainerBuilder.build_fit_transform(
530
539
  estimator=self._sklearn_object,
531
540
  dataset=dataset,
@@ -582,12 +591,41 @@ class IncrementalPCA(BaseTransformer):
582
591
 
583
592
  return rv
584
593
 
585
- def _align_expected_output_names(
586
- self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
587
- ) -> List[str]:
594
+ def _align_expected_output(
595
+ self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
596
+ ) -> Tuple[List[str], pd.DataFrame]:
597
+ """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
598
+ and output dataframe with 1 line.
599
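+ For fit_predict output (prefix fit_predict_), 2 lines are sampled by default, or a different count when the estimator's n_clusters, min_samples or n_neighbors calls for it.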
600
+ """
588
601
  # in case the inferred output column names dimension is different
589
602
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
590
- sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
603
+
604
+ # For fit_predict, some estimators (e.g. MinCovDet, BayesianGaussianMixture) need at least 2 rows,
605
+ # so the default number of sampled rows is 2
606
+ num_examples = 2
607
+ statement_params = telemetry.get_function_usage_statement_params(
608
+ project=_PROJECT,
609
+ subproject=_SUBPROJECT,
610
+ function_name=telemetry.get_statement_params_full_func_name(
611
+ inspect.currentframe(), IncrementalPCA.__class__.__name__
612
+ ),
613
+ api_calls=[Session.call],
614
+ custom_tags={"autogen": True} if self._autogenerated else None,
615
+ )
616
+ if output_cols_prefix == "fit_predict_":
617
+ if hasattr(self._sklearn_object, "n_clusters"):
618
+ # cluster classes such as BisectingKMeans require the number of examples to be >= n_clusters
619
+ num_examples = self._sklearn_object.n_clusters
620
+ elif hasattr(self._sklearn_object, "min_samples"):
621
+ # OPTICS default min_samples 5, which requires at least 5 lines of data
622
+ num_examples = self._sklearn_object.min_samples
623
+ elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
624
+ # LocalOutlierFactor expects n_neighbors <= n_samples
625
+ num_examples = self._sklearn_object.n_neighbors
626
+ sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
627
+ else:
628
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
591
629
 
592
630
  # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
593
631
  # seen during the fit.
@@ -599,12 +637,14 @@ class IncrementalPCA(BaseTransformer):
599
637
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
600
638
  if self.sample_weight_col:
601
639
  output_df_columns_set -= set(self.sample_weight_col)
640
+
602
641
  # if the dimension of inferred output column names is correct; use it
603
642
  if len(expected_output_cols_list) == len(output_df_columns_set):
604
- return expected_output_cols_list
643
+ return expected_output_cols_list, output_df_pd
605
644
  # otherwise, use the sklearn estimator's output
606
645
  else:
607
- return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
646
+ expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
647
+ return expected_output_cols_list, output_df_pd[expected_output_cols_list]
608
648
 
609
649
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
610
650
  @telemetry.send_api_usage_telemetry(
@@ -650,7 +690,7 @@ class IncrementalPCA(BaseTransformer):
650
690
  drop_input_cols=self._drop_input_cols,
651
691
  expected_output_cols_type="float",
652
692
  )
653
- expected_output_cols = self._align_expected_output_names(
693
+ expected_output_cols, _ = self._align_expected_output(
654
694
  inference_method, dataset, expected_output_cols, output_cols_prefix
655
695
  )
656
696
 
@@ -716,7 +756,7 @@ class IncrementalPCA(BaseTransformer):
716
756
  drop_input_cols=self._drop_input_cols,
717
757
  expected_output_cols_type="float",
718
758
  )
719
- expected_output_cols = self._align_expected_output_names(
759
+ expected_output_cols, _ = self._align_expected_output(
720
760
  inference_method, dataset, expected_output_cols, output_cols_prefix
721
761
  )
722
762
  elif isinstance(dataset, pd.DataFrame):
@@ -779,7 +819,7 @@ class IncrementalPCA(BaseTransformer):
779
819
  drop_input_cols=self._drop_input_cols,
780
820
  expected_output_cols_type="float",
781
821
  )
782
- expected_output_cols = self._align_expected_output_names(
822
+ expected_output_cols, _ = self._align_expected_output(
783
823
  inference_method, dataset, expected_output_cols, output_cols_prefix
784
824
  )
785
825
 
@@ -844,7 +884,7 @@ class IncrementalPCA(BaseTransformer):
844
884
  drop_input_cols = self._drop_input_cols,
845
885
  expected_output_cols_type="float",
846
886
  )
847
- expected_output_cols = self._align_expected_output_names(
887
+ expected_output_cols, _ = self._align_expected_output(
848
888
  inference_method, dataset, expected_output_cols, output_cols_prefix
849
889
  )
850
890