snowflake-ml-python 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/telemetry.py +19 -0
- snowflake/ml/model/_client/ops/model_ops.py +16 -38
- snowflake/ml/model/_client/sql/model.py +1 -7
- snowflake/ml/model/_client/sql/model_version.py +20 -15
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +1 -6
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +0 -2
- snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +10 -1
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -2
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +11 -1
- snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +3 -0
- snowflake/ml/model/_packager/model_meta/model_meta.py +17 -3
- snowflake/ml/model/type_hints.py +3 -0
- snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +63 -95
- snowflake/ml/modeling/_internal/snowpark_handlers.py +9 -6
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +16 -0
- snowflake/ml/modeling/cluster/affinity_propagation.py +16 -0
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +16 -0
- snowflake/ml/modeling/cluster/birch.py +16 -0
- snowflake/ml/modeling/cluster/bisecting_k_means.py +16 -0
- snowflake/ml/modeling/cluster/dbscan.py +16 -0
- snowflake/ml/modeling/cluster/feature_agglomeration.py +16 -0
- snowflake/ml/modeling/cluster/k_means.py +16 -0
- snowflake/ml/modeling/cluster/mean_shift.py +16 -0
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +16 -0
- snowflake/ml/modeling/cluster/optics.py +16 -0
- snowflake/ml/modeling/cluster/spectral_biclustering.py +16 -0
- snowflake/ml/modeling/cluster/spectral_clustering.py +16 -0
- snowflake/ml/modeling/cluster/spectral_coclustering.py +16 -0
- snowflake/ml/modeling/compose/column_transformer.py +16 -0
- snowflake/ml/modeling/compose/transformed_target_regressor.py +16 -0
- snowflake/ml/modeling/covariance/elliptic_envelope.py +16 -0
- snowflake/ml/modeling/covariance/empirical_covariance.py +16 -0
- snowflake/ml/modeling/covariance/graphical_lasso.py +16 -0
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +16 -0
- snowflake/ml/modeling/covariance/ledoit_wolf.py +16 -0
- snowflake/ml/modeling/covariance/min_cov_det.py +16 -0
- snowflake/ml/modeling/covariance/oas.py +16 -0
- snowflake/ml/modeling/covariance/shrunk_covariance.py +16 -0
- snowflake/ml/modeling/decomposition/dictionary_learning.py +16 -0
- snowflake/ml/modeling/decomposition/factor_analysis.py +16 -0
- snowflake/ml/modeling/decomposition/fast_ica.py +16 -0
- snowflake/ml/modeling/decomposition/incremental_pca.py +16 -0
- snowflake/ml/modeling/decomposition/kernel_pca.py +16 -0
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +16 -0
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +16 -0
- snowflake/ml/modeling/decomposition/pca.py +16 -0
- snowflake/ml/modeling/decomposition/sparse_pca.py +16 -0
- snowflake/ml/modeling/decomposition/truncated_svd.py +16 -0
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +16 -0
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +16 -0
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +16 -0
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +16 -0
- snowflake/ml/modeling/ensemble/bagging_classifier.py +16 -0
- snowflake/ml/modeling/ensemble/bagging_regressor.py +16 -0
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +16 -0
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +16 -0
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +16 -0
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +16 -0
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +16 -0
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +16 -0
- snowflake/ml/modeling/ensemble/isolation_forest.py +16 -0
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +16 -0
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +16 -0
- snowflake/ml/modeling/ensemble/stacking_regressor.py +16 -0
- snowflake/ml/modeling/ensemble/voting_classifier.py +16 -0
- snowflake/ml/modeling/ensemble/voting_regressor.py +16 -0
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +16 -0
- snowflake/ml/modeling/feature_selection/select_fdr.py +16 -0
- snowflake/ml/modeling/feature_selection/select_fpr.py +16 -0
- snowflake/ml/modeling/feature_selection/select_fwe.py +16 -0
- snowflake/ml/modeling/feature_selection/select_k_best.py +16 -0
- snowflake/ml/modeling/feature_selection/select_percentile.py +16 -0
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +16 -0
- snowflake/ml/modeling/feature_selection/variance_threshold.py +16 -0
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +16 -0
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +16 -0
- snowflake/ml/modeling/impute/iterative_imputer.py +16 -0
- snowflake/ml/modeling/impute/knn_imputer.py +16 -0
- snowflake/ml/modeling/impute/missing_indicator.py +16 -0
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +16 -0
- snowflake/ml/modeling/kernel_approximation/nystroem.py +16 -0
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +16 -0
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +16 -0
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +16 -0
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +16 -0
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +16 -0
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/ard_regression.py +16 -0
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +16 -0
- snowflake/ml/modeling/linear_model/elastic_net.py +16 -0
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +16 -0
- snowflake/ml/modeling/linear_model/gamma_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/huber_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/lars.py +16 -0
- snowflake/ml/modeling/linear_model/lars_cv.py +16 -0
- snowflake/ml/modeling/linear_model/lasso.py +16 -0
- snowflake/ml/modeling/linear_model/lasso_cv.py +16 -0
- snowflake/ml/modeling/linear_model/lasso_lars.py +16 -0
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +16 -0
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +16 -0
- snowflake/ml/modeling/linear_model/linear_regression.py +16 -0
- snowflake/ml/modeling/linear_model/logistic_regression.py +16 -0
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +16 -0
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +16 -0
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +16 -0
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +16 -0
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +16 -0
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +16 -0
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +16 -0
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/perceptron.py +16 -0
- snowflake/ml/modeling/linear_model/poisson_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/ransac_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/ridge.py +16 -0
- snowflake/ml/modeling/linear_model/ridge_classifier.py +16 -0
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +16 -0
- snowflake/ml/modeling/linear_model/ridge_cv.py +16 -0
- snowflake/ml/modeling/linear_model/sgd_classifier.py +16 -0
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +16 -0
- snowflake/ml/modeling/linear_model/sgd_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +16 -0
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +16 -0
- snowflake/ml/modeling/manifold/isomap.py +16 -0
- snowflake/ml/modeling/manifold/mds.py +16 -0
- snowflake/ml/modeling/manifold/spectral_embedding.py +16 -0
- snowflake/ml/modeling/manifold/tsne.py +16 -0
- snowflake/ml/modeling/metrics/classification.py +5 -6
- snowflake/ml/modeling/metrics/metrics_utils.py +5 -3
- snowflake/ml/modeling/metrics/ranking.py +7 -3
- snowflake/ml/modeling/metrics/regression.py +6 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +16 -0
- snowflake/ml/modeling/mixture/gaussian_mixture.py +16 -0
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +16 -0
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +16 -0
- snowflake/ml/modeling/multiclass/output_code_classifier.py +16 -0
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +16 -0
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +16 -0
- snowflake/ml/modeling/naive_bayes/complement_nb.py +16 -0
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +16 -0
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +16 -0
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +16 -0
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +16 -0
- snowflake/ml/modeling/neighbors/kernel_density.py +16 -0
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +16 -0
- snowflake/ml/modeling/neighbors/nearest_centroid.py +16 -0
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +16 -0
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +16 -0
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +16 -0
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +16 -0
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +16 -0
- snowflake/ml/modeling/neural_network/mlp_classifier.py +16 -0
- snowflake/ml/modeling/neural_network/mlp_regressor.py +16 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +16 -0
- snowflake/ml/modeling/semi_supervised/label_propagation.py +16 -0
- snowflake/ml/modeling/semi_supervised/label_spreading.py +16 -0
- snowflake/ml/modeling/svm/linear_svc.py +16 -0
- snowflake/ml/modeling/svm/linear_svr.py +16 -0
- snowflake/ml/modeling/svm/nu_svc.py +16 -0
- snowflake/ml/modeling/svm/nu_svr.py +16 -0
- snowflake/ml/modeling/svm/svc.py +16 -0
- snowflake/ml/modeling/svm/svr.py +16 -0
- snowflake/ml/modeling/tree/decision_tree_classifier.py +16 -0
- snowflake/ml/modeling/tree/decision_tree_regressor.py +16 -0
- snowflake/ml/modeling/tree/extra_tree_classifier.py +16 -0
- snowflake/ml/modeling/tree/extra_tree_regressor.py +16 -0
- snowflake/ml/modeling/xgboost/xgb_classifier.py +16 -0
- snowflake/ml/modeling/xgboost/xgb_regressor.py +16 -0
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +16 -0
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +16 -0
- snowflake/ml/registry/registry.py +2 -0
- snowflake/ml/version.py +1 -1
- snowflake_ml_python-1.2.1.dist-info/LICENSE.txt +202 -0
- {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.1.dist-info}/METADATA +261 -50
- {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.1.dist-info}/RECORD +189 -186
- {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.1.dist-info}/WHEEL +2 -1
- snowflake_ml_python-1.2.1.dist-info/top_level.txt +1 -0
@@ -623,6 +623,22 @@ class PassiveAggressiveClassifier(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
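The same added block appears in every generated estimator wrapper listed in the file summary above. Below is a minimal, self-contained sketch of the fallback it encodes; the class and function names are invented for illustration only, and the real wrappers operate on self._sklearn_object and Snowpark signature objects rather than plain strings.

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class _FakeEstimator:
    n_clusters: Optional[int] = None
    n_components: Optional[int] = None


def infer_expected_dtype(est: _FakeEstimator, output_cols: List[str], input_types: List[str]) -> str:
    # Clustering/decomposition estimators whose output width differs from the
    # declared output columns fall back to ARRAY, mirroring the diff above.
    if est.n_clusters is not None and est.n_clusters != len(output_cols):
        return "ARRAY"
    if est.n_components is not None and est.n_components != len(output_cols):
        return "ARRAY"
    # Reuse the input type only when all inputs share one type and the column
    # counts line up; otherwise keep the ARRAY fallback.
    if input_types and all(t == input_types[0] for t in input_types) and len(input_types) == len(output_cols):
        return input_types[0]
    return "ARRAY"


print(infer_expected_dtype(_FakeEstimator(n_components=2), ["OUT_1"], ["FLOAT"]))      # ARRAY
print(infer_expected_dtype(_FakeEstimator(), ["OUT_1", "OUT_2"], ["FLOAT", "FLOAT"]))  # FLOAT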
@@ -609,6 +609,22 @@ class PassiveAggressiveRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -622,6 +622,22 @@ class Perceptron(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -571,6 +571,22 @@ class PoissonRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -627,6 +627,22 @@ class RANSACRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -619,6 +619,22 @@ class Ridge(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -619,6 +619,22 @@ class RidgeClassifier(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -570,6 +570,22 @@ class RidgeClassifierCV(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -591,6 +591,22 @@ class RidgeCV(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -710,6 +710,22 @@ class SGDClassifier(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -608,6 +608,22 @@ class SGDOneClassSVM(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -676,6 +676,22 @@ class SGDRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -578,6 +578,22 @@ class TheilSenRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -604,6 +604,22 @@ class TweedieRegressor(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -600,6 +600,22 @@ class Isomap(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -581,6 +581,22 @@ class MDS(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -583,6 +583,22 @@ class SpectralEmbedding(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -642,6 +642,22 @@ class TSNE(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",
@@ -228,16 +228,15 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
     Returns:
         Name of the UDTF.
     """
+    batch_size = metrics_utils.BATCH_SIZE
 
     class ConfusionMatrixComputer:
-        BATCH_SIZE = 1000
-
         def __init__(self) -> None:
             self._initialized = False
             self._confusion_matrix = np.zeros((1, 1))
-            # 2d array containing a batch of input rows. A batch contains
+            # 2d array containing a batch of input rows. A batch contains metrics_utils.BATCH_SIZE rows.
             # [sample_weight, y_true, y_pred]
-            self._batched_rows = np.zeros((
+            self._batched_rows = np.zeros((batch_size, 1))
             # Number of columns in the dataset.
             self._n_cols = -1
             # Running count of number of rows added to self._batched_rows.
@@ -255,7 +254,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
             # 1. Initialize variables.
             if not self._initialized:
                 self._n_cols = len(input_row)
-                self._batched_rows = np.zeros((
+                self._batched_rows = np.zeros((batch_size, self._n_cols))
                 self._n_label = n_label
                 self._confusion_matrix = np.zeros((self._n_label, self._n_label))
                 self._initialized = True
@@ -264,7 +263,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
             self._cur_count += 1
 
             # 2. Compute incremental confusion matrix for the batch.
-            if self._cur_count >=
+            if self._cur_count >= batch_size:
                 self.update_confusion_matrix()
                 self._cur_count = 0
 
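Taken together, these three hunks replace the class-level BATCH_SIZE with the shared metrics_utils.BATCH_SIZE constant and flush the buffered rows into the matrix whenever a batch fills up. A rough, standalone sketch of that batch-and-flush pattern follows; the names are invented for illustration and this is not the package's UDTF code.

import numpy as np

BATCH_SIZE = 1000  # mirrors the constant this release moves into metrics_utils


class BatchedConfusionMatrix:
    def __init__(self, n_labels: int) -> None:
        self._matrix = np.zeros((n_labels, n_labels))
        self._rows = np.zeros((BATCH_SIZE, 3))  # [sample_weight, y_true, y_pred]
        self._count = 0

    def add(self, weight: float, y_true: int, y_pred: int) -> None:
        self._rows[self._count] = (weight, y_true, y_pred)
        self._count += 1
        if self._count >= BATCH_SIZE:
            self._flush()

    def _flush(self) -> None:
        # Accumulate the buffered rows into the matrix, weighted by sample weight.
        rows = self._rows[: self._count]
        np.add.at(self._matrix, (rows[:, 1].astype(int), rows[:, 2].astype(int)), rows[:, 0])
        self._count = 0

    def result(self) -> np.ndarray:
        self._flush()
        return self._matrix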
@@ -15,6 +15,7 @@ from snowflake.snowpark import Session, functions as F, types as T
 
 LABEL = "LABEL"
 INDEX = "INDEX"
+BATCH_SIZE = 1000
 
 
 def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, Any]) -> str:
@@ -82,7 +83,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
        """This class is registered as a UDTF and computes the sum and dot product
        of columns for each partition of rows. The computations across all the partitions happens
        in parallel using the nodes in the warehouse. In order to avoid keeping the entire partition
-        in memory, we batch the rows
+        in memory, we batch the rows and maintain a running sum and dot prod in self._sum_by_count,
        self._sum_by_countd and self._dot_prod respectively. We return these at the end of the partition.
        """
 
@@ -95,7 +96,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
             # delta degree of freedom
             self._ddof = 0
             # Setting the batch size to 1000 based on experimentation. Can be fine tuned later.
-            self._batch_size =
+            self._batch_size = BATCH_SIZE
             # 2d array containing a batch of input rows. A batch contains self._batch_size rows.
             self._batched_rows = np.zeros((self._batch_size, 1))
             # 1d array of length = # of cols. Contains sum(col/count) for each column.
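For reference, here is a hedged, self-contained sketch of the running sum and dot-product accumulation the docstring above describes, written against a plain NumPy array with invented names rather than the package's UDTF machinery.

from typing import Tuple

import numpy as np

BATCH_SIZE = 1000  # same constant the release moves into metrics_utils


def batched_sum_and_dot(rows: np.ndarray, batch_size: int = BATCH_SIZE) -> Tuple[np.ndarray, np.ndarray]:
    # rows: shape (n_rows, n_cols). Accumulate per-column sums and the Gram matrix
    # batch by batch so the whole partition never has to sit in memory at once.
    n_cols = rows.shape[1]
    total_sum = np.zeros(n_cols)
    dot_prod = np.zeros((n_cols, n_cols))
    for start in range(0, len(rows), batch_size):
        batch = rows[start : start + batch_size]
        total_sum += batch.sum(axis=0)
        dot_prod += batch.T @ batch
    return total_sum, dot_prod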
@@ -224,7 +225,7 @@ def check_label_columns(
        TypeError: `y_true_col_names` and `y_pred_col_names` are of different types.
        ValueError: Multilabel `y_true_col_names` and `y_pred_col_names` are of different lengths.
    """
-    if type(y_true_col_names)
+    if type(y_true_col_names) is not type(y_pred_col_names):
        raise TypeError(
            "Label columns should be of the same type."
            f"Got y_true_col_names={type(y_true_col_names)} vs y_pred_col_names={type(y_pred_col_names)}."
@@ -300,6 +301,7 @@ def validate_average_pos_label(average: Optional[str] = None, pos_label: Union[s
            "average != 'binary' (got %r). You may use "
            "labels=[pos_label] to specify a single positive class." % (pos_label, average),
            UserWarning,
+            stacklevel=2,
        )
 
 
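The only change here is the added stacklevel=2, which makes the warning point at the caller of the validation helper rather than at the helper itself. A small, self-contained illustration with hypothetical function names:

import warnings


def _validate(pos_label: str) -> None:
    # stacklevel=2 attributes the warning to whoever called _validate.
    warnings.warn(f"pos_label={pos_label!r} ignored", UserWarning, stacklevel=2)


def user_code() -> None:
    _validate("spam")  # the reported file/line is this call site, not _validate's body


if __name__ == "__main__":
    user_code()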
@@ -122,7 +122,8 @@ def precision_recall_curve(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, (precision, recall, thresholds)) # type: ignore[no-any-return]
 
-
+    kwargs = telemetry.get_sproc_statement_params_kwargs(precision_recall_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session, **kwargs))
    res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
    return res
 
@@ -271,7 +272,8 @@ def roc_auc_score(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, auc) # type: ignore[no-any-return]
 
-
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_auc_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_auc_score_anon_sproc(session, **kwargs))
    auc: Union[float, npt.NDArray[np.float_]] = result_object
    return auc
 
@@ -372,7 +374,9 @@ def roc_curve(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, (fpr, tpr, thresholds)) # type: ignore[no-any-return]
 
-
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs))
+
    res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
 
    return res
@@ -108,7 +108,8 @@ def d2_absolute_error_score(
        result_module = cloudpickle.loads(pickled_snowflake_result)
        return result_module.serialize(session, score) # type: ignore[no-any-return]
 
-
+    kwargs = telemetry.get_sproc_statement_params_kwargs(d2_absolute_error_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, d2_absolute_error_score_anon_sproc(session, **kwargs))
    score: Union[float, npt.NDArray[np.float_]] = result_object
    return score
 
@@ -205,7 +206,8 @@ def d2_pinball_score(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, score) # type: ignore[no-any-return]
 
-
+    kwargs = telemetry.get_sproc_statement_params_kwargs(d2_pinball_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, d2_pinball_score_anon_sproc(session, **kwargs))
 
    score: Union[float, npt.NDArray[np.float_]] = result_object
    return score
@@ -319,7 +321,8 @@ def explained_variance_score(
        result_module = cloudpickle.loads(pickled_result_module)
        return result_module.serialize(session, score) # type: ignore[no-any-return]
 
-
+    kwargs = telemetry.get_sproc_statement_params_kwargs(explained_variance_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, explained_variance_score_anon_sproc(session, **kwargs))
    score: Union[float, npt.NDArray[np.float_]] = result_object
    return score
 
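The ranking and regression metrics all switch to the same pattern: build kwargs from the statement params and splat them into the anonymous sproc call. The sketch below is only a guess at what such a helper might do, with invented names; it is not the actual telemetry.get_sproc_statement_params_kwargs implementation.

import inspect
from typing import Any, Callable, Dict, Optional


def get_statement_params_kwargs(func: Callable[..., Any], statement_params: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    # Forward statement_params only when the callable's signature accepts them.
    if statement_params and "statement_params" in inspect.signature(func).parameters:
        return {"statement_params": statement_params}
    return {}


def fake_anon_sproc(session: object, statement_params: Optional[Dict[str, Any]] = None) -> str:
    return "serialized-result"


kwargs = get_statement_params_kwargs(fake_anon_sproc, {"QUERY_TAG": "metrics"})
result = fake_anon_sproc(object(), **kwargs)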
@@ -647,6 +647,22 @@ class BayesianGaussianMixture(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="transform",