snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those package versions as they appear in the registry.
Files changed (174)
  1. snowflake/ml/_internal/file_utils.py +8 -35
  2. snowflake/ml/_internal/utils/identifier.py +74 -7
  3. snowflake/ml/model/_core_requirements.py +1 -1
  4. snowflake/ml/model/_deploy_client/warehouse/deploy.py +5 -26
  5. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +2 -2
  6. snowflake/ml/model/_handlers/_base.py +3 -1
  7. snowflake/ml/model/_handlers/sklearn.py +1 -0
  8. snowflake/ml/model/_handlers/xgboost.py +1 -1
  9. snowflake/ml/model/_model.py +24 -19
  10. snowflake/ml/model/_model_meta.py +24 -15
  11. snowflake/ml/model/type_hints.py +5 -11
  12. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +28 -17
  13. snowflake/ml/modeling/cluster/affinity_propagation.py +28 -17
  14. snowflake/ml/modeling/cluster/agglomerative_clustering.py +28 -17
  15. snowflake/ml/modeling/cluster/birch.py +28 -17
  16. snowflake/ml/modeling/cluster/bisecting_k_means.py +28 -17
  17. snowflake/ml/modeling/cluster/dbscan.py +28 -17
  18. snowflake/ml/modeling/cluster/feature_agglomeration.py +28 -17
  19. snowflake/ml/modeling/cluster/k_means.py +28 -17
  20. snowflake/ml/modeling/cluster/mean_shift.py +28 -17
  21. snowflake/ml/modeling/cluster/mini_batch_k_means.py +28 -17
  22. snowflake/ml/modeling/cluster/optics.py +28 -17
  23. snowflake/ml/modeling/cluster/spectral_biclustering.py +28 -17
  24. snowflake/ml/modeling/cluster/spectral_clustering.py +28 -17
  25. snowflake/ml/modeling/cluster/spectral_coclustering.py +28 -17
  26. snowflake/ml/modeling/compose/column_transformer.py +28 -17
  27. snowflake/ml/modeling/compose/transformed_target_regressor.py +28 -17
  28. snowflake/ml/modeling/covariance/elliptic_envelope.py +28 -17
  29. snowflake/ml/modeling/covariance/empirical_covariance.py +28 -17
  30. snowflake/ml/modeling/covariance/graphical_lasso.py +28 -17
  31. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +28 -17
  32. snowflake/ml/modeling/covariance/ledoit_wolf.py +28 -17
  33. snowflake/ml/modeling/covariance/min_cov_det.py +28 -17
  34. snowflake/ml/modeling/covariance/oas.py +28 -17
  35. snowflake/ml/modeling/covariance/shrunk_covariance.py +28 -17
  36. snowflake/ml/modeling/decomposition/dictionary_learning.py +28 -17
  37. snowflake/ml/modeling/decomposition/factor_analysis.py +28 -17
  38. snowflake/ml/modeling/decomposition/fast_ica.py +28 -17
  39. snowflake/ml/modeling/decomposition/incremental_pca.py +28 -17
  40. snowflake/ml/modeling/decomposition/kernel_pca.py +28 -17
  41. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +28 -17
  42. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +28 -17
  43. snowflake/ml/modeling/decomposition/pca.py +28 -17
  44. snowflake/ml/modeling/decomposition/sparse_pca.py +28 -17
  45. snowflake/ml/modeling/decomposition/truncated_svd.py +28 -17
  46. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +28 -17
  47. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +28 -17
  48. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +28 -17
  49. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +28 -17
  50. snowflake/ml/modeling/ensemble/bagging_classifier.py +28 -17
  51. snowflake/ml/modeling/ensemble/bagging_regressor.py +28 -17
  52. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +28 -17
  53. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +28 -17
  54. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +28 -17
  55. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +28 -17
  56. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +28 -17
  57. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +28 -17
  58. snowflake/ml/modeling/ensemble/isolation_forest.py +28 -17
  59. snowflake/ml/modeling/ensemble/random_forest_classifier.py +28 -17
  60. snowflake/ml/modeling/ensemble/random_forest_regressor.py +28 -17
  61. snowflake/ml/modeling/ensemble/stacking_regressor.py +28 -17
  62. snowflake/ml/modeling/ensemble/voting_classifier.py +28 -17
  63. snowflake/ml/modeling/ensemble/voting_regressor.py +28 -17
  64. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +28 -17
  65. snowflake/ml/modeling/feature_selection/select_fdr.py +28 -17
  66. snowflake/ml/modeling/feature_selection/select_fpr.py +28 -17
  67. snowflake/ml/modeling/feature_selection/select_fwe.py +28 -17
  68. snowflake/ml/modeling/feature_selection/select_k_best.py +28 -17
  69. snowflake/ml/modeling/feature_selection/select_percentile.py +28 -17
  70. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +28 -17
  71. snowflake/ml/modeling/feature_selection/variance_threshold.py +28 -17
  72. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +28 -17
  73. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +28 -17
  74. snowflake/ml/modeling/impute/iterative_imputer.py +28 -17
  75. snowflake/ml/modeling/impute/knn_imputer.py +28 -17
  76. snowflake/ml/modeling/impute/missing_indicator.py +28 -17
  77. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +28 -17
  78. snowflake/ml/modeling/kernel_approximation/nystroem.py +28 -17
  79. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +28 -17
  80. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +28 -17
  81. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +28 -17
  82. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +28 -17
  83. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +28 -17
  84. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +28 -17
  85. snowflake/ml/modeling/linear_model/ard_regression.py +28 -17
  86. snowflake/ml/modeling/linear_model/bayesian_ridge.py +28 -17
  87. snowflake/ml/modeling/linear_model/elastic_net.py +28 -17
  88. snowflake/ml/modeling/linear_model/elastic_net_cv.py +28 -17
  89. snowflake/ml/modeling/linear_model/gamma_regressor.py +28 -17
  90. snowflake/ml/modeling/linear_model/huber_regressor.py +28 -17
  91. snowflake/ml/modeling/linear_model/lars.py +28 -17
  92. snowflake/ml/modeling/linear_model/lars_cv.py +28 -17
  93. snowflake/ml/modeling/linear_model/lasso.py +28 -17
  94. snowflake/ml/modeling/linear_model/lasso_cv.py +28 -17
  95. snowflake/ml/modeling/linear_model/lasso_lars.py +28 -17
  96. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +28 -17
  97. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +28 -17
  98. snowflake/ml/modeling/linear_model/linear_regression.py +28 -17
  99. snowflake/ml/modeling/linear_model/logistic_regression.py +28 -17
  100. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +28 -17
  101. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +28 -17
  102. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +28 -17
  103. snowflake/ml/modeling/linear_model/multi_task_lasso.py +28 -17
  104. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +28 -17
  105. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +28 -17
  106. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +28 -17
  107. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +28 -17
  108. snowflake/ml/modeling/linear_model/perceptron.py +28 -17
  109. snowflake/ml/modeling/linear_model/poisson_regressor.py +28 -17
  110. snowflake/ml/modeling/linear_model/ransac_regressor.py +28 -17
  111. snowflake/ml/modeling/linear_model/ridge.py +28 -17
  112. snowflake/ml/modeling/linear_model/ridge_classifier.py +28 -17
  113. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +28 -17
  114. snowflake/ml/modeling/linear_model/ridge_cv.py +28 -17
  115. snowflake/ml/modeling/linear_model/sgd_classifier.py +28 -17
  116. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +28 -17
  117. snowflake/ml/modeling/linear_model/sgd_regressor.py +28 -17
  118. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +28 -17
  119. snowflake/ml/modeling/linear_model/tweedie_regressor.py +28 -17
  120. snowflake/ml/modeling/manifold/isomap.py +28 -17
  121. snowflake/ml/modeling/manifold/mds.py +28 -17
  122. snowflake/ml/modeling/manifold/spectral_embedding.py +28 -17
  123. snowflake/ml/modeling/manifold/tsne.py +28 -17
  124. snowflake/ml/modeling/metrics/classification.py +6 -1
  125. snowflake/ml/modeling/metrics/regression.py +517 -9
  126. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +28 -17
  127. snowflake/ml/modeling/mixture/gaussian_mixture.py +28 -17
  128. snowflake/ml/modeling/model_selection/grid_search_cv.py +28 -17
  129. snowflake/ml/modeling/model_selection/randomized_search_cv.py +28 -17
  130. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +28 -17
  131. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +28 -17
  132. snowflake/ml/modeling/multiclass/output_code_classifier.py +28 -17
  133. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +28 -17
  134. snowflake/ml/modeling/naive_bayes/categorical_nb.py +28 -17
  135. snowflake/ml/modeling/naive_bayes/complement_nb.py +28 -17
  136. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +28 -17
  137. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +28 -17
  138. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +28 -17
  139. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +28 -17
  140. snowflake/ml/modeling/neighbors/kernel_density.py +28 -17
  141. snowflake/ml/modeling/neighbors/local_outlier_factor.py +28 -17
  142. snowflake/ml/modeling/neighbors/nearest_centroid.py +28 -17
  143. snowflake/ml/modeling/neighbors/nearest_neighbors.py +28 -17
  144. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +28 -17
  145. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +28 -17
  146. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +28 -17
  147. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +28 -17
  148. snowflake/ml/modeling/neural_network/mlp_classifier.py +28 -17
  149. snowflake/ml/modeling/neural_network/mlp_regressor.py +28 -17
  150. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  151. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  152. snowflake/ml/modeling/preprocessing/polynomial_features.py +28 -17
  153. snowflake/ml/modeling/semi_supervised/label_propagation.py +28 -17
  154. snowflake/ml/modeling/semi_supervised/label_spreading.py +28 -17
  155. snowflake/ml/modeling/svm/linear_svc.py +28 -17
  156. snowflake/ml/modeling/svm/linear_svr.py +28 -17
  157. snowflake/ml/modeling/svm/nu_svc.py +28 -17
  158. snowflake/ml/modeling/svm/nu_svr.py +28 -17
  159. snowflake/ml/modeling/svm/svc.py +28 -17
  160. snowflake/ml/modeling/svm/svr.py +28 -17
  161. snowflake/ml/modeling/tree/decision_tree_classifier.py +28 -17
  162. snowflake/ml/modeling/tree/decision_tree_regressor.py +28 -17
  163. snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -17
  164. snowflake/ml/modeling/tree/extra_tree_regressor.py +28 -17
  165. snowflake/ml/modeling/xgboost/xgb_classifier.py +28 -17
  166. snowflake/ml/modeling/xgboost/xgb_regressor.py +28 -17
  167. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +28 -17
  168. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +28 -17
  169. snowflake/ml/registry/model_registry.py +49 -65
  170. snowflake/ml/version.py +1 -1
  171. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.2.dist-info}/METADATA +24 -1
  172. snowflake_ml_python-1.0.2.dist-info/RECORD +246 -0
  173. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  174. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.2.dist-info}/WHEEL +0 -0
@@ -659,26 +659,37 @@ class AffinityPropagation(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
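
Every hunk in this release applies the same change to a different generated estimator wrapper: alongside the unquoted input column names, the wrapper now also computes the quoted forms and, for each feature the fitted estimator expects, selects whichever spelling (original, unquoted, or quoted) actually exists in the dataset before running inference, raising only when no spelling is present. The sketch below mirrors that selection logic outside the package; strip_quotes and add_quotes are simplified stand-ins (assumptions) for identifier.get_unescaped_names and identifier.get_escaped_names, whose real implementations in snowflake/ml/_internal/utils/identifier.py handle Snowflake identifier rules more thoroughly.

import pandas as pd

def strip_quotes(name: str) -> str:
    # Simplified stand-in (assumption) for identifier.get_unescaped_names:
    # drop a surrounding pair of double quotes if present.
    return name[1:-1] if name.startswith('"') and name.endswith('"') else name

def add_quotes(name: str) -> str:
    # Simplified stand-in (assumption) for identifier.get_escaped_names:
    # wrap the unquoted name in double quotes.
    return f'"{name}"'

def select_input_columns(dataset: pd.DataFrame, input_cols: list, required_features: list) -> pd.DataFrame:
    # For each feature the estimator expects, pick whichever spelling of the
    # corresponding input column (original, unquoted, or quoted) exists in the dataset.
    unquoted = [strip_quotes(c) for c in input_cols]
    quoted = [add_quotes(u) for u in unquoted]
    in_dataset = set(dataset.columns)
    missing, to_select = [], []
    for i, feature in enumerate(required_features):
        candidates = (input_cols[i], unquoted[i], quoted[i]) if i < len(input_cols) else ()
        if feature not in candidates or not any(c in in_dataset for c in candidates):
            missing.append(feature)
            continue
        to_select.append(next(c for c in candidates if c in in_dataset))
    if missing:
        raise ValueError(f"Features seen during fit but not present in the input: {missing}")
    out = dataset[to_select]
    out.columns = required_features  # rename to the names the estimator was fitted with
    return out

# Example: the estimator was fitted on unquoted names while the dataset carries quoted ones.
df = pd.DataFrame({'"SEPAL_LENGTH"': [5.1], '"SEPAL_WIDTH"': [3.5]})
print(select_input_columns(df, ['"SEPAL_LENGTH"', '"SEPAL_WIDTH"'], ["SEPAL_LENGTH", "SEPAL_WIDTH"]))

Preferring the original spelling first, then the unquoted and quoted forms, keeps behaviour unchanged for datasets whose columns already match the names seen during fit.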
@@ -692,26 +692,37 @@ class AgglomerativeClustering(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -650,26 +650,37 @@ class Birch(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -699,26 +699,37 @@ class BisectingKMeans(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -667,26 +667,37 @@ class DBSCAN(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -699,26 +699,37 @@ class FeatureAgglomeration(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -694,26 +694,37 @@ class KMeans(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -670,26 +670,37 @@ class MeanShift(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -720,26 +720,37 @@ class MiniBatchKMeans(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -740,26 +740,37 @@ class OPTICS(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df
@@ -678,26 +678,37 @@ class SpectralBiclustering(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
-     missing_features = []
-     for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-         if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-             missing_features.append(f)
-
-     if len(missing_features) > 0:
-         raise ValueError(
-             "The feature names should match with those that were passed during fit.\n"
-             f"Features seen during fit call but not present in the input: {missing_features}\n"
-             f"Features in the input dataframe : {input_cols}\n"
-         )
-     input_df.columns = getattr(estimator, "feature_names_in_")
- else:
-     # Just rename the column names to unquoted identifiers.
-     input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+     if (
+         i >= len(input_cols)
+         or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+         or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+             and quoted_input_cols[i] not in features_in_dataset)
+     ):
+         missing_features.append(f)
+     elif input_cols[i] in features_in_dataset:
+         columns_to_select.append(input_cols[i])
+     elif unquoted_input_cols[i] in features_in_dataset:
+         columns_to_select.append(unquoted_input_cols[i])
+     else:
+         columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+     raise ValueError(
+         "The feature names should match with those that were passed during fit.\n"
+         f"Features seen during fit call but not present in the input: {missing_features}\n"
+         f"Features in the input dataframe : {input_cols}\n"
+     )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
      input_df