snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
Files changed (174)
  1. snowflake/ml/_internal/file_utils.py +8 -35
  2. snowflake/ml/_internal/utils/identifier.py +74 -7
  3. snowflake/ml/model/_core_requirements.py +1 -1
  4. snowflake/ml/model/_deploy_client/warehouse/deploy.py +5 -26
  5. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +2 -2
  6. snowflake/ml/model/_handlers/_base.py +3 -1
  7. snowflake/ml/model/_handlers/sklearn.py +1 -0
  8. snowflake/ml/model/_handlers/xgboost.py +1 -1
  9. snowflake/ml/model/_model.py +24 -19
  10. snowflake/ml/model/_model_meta.py +24 -15
  11. snowflake/ml/model/type_hints.py +5 -11
  12. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +28 -17
  13. snowflake/ml/modeling/cluster/affinity_propagation.py +28 -17
  14. snowflake/ml/modeling/cluster/agglomerative_clustering.py +28 -17
  15. snowflake/ml/modeling/cluster/birch.py +28 -17
  16. snowflake/ml/modeling/cluster/bisecting_k_means.py +28 -17
  17. snowflake/ml/modeling/cluster/dbscan.py +28 -17
  18. snowflake/ml/modeling/cluster/feature_agglomeration.py +28 -17
  19. snowflake/ml/modeling/cluster/k_means.py +28 -17
  20. snowflake/ml/modeling/cluster/mean_shift.py +28 -17
  21. snowflake/ml/modeling/cluster/mini_batch_k_means.py +28 -17
  22. snowflake/ml/modeling/cluster/optics.py +28 -17
  23. snowflake/ml/modeling/cluster/spectral_biclustering.py +28 -17
  24. snowflake/ml/modeling/cluster/spectral_clustering.py +28 -17
  25. snowflake/ml/modeling/cluster/spectral_coclustering.py +28 -17
  26. snowflake/ml/modeling/compose/column_transformer.py +28 -17
  27. snowflake/ml/modeling/compose/transformed_target_regressor.py +28 -17
  28. snowflake/ml/modeling/covariance/elliptic_envelope.py +28 -17
  29. snowflake/ml/modeling/covariance/empirical_covariance.py +28 -17
  30. snowflake/ml/modeling/covariance/graphical_lasso.py +28 -17
  31. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +28 -17
  32. snowflake/ml/modeling/covariance/ledoit_wolf.py +28 -17
  33. snowflake/ml/modeling/covariance/min_cov_det.py +28 -17
  34. snowflake/ml/modeling/covariance/oas.py +28 -17
  35. snowflake/ml/modeling/covariance/shrunk_covariance.py +28 -17
  36. snowflake/ml/modeling/decomposition/dictionary_learning.py +28 -17
  37. snowflake/ml/modeling/decomposition/factor_analysis.py +28 -17
  38. snowflake/ml/modeling/decomposition/fast_ica.py +28 -17
  39. snowflake/ml/modeling/decomposition/incremental_pca.py +28 -17
  40. snowflake/ml/modeling/decomposition/kernel_pca.py +28 -17
  41. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +28 -17
  42. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +28 -17
  43. snowflake/ml/modeling/decomposition/pca.py +28 -17
  44. snowflake/ml/modeling/decomposition/sparse_pca.py +28 -17
  45. snowflake/ml/modeling/decomposition/truncated_svd.py +28 -17
  46. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +28 -17
  47. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +28 -17
  48. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +28 -17
  49. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +28 -17
  50. snowflake/ml/modeling/ensemble/bagging_classifier.py +28 -17
  51. snowflake/ml/modeling/ensemble/bagging_regressor.py +28 -17
  52. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +28 -17
  53. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +28 -17
  54. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +28 -17
  55. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +28 -17
  56. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +28 -17
  57. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +28 -17
  58. snowflake/ml/modeling/ensemble/isolation_forest.py +28 -17
  59. snowflake/ml/modeling/ensemble/random_forest_classifier.py +28 -17
  60. snowflake/ml/modeling/ensemble/random_forest_regressor.py +28 -17
  61. snowflake/ml/modeling/ensemble/stacking_regressor.py +28 -17
  62. snowflake/ml/modeling/ensemble/voting_classifier.py +28 -17
  63. snowflake/ml/modeling/ensemble/voting_regressor.py +28 -17
  64. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +28 -17
  65. snowflake/ml/modeling/feature_selection/select_fdr.py +28 -17
  66. snowflake/ml/modeling/feature_selection/select_fpr.py +28 -17
  67. snowflake/ml/modeling/feature_selection/select_fwe.py +28 -17
  68. snowflake/ml/modeling/feature_selection/select_k_best.py +28 -17
  69. snowflake/ml/modeling/feature_selection/select_percentile.py +28 -17
  70. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +28 -17
  71. snowflake/ml/modeling/feature_selection/variance_threshold.py +28 -17
  72. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +28 -17
  73. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +28 -17
  74. snowflake/ml/modeling/impute/iterative_imputer.py +28 -17
  75. snowflake/ml/modeling/impute/knn_imputer.py +28 -17
  76. snowflake/ml/modeling/impute/missing_indicator.py +28 -17
  77. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +28 -17
  78. snowflake/ml/modeling/kernel_approximation/nystroem.py +28 -17
  79. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +28 -17
  80. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +28 -17
  81. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +28 -17
  82. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +28 -17
  83. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +28 -17
  84. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +28 -17
  85. snowflake/ml/modeling/linear_model/ard_regression.py +28 -17
  86. snowflake/ml/modeling/linear_model/bayesian_ridge.py +28 -17
  87. snowflake/ml/modeling/linear_model/elastic_net.py +28 -17
  88. snowflake/ml/modeling/linear_model/elastic_net_cv.py +28 -17
  89. snowflake/ml/modeling/linear_model/gamma_regressor.py +28 -17
  90. snowflake/ml/modeling/linear_model/huber_regressor.py +28 -17
  91. snowflake/ml/modeling/linear_model/lars.py +28 -17
  92. snowflake/ml/modeling/linear_model/lars_cv.py +28 -17
  93. snowflake/ml/modeling/linear_model/lasso.py +28 -17
  94. snowflake/ml/modeling/linear_model/lasso_cv.py +28 -17
  95. snowflake/ml/modeling/linear_model/lasso_lars.py +28 -17
  96. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +28 -17
  97. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +28 -17
  98. snowflake/ml/modeling/linear_model/linear_regression.py +28 -17
  99. snowflake/ml/modeling/linear_model/logistic_regression.py +28 -17
  100. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +28 -17
  101. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +28 -17
  102. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +28 -17
  103. snowflake/ml/modeling/linear_model/multi_task_lasso.py +28 -17
  104. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +28 -17
  105. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +28 -17
  106. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +28 -17
  107. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +28 -17
  108. snowflake/ml/modeling/linear_model/perceptron.py +28 -17
  109. snowflake/ml/modeling/linear_model/poisson_regressor.py +28 -17
  110. snowflake/ml/modeling/linear_model/ransac_regressor.py +28 -17
  111. snowflake/ml/modeling/linear_model/ridge.py +28 -17
  112. snowflake/ml/modeling/linear_model/ridge_classifier.py +28 -17
  113. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +28 -17
  114. snowflake/ml/modeling/linear_model/ridge_cv.py +28 -17
  115. snowflake/ml/modeling/linear_model/sgd_classifier.py +28 -17
  116. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +28 -17
  117. snowflake/ml/modeling/linear_model/sgd_regressor.py +28 -17
  118. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +28 -17
  119. snowflake/ml/modeling/linear_model/tweedie_regressor.py +28 -17
  120. snowflake/ml/modeling/manifold/isomap.py +28 -17
  121. snowflake/ml/modeling/manifold/mds.py +28 -17
  122. snowflake/ml/modeling/manifold/spectral_embedding.py +28 -17
  123. snowflake/ml/modeling/manifold/tsne.py +28 -17
  124. snowflake/ml/modeling/metrics/classification.py +6 -1
  125. snowflake/ml/modeling/metrics/regression.py +517 -9
  126. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +28 -17
  127. snowflake/ml/modeling/mixture/gaussian_mixture.py +28 -17
  128. snowflake/ml/modeling/model_selection/grid_search_cv.py +28 -17
  129. snowflake/ml/modeling/model_selection/randomized_search_cv.py +28 -17
  130. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +28 -17
  131. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +28 -17
  132. snowflake/ml/modeling/multiclass/output_code_classifier.py +28 -17
  133. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +28 -17
  134. snowflake/ml/modeling/naive_bayes/categorical_nb.py +28 -17
  135. snowflake/ml/modeling/naive_bayes/complement_nb.py +28 -17
  136. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +28 -17
  137. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +28 -17
  138. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +28 -17
  139. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +28 -17
  140. snowflake/ml/modeling/neighbors/kernel_density.py +28 -17
  141. snowflake/ml/modeling/neighbors/local_outlier_factor.py +28 -17
  142. snowflake/ml/modeling/neighbors/nearest_centroid.py +28 -17
  143. snowflake/ml/modeling/neighbors/nearest_neighbors.py +28 -17
  144. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +28 -17
  145. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +28 -17
  146. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +28 -17
  147. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +28 -17
  148. snowflake/ml/modeling/neural_network/mlp_classifier.py +28 -17
  149. snowflake/ml/modeling/neural_network/mlp_regressor.py +28 -17
  150. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  151. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  152. snowflake/ml/modeling/preprocessing/polynomial_features.py +28 -17
  153. snowflake/ml/modeling/semi_supervised/label_propagation.py +28 -17
  154. snowflake/ml/modeling/semi_supervised/label_spreading.py +28 -17
  155. snowflake/ml/modeling/svm/linear_svc.py +28 -17
  156. snowflake/ml/modeling/svm/linear_svr.py +28 -17
  157. snowflake/ml/modeling/svm/nu_svc.py +28 -17
  158. snowflake/ml/modeling/svm/nu_svr.py +28 -17
  159. snowflake/ml/modeling/svm/svc.py +28 -17
  160. snowflake/ml/modeling/svm/svr.py +28 -17
  161. snowflake/ml/modeling/tree/decision_tree_classifier.py +28 -17
  162. snowflake/ml/modeling/tree/decision_tree_regressor.py +28 -17
  163. snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -17
  164. snowflake/ml/modeling/tree/extra_tree_regressor.py +28 -17
  165. snowflake/ml/modeling/xgboost/xgb_classifier.py +28 -17
  166. snowflake/ml/modeling/xgboost/xgb_regressor.py +28 -17
  167. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +28 -17
  168. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +28 -17
  169. snowflake/ml/registry/model_registry.py +49 -65
  170. snowflake/ml/version.py +1 -1
  171. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.2.dist-info}/METADATA +24 -1
  172. snowflake_ml_python-1.0.2.dist-info/RECORD +246 -0
  173. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  174. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.2.dist-info}/WHEEL +0 -0
@@ -798,26 +798,37 @@ class MLPRegressor(BaseTransformer):
798
798
  # input cols need to match unquoted / quoted
799
799
  input_cols = self.input_cols
800
800
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
801
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
801
802
 
802
803
  estimator = self._sklearn_object
803
804
 
804
- input_df = dataset[input_cols] # Select input columns with quoted column names.
805
- if hasattr(estimator, "feature_names_in_"):
806
- missing_features = []
807
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
808
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
809
- missing_features.append(f)
810
-
811
- if len(missing_features) > 0:
812
- raise ValueError(
813
- "The feature names should match with those that were passed during fit.\n"
814
- f"Features seen during fit call but not present in the input: {missing_features}\n"
815
- f"Features in the input dataframe : {input_cols}\n"
816
- )
817
- input_df.columns = getattr(estimator, "feature_names_in_")
818
- else:
819
- # Just rename the column names to unquoted identifiers.
820
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
805
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
806
+ missing_features = []
807
+ features_in_dataset = set(dataset.columns)
808
+ columns_to_select = []
809
+ for i, f in enumerate(features_required_by_estimator):
810
+ if (
811
+ i >= len(input_cols)
812
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
813
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
814
+ and quoted_input_cols[i] not in features_in_dataset)
815
+ ):
816
+ missing_features.append(f)
817
+ elif input_cols[i] in features_in_dataset:
818
+ columns_to_select.append(input_cols[i])
819
+ elif unquoted_input_cols[i] in features_in_dataset:
820
+ columns_to_select.append(unquoted_input_cols[i])
821
+ else:
822
+ columns_to_select.append(quoted_input_cols[i])
823
+
824
+ if len(missing_features) > 0:
825
+ raise ValueError(
826
+ "The feature names should match with those that were passed during fit.\n"
827
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
828
+ f"Features in the input dataframe : {input_cols}\n"
829
+ )
830
+ input_df = dataset[columns_to_select]
831
+ input_df.columns = features_required_by_estimator
821
832
 
822
833
  transformed_numpy_array = getattr(estimator, inference_method)(
823
834
  input_df
@@ -14,6 +14,7 @@ from sklearn.utils import metaestimators
14
14
 
15
15
  from snowflake import snowpark
16
16
  from snowflake.ml._internal import telemetry
17
+ from snowflake.ml.model.model_signature import ModelSignature, _infer_signature
17
18
  from snowflake.ml.modeling.framework import _utils, base
18
19
 
19
20
  _PROJECT = "ModelDevelopment"
@@ -103,6 +104,8 @@ class Pipeline(base.BaseTransformer):
103
104
  self._transformers_to_input_indices: Dict[str, List[int]] = {}
104
105
  self._is_convertable_to_sklearn = True
105
106
 
107
+ self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
108
+
106
109
  deps: Set[str] = {f"pandas=={pd.__version__}", f"scikit-learn=={skversion}"}
107
110
  for _, obj in steps:
108
111
  if isinstance(obj, base.BaseTransformer):
@@ -241,6 +244,7 @@ class Pipeline(base.BaseTransformer):
241
244
  step_name=estimator[0], all_cols=all_cols, input_cols=estimator[1].get_input_cols()
242
245
  )
243
246
 
247
+ self._get_model_signatures(dataset=dataset)
244
248
  self._is_fitted = True
245
249
  return self
246
250
 
@@ -309,6 +313,7 @@ class Pipeline(base.BaseTransformer):
309
313
  res = estimator[1].fit(transformed_dataset).transform(transformed_dataset)
310
314
  return res
311
315
 
316
+ self._get_model_signatures(dataset=dataset)
312
317
  self._is_fitted = True
313
318
  return transformed_dataset
314
319
 
@@ -346,6 +351,7 @@ class Pipeline(base.BaseTransformer):
346
351
  else:
347
352
  transformed_dataset = estimator[1].fit(transformed_dataset).predict(transformed_dataset)
348
353
 
354
+ self._get_model_signatures(dataset=dataset)
349
355
  self._is_fitted = True
350
356
  return transformed_dataset
351
357
 
@@ -559,3 +565,21 @@ class Pipeline(base.BaseTransformer):
559
565
 
560
566
  def _get_dependencies(self) -> List[str]:
561
567
  return self._deps
568
+
569
+ def _get_model_signatures(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> None:
570
+ self._model_signature_dict = dict()
571
+
572
+ input_columns = self._get_sanitized_list_of_columns(dataset.columns)
573
+ inputs_signature = _infer_signature(dataset[input_columns], "input")
574
+
575
+ estimator_step = self._get_estimator()
576
+ if estimator_step:
577
+ estimator_signatures = estimator_step[1].model_signatures
578
+ for method, signature in estimator_signatures.items():
579
+ self._model_signature_dict[method] = ModelSignature(inputs=inputs_signature, outputs=signature.outputs)
580
+
581
+ @property
582
+ def model_signatures(self) -> Dict[str, ModelSignature]:
583
+ if self._model_signature_dict is None:
584
+ raise RuntimeError("Estimator not fitted before accessing property model_signatures! ")
585
+ return self._model_signature_dict
@@ -800,7 +800,7 @@ class OneHotEncoder(base.BaseTransformer):
800
800
  state_df = dataset._session.create_dataframe(state_pandas)
801
801
 
802
802
  transformed_dataset = dataset
803
- origional_dataset_columns = transformed_dataset.columns[:]
803
+ original_dataset_columns = transformed_dataset.columns[:]
804
804
  all_output_cols = []
805
805
  for input_col in self.input_cols:
806
806
  output_cols = [
@@ -818,7 +818,7 @@ class OneHotEncoder(base.BaseTransformer):
818
818
 
819
819
  transformed_dataset = self._handle_unknown_in_transform(transformed_dataset)
820
820
  # Reorder columns. Passthrough columns are added at the right to the output of the transformers.
821
- transformed_dataset = transformed_dataset[all_output_cols + origional_dataset_columns]
821
+ transformed_dataset = transformed_dataset[all_output_cols + original_dataset_columns]
822
822
  return transformed_dataset
823
823
 
824
824
  def _transform_snowpark_sparse_udf(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame:
@@ -895,15 +895,14 @@ class OneHotEncoder(base.BaseTransformer):
895
895
  Output dataset.
896
896
  """
897
897
  encoder_sklearn = self.to_sklearn()
898
-
899
898
  transformed_dataset = encoder_sklearn.transform(dataset[self.input_cols])
900
899
 
901
- if not self.sparse:
902
- dataset = dataset.copy()
903
- dataset[self.get_output_cols()] = transformed_dataset
904
- return dataset
900
+ if self.sparse:
901
+ return transformed_dataset
905
902
 
906
- return transformed_dataset
903
+ dataset = dataset.copy()
904
+ dataset[self.get_output_cols()] = transformed_dataset
905
+ return dataset
907
906
 
908
907
  def _create_unfitted_sklearn_object(self) -> preprocessing.OneHotEncoder:
909
908
  sklearn_args = self.get_sklearn_args(
@@ -1331,17 +1330,17 @@ class OneHotEncoder(base.BaseTransformer):
1331
1330
  Output columns.
1332
1331
  """
1333
1332
  if self.sparse:
1334
- output_cols = self.output_cols
1335
- else:
1336
- output_cols = (
1337
- [
1338
- identifier.quote_name_without_upper_casing(col)
1339
- for input_col in self.input_cols
1340
- for col in self._dense_output_cols_mappings[input_col]
1341
- ]
1342
- if self._dense_output_cols_mappings
1343
- else []
1344
- )
1333
+ return self.output_cols
1334
+
1335
+ output_cols = (
1336
+ [
1337
+ identifier.get_inferred_name(col)
1338
+ for input_col in self.input_cols
1339
+ for col in self._dense_output_cols_mappings[input_col]
1340
+ ]
1341
+ if self._dense_output_cols_mappings
1342
+ else []
1343
+ )
1345
1344
  return output_cols
1346
1345
 
1347
1346
  def _get_dense_output_cols_mappings(self) -> None:
@@ -639,26 +639,37 @@ class PolynomialFeatures(BaseTransformer):
639
639
  # input cols need to match unquoted / quoted
640
640
  input_cols = self.input_cols
641
641
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
642
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
642
643
 
643
644
  estimator = self._sklearn_object
644
645
 
645
- input_df = dataset[input_cols] # Select input columns with quoted column names.
646
- if hasattr(estimator, "feature_names_in_"):
647
- missing_features = []
648
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
649
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
650
- missing_features.append(f)
651
-
652
- if len(missing_features) > 0:
653
- raise ValueError(
654
- "The feature names should match with those that were passed during fit.\n"
655
- f"Features seen during fit call but not present in the input: {missing_features}\n"
656
- f"Features in the input dataframe : {input_cols}\n"
657
- )
658
- input_df.columns = getattr(estimator, "feature_names_in_")
659
- else:
660
- # Just rename the column names to unquoted identifiers.
661
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
646
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
647
+ missing_features = []
648
+ features_in_dataset = set(dataset.columns)
649
+ columns_to_select = []
650
+ for i, f in enumerate(features_required_by_estimator):
651
+ if (
652
+ i >= len(input_cols)
653
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
654
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
655
+ and quoted_input_cols[i] not in features_in_dataset)
656
+ ):
657
+ missing_features.append(f)
658
+ elif input_cols[i] in features_in_dataset:
659
+ columns_to_select.append(input_cols[i])
660
+ elif unquoted_input_cols[i] in features_in_dataset:
661
+ columns_to_select.append(unquoted_input_cols[i])
662
+ else:
663
+ columns_to_select.append(quoted_input_cols[i])
664
+
665
+ if len(missing_features) > 0:
666
+ raise ValueError(
667
+ "The feature names should match with those that were passed during fit.\n"
668
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
669
+ f"Features in the input dataframe : {input_cols}\n"
670
+ )
671
+ input_df = dataset[columns_to_select]
672
+ input_df.columns = features_required_by_estimator
662
673
 
663
674
  transformed_numpy_array = getattr(estimator, inference_method)(
664
675
  input_df
@@ -643,26 +643,37 @@ class LabelPropagation(BaseTransformer):
643
643
  # input cols need to match unquoted / quoted
644
644
  input_cols = self.input_cols
645
645
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
646
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
646
647
 
647
648
  estimator = self._sklearn_object
648
649
 
649
- input_df = dataset[input_cols] # Select input columns with quoted column names.
650
- if hasattr(estimator, "feature_names_in_"):
651
- missing_features = []
652
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
653
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
654
- missing_features.append(f)
655
-
656
- if len(missing_features) > 0:
657
- raise ValueError(
658
- "The feature names should match with those that were passed during fit.\n"
659
- f"Features seen during fit call but not present in the input: {missing_features}\n"
660
- f"Features in the input dataframe : {input_cols}\n"
661
- )
662
- input_df.columns = getattr(estimator, "feature_names_in_")
663
- else:
664
- # Just rename the column names to unquoted identifiers.
665
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
650
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
651
+ missing_features = []
652
+ features_in_dataset = set(dataset.columns)
653
+ columns_to_select = []
654
+ for i, f in enumerate(features_required_by_estimator):
655
+ if (
656
+ i >= len(input_cols)
657
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
658
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
659
+ and quoted_input_cols[i] not in features_in_dataset)
660
+ ):
661
+ missing_features.append(f)
662
+ elif input_cols[i] in features_in_dataset:
663
+ columns_to_select.append(input_cols[i])
664
+ elif unquoted_input_cols[i] in features_in_dataset:
665
+ columns_to_select.append(unquoted_input_cols[i])
666
+ else:
667
+ columns_to_select.append(quoted_input_cols[i])
668
+
669
+ if len(missing_features) > 0:
670
+ raise ValueError(
671
+ "The feature names should match with those that were passed during fit.\n"
672
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
673
+ f"Features in the input dataframe : {input_cols}\n"
674
+ )
675
+ input_df = dataset[columns_to_select]
676
+ input_df.columns = features_required_by_estimator
666
677
 
667
678
  transformed_numpy_array = getattr(estimator, inference_method)(
668
679
  input_df
@@ -652,26 +652,37 @@ class LabelSpreading(BaseTransformer):
652
652
  # input cols need to match unquoted / quoted
653
653
  input_cols = self.input_cols
654
654
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
655
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
655
656
 
656
657
  estimator = self._sklearn_object
657
658
 
658
- input_df = dataset[input_cols] # Select input columns with quoted column names.
659
- if hasattr(estimator, "feature_names_in_"):
660
- missing_features = []
661
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
662
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
663
- missing_features.append(f)
664
-
665
- if len(missing_features) > 0:
666
- raise ValueError(
667
- "The feature names should match with those that were passed during fit.\n"
668
- f"Features seen during fit call but not present in the input: {missing_features}\n"
669
- f"Features in the input dataframe : {input_cols}\n"
670
- )
671
- input_df.columns = getattr(estimator, "feature_names_in_")
672
- else:
673
- # Just rename the column names to unquoted identifiers.
674
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
659
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
660
+ missing_features = []
661
+ features_in_dataset = set(dataset.columns)
662
+ columns_to_select = []
663
+ for i, f in enumerate(features_required_by_estimator):
664
+ if (
665
+ i >= len(input_cols)
666
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
667
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
668
+ and quoted_input_cols[i] not in features_in_dataset)
669
+ ):
670
+ missing_features.append(f)
671
+ elif input_cols[i] in features_in_dataset:
672
+ columns_to_select.append(input_cols[i])
673
+ elif unquoted_input_cols[i] in features_in_dataset:
674
+ columns_to_select.append(unquoted_input_cols[i])
675
+ else:
676
+ columns_to_select.append(quoted_input_cols[i])
677
+
678
+ if len(missing_features) > 0:
679
+ raise ValueError(
680
+ "The feature names should match with those that were passed during fit.\n"
681
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
682
+ f"Features in the input dataframe : {input_cols}\n"
683
+ )
684
+ input_df = dataset[columns_to_select]
685
+ input_df.columns = features_required_by_estimator
675
686
 
676
687
  transformed_numpy_array = getattr(estimator, inference_method)(
677
688
  input_df
@@ -703,26 +703,37 @@ class LinearSVC(BaseTransformer):
703
703
  # input cols need to match unquoted / quoted
704
704
  input_cols = self.input_cols
705
705
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
706
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
706
707
 
707
708
  estimator = self._sklearn_object
708
709
 
709
- input_df = dataset[input_cols] # Select input columns with quoted column names.
710
- if hasattr(estimator, "feature_names_in_"):
711
- missing_features = []
712
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
713
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
714
- missing_features.append(f)
715
-
716
- if len(missing_features) > 0:
717
- raise ValueError(
718
- "The feature names should match with those that were passed during fit.\n"
719
- f"Features seen during fit call but not present in the input: {missing_features}\n"
720
- f"Features in the input dataframe : {input_cols}\n"
721
- )
722
- input_df.columns = getattr(estimator, "feature_names_in_")
723
- else:
724
- # Just rename the column names to unquoted identifiers.
725
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
710
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
711
+ missing_features = []
712
+ features_in_dataset = set(dataset.columns)
713
+ columns_to_select = []
714
+ for i, f in enumerate(features_required_by_estimator):
715
+ if (
716
+ i >= len(input_cols)
717
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
718
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
719
+ and quoted_input_cols[i] not in features_in_dataset)
720
+ ):
721
+ missing_features.append(f)
722
+ elif input_cols[i] in features_in_dataset:
723
+ columns_to_select.append(input_cols[i])
724
+ elif unquoted_input_cols[i] in features_in_dataset:
725
+ columns_to_select.append(unquoted_input_cols[i])
726
+ else:
727
+ columns_to_select.append(quoted_input_cols[i])
728
+
729
+ if len(missing_features) > 0:
730
+ raise ValueError(
731
+ "The feature names should match with those that were passed during fit.\n"
732
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
733
+ f"Features in the input dataframe : {input_cols}\n"
734
+ )
735
+ input_df = dataset[columns_to_select]
736
+ input_df.columns = features_required_by_estimator
726
737
 
727
738
  transformed_numpy_array = getattr(estimator, inference_method)(
728
739
  input_df
@@ -676,26 +676,37 @@ class LinearSVR(BaseTransformer):
676
676
  # input cols need to match unquoted / quoted
677
677
  input_cols = self.input_cols
678
678
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
679
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
679
680
 
680
681
  estimator = self._sklearn_object
681
682
 
682
- input_df = dataset[input_cols] # Select input columns with quoted column names.
683
- if hasattr(estimator, "feature_names_in_"):
684
- missing_features = []
685
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
686
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
687
- missing_features.append(f)
688
-
689
- if len(missing_features) > 0:
690
- raise ValueError(
691
- "The feature names should match with those that were passed during fit.\n"
692
- f"Features seen during fit call but not present in the input: {missing_features}\n"
693
- f"Features in the input dataframe : {input_cols}\n"
694
- )
695
- input_df.columns = getattr(estimator, "feature_names_in_")
696
- else:
697
- # Just rename the column names to unquoted identifiers.
698
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
683
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
684
+ missing_features = []
685
+ features_in_dataset = set(dataset.columns)
686
+ columns_to_select = []
687
+ for i, f in enumerate(features_required_by_estimator):
688
+ if (
689
+ i >= len(input_cols)
690
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
691
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
692
+ and quoted_input_cols[i] not in features_in_dataset)
693
+ ):
694
+ missing_features.append(f)
695
+ elif input_cols[i] in features_in_dataset:
696
+ columns_to_select.append(input_cols[i])
697
+ elif unquoted_input_cols[i] in features_in_dataset:
698
+ columns_to_select.append(unquoted_input_cols[i])
699
+ else:
700
+ columns_to_select.append(quoted_input_cols[i])
701
+
702
+ if len(missing_features) > 0:
703
+ raise ValueError(
704
+ "The feature names should match with those that were passed during fit.\n"
705
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
706
+ f"Features in the input dataframe : {input_cols}\n"
707
+ )
708
+ input_df = dataset[columns_to_select]
709
+ input_df.columns = features_required_by_estimator
699
710
 
700
711
  transformed_numpy_array = getattr(estimator, inference_method)(
701
712
  input_df
@@ -714,26 +714,37 @@ class NuSVC(BaseTransformer):
714
714
  # input cols need to match unquoted / quoted
715
715
  input_cols = self.input_cols
716
716
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
717
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
717
718
 
718
719
  estimator = self._sklearn_object
719
720
 
720
- input_df = dataset[input_cols] # Select input columns with quoted column names.
721
- if hasattr(estimator, "feature_names_in_"):
722
- missing_features = []
723
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
724
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
725
- missing_features.append(f)
726
-
727
- if len(missing_features) > 0:
728
- raise ValueError(
729
- "The feature names should match with those that were passed during fit.\n"
730
- f"Features seen during fit call but not present in the input: {missing_features}\n"
731
- f"Features in the input dataframe : {input_cols}\n"
732
- )
733
- input_df.columns = getattr(estimator, "feature_names_in_")
734
- else:
735
- # Just rename the column names to unquoted identifiers.
736
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
721
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
722
+ missing_features = []
723
+ features_in_dataset = set(dataset.columns)
724
+ columns_to_select = []
725
+ for i, f in enumerate(features_required_by_estimator):
726
+ if (
727
+ i >= len(input_cols)
728
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
729
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
730
+ and quoted_input_cols[i] not in features_in_dataset)
731
+ ):
732
+ missing_features.append(f)
733
+ elif input_cols[i] in features_in_dataset:
734
+ columns_to_select.append(input_cols[i])
735
+ elif unquoted_input_cols[i] in features_in_dataset:
736
+ columns_to_select.append(unquoted_input_cols[i])
737
+ else:
738
+ columns_to_select.append(quoted_input_cols[i])
739
+
740
+ if len(missing_features) > 0:
741
+ raise ValueError(
742
+ "The feature names should match with those that were passed during fit.\n"
743
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
744
+ f"Features in the input dataframe : {input_cols}\n"
745
+ )
746
+ input_df = dataset[columns_to_select]
747
+ input_df.columns = features_required_by_estimator
737
748
 
738
749
  transformed_numpy_array = getattr(estimator, inference_method)(
739
750
  input_df
@@ -675,26 +675,37 @@ class NuSVR(BaseTransformer):
675
675
  # input cols need to match unquoted / quoted
676
676
  input_cols = self.input_cols
677
677
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
678
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
678
679
 
679
680
  estimator = self._sklearn_object
680
681
 
681
- input_df = dataset[input_cols] # Select input columns with quoted column names.
682
- if hasattr(estimator, "feature_names_in_"):
683
- missing_features = []
684
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
685
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
686
- missing_features.append(f)
687
-
688
- if len(missing_features) > 0:
689
- raise ValueError(
690
- "The feature names should match with those that were passed during fit.\n"
691
- f"Features seen during fit call but not present in the input: {missing_features}\n"
692
- f"Features in the input dataframe : {input_cols}\n"
693
- )
694
- input_df.columns = getattr(estimator, "feature_names_in_")
695
- else:
696
- # Just rename the column names to unquoted identifiers.
697
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
682
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
683
+ missing_features = []
684
+ features_in_dataset = set(dataset.columns)
685
+ columns_to_select = []
686
+ for i, f in enumerate(features_required_by_estimator):
687
+ if (
688
+ i >= len(input_cols)
689
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
690
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
691
+ and quoted_input_cols[i] not in features_in_dataset)
692
+ ):
693
+ missing_features.append(f)
694
+ elif input_cols[i] in features_in_dataset:
695
+ columns_to_select.append(input_cols[i])
696
+ elif unquoted_input_cols[i] in features_in_dataset:
697
+ columns_to_select.append(unquoted_input_cols[i])
698
+ else:
699
+ columns_to_select.append(quoted_input_cols[i])
700
+
701
+ if len(missing_features) > 0:
702
+ raise ValueError(
703
+ "The feature names should match with those that were passed during fit.\n"
704
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
705
+ f"Features in the input dataframe : {input_cols}\n"
706
+ )
707
+ input_df = dataset[columns_to_select]
708
+ input_df.columns = features_required_by_estimator
698
709
 
699
710
  transformed_numpy_array = getattr(estimator, inference_method)(
700
711
  input_df