snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. snowflake/cortex/_complete.py +26 -5
  2. snowflake/cortex/_sentiment.py +7 -4
  3. snowflake/cortex/_sse_client.py +81 -0
  4. snowflake/cortex/_util.py +105 -8
  5. snowflake/ml/_internal/lineage/lineage_utils.py +34 -25
  6. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  7. snowflake/ml/dataset/dataset.py +15 -12
  8. snowflake/ml/dataset/dataset_factory.py +3 -4
  9. snowflake/ml/feature_store/access_manager.py +34 -30
  10. snowflake/ml/feature_store/feature_store.py +3 -3
  11. snowflake/ml/feature_store/feature_view.py +12 -11
  12. snowflake/ml/fileset/snowfs.py +2 -31
  13. snowflake/ml/model/_client/ops/model_ops.py +43 -0
  14. snowflake/ml/model/_client/sql/model_version.py +55 -3
  15. snowflake/ml/model/_model_composer/model_composer.py +7 -3
  16. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -1
  17. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  18. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  19. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
  20. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  21. snowflake/ml/model/_signatures/builtins_handler.py +2 -1
  22. snowflake/ml/model/_signatures/core.py +13 -1
  23. snowflake/ml/model/_signatures/pandas_handler.py +2 -0
  24. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  25. snowflake/ml/model/model_signature.py +2 -0
  26. snowflake/ml/model/type_hints.py +1 -0
  27. snowflake/ml/modeling/_internal/estimator_utils.py +58 -1
  28. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +196 -242
  29. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +161 -0
  30. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +38 -18
  31. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +82 -134
  32. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +21 -17
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +9 -2
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +9 -2
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +9 -2
  36. snowflake/ml/modeling/cluster/birch.py +9 -2
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +9 -2
  38. snowflake/ml/modeling/cluster/dbscan.py +9 -2
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +9 -2
  40. snowflake/ml/modeling/cluster/k_means.py +9 -2
  41. snowflake/ml/modeling/cluster/mean_shift.py +9 -2
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +9 -2
  43. snowflake/ml/modeling/cluster/optics.py +9 -2
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +9 -2
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +9 -2
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +9 -2
  47. snowflake/ml/modeling/compose/column_transformer.py +9 -2
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +9 -2
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +9 -2
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +9 -2
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +9 -2
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +9 -2
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +9 -2
  54. snowflake/ml/modeling/covariance/min_cov_det.py +9 -2
  55. snowflake/ml/modeling/covariance/oas.py +9 -2
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +9 -2
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +9 -2
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +9 -2
  59. snowflake/ml/modeling/decomposition/fast_ica.py +9 -2
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +9 -2
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +9 -2
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +9 -2
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +9 -2
  64. snowflake/ml/modeling/decomposition/pca.py +9 -2
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +9 -2
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +9 -2
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +9 -2
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +9 -2
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +9 -2
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +9 -2
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +9 -2
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +9 -2
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +9 -2
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +9 -2
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +9 -2
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +9 -2
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +9 -2
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +9 -2
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +9 -2
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +9 -2
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +9 -2
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +9 -2
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +9 -2
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +9 -2
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +9 -2
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +9 -2
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +9 -2
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +9 -2
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +9 -2
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +9 -2
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +9 -2
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +9 -2
  93. snowflake/ml/modeling/framework/base.py +3 -8
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +9 -2
  95. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +9 -2
  96. snowflake/ml/modeling/impute/iterative_imputer.py +9 -2
  97. snowflake/ml/modeling/impute/knn_imputer.py +9 -2
  98. snowflake/ml/modeling/impute/missing_indicator.py +9 -2
  99. snowflake/ml/modeling/impute/simple_imputer.py +28 -5
  100. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +9 -2
  101. snowflake/ml/modeling/kernel_approximation/nystroem.py +9 -2
  102. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +9 -2
  103. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +9 -2
  104. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +9 -2
  105. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +9 -2
  106. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +9 -2
  107. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +9 -2
  108. snowflake/ml/modeling/linear_model/ard_regression.py +9 -2
  109. snowflake/ml/modeling/linear_model/bayesian_ridge.py +9 -2
  110. snowflake/ml/modeling/linear_model/elastic_net.py +9 -2
  111. snowflake/ml/modeling/linear_model/elastic_net_cv.py +9 -2
  112. snowflake/ml/modeling/linear_model/gamma_regressor.py +9 -2
  113. snowflake/ml/modeling/linear_model/huber_regressor.py +9 -2
  114. snowflake/ml/modeling/linear_model/lars.py +9 -2
  115. snowflake/ml/modeling/linear_model/lars_cv.py +9 -2
  116. snowflake/ml/modeling/linear_model/lasso.py +9 -2
  117. snowflake/ml/modeling/linear_model/lasso_cv.py +9 -2
  118. snowflake/ml/modeling/linear_model/lasso_lars.py +9 -2
  119. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +9 -2
  120. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +9 -2
  121. snowflake/ml/modeling/linear_model/linear_regression.py +9 -2
  122. snowflake/ml/modeling/linear_model/logistic_regression.py +9 -2
  123. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +9 -2
  124. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +9 -2
  125. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +9 -2
  126. snowflake/ml/modeling/linear_model/multi_task_lasso.py +9 -2
  127. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +9 -2
  128. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +9 -2
  129. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +9 -2
  130. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +9 -2
  131. snowflake/ml/modeling/linear_model/perceptron.py +9 -2
  132. snowflake/ml/modeling/linear_model/poisson_regressor.py +9 -2
  133. snowflake/ml/modeling/linear_model/ransac_regressor.py +9 -2
  134. snowflake/ml/modeling/linear_model/ridge.py +9 -2
  135. snowflake/ml/modeling/linear_model/ridge_classifier.py +9 -2
  136. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +9 -2
  137. snowflake/ml/modeling/linear_model/ridge_cv.py +9 -2
  138. snowflake/ml/modeling/linear_model/sgd_classifier.py +9 -2
  139. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +9 -2
  140. snowflake/ml/modeling/linear_model/sgd_regressor.py +9 -2
  141. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +9 -2
  142. snowflake/ml/modeling/linear_model/tweedie_regressor.py +9 -2
  143. snowflake/ml/modeling/manifold/isomap.py +9 -2
  144. snowflake/ml/modeling/manifold/mds.py +9 -2
  145. snowflake/ml/modeling/manifold/spectral_embedding.py +9 -2
  146. snowflake/ml/modeling/manifold/tsne.py +9 -2
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +9 -2
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +9 -2
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +1 -5
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +1 -5
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +9 -2
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +9 -2
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +9 -2
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +9 -2
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +9 -2
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +9 -2
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +9 -2
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +9 -2
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +9 -2
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +9 -2
  161. snowflake/ml/modeling/neighbors/kernel_density.py +9 -2
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +9 -2
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +9 -2
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +9 -2
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +9 -2
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +9 -2
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +9 -2
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +9 -2
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +9 -2
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +9 -2
  171. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  172. snowflake/ml/modeling/pipeline/pipeline.py +5 -0
  173. snowflake/ml/modeling/preprocessing/binarizer.py +7 -3
  174. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +7 -2
  175. snowflake/ml/modeling/preprocessing/label_encoder.py +8 -7
  176. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +7 -3
  177. snowflake/ml/modeling/preprocessing/min_max_scaler.py +7 -4
  178. snowflake/ml/modeling/preprocessing/normalizer.py +7 -3
  179. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +10 -2
  180. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +8 -5
  181. snowflake/ml/modeling/preprocessing/polynomial_features.py +9 -2
  182. snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -4
  183. snowflake/ml/modeling/preprocessing/standard_scaler.py +7 -3
  184. snowflake/ml/modeling/semi_supervised/label_propagation.py +9 -2
  185. snowflake/ml/modeling/semi_supervised/label_spreading.py +9 -2
  186. snowflake/ml/modeling/svm/linear_svc.py +9 -2
  187. snowflake/ml/modeling/svm/linear_svr.py +9 -2
  188. snowflake/ml/modeling/svm/nu_svc.py +9 -2
  189. snowflake/ml/modeling/svm/nu_svr.py +9 -2
  190. snowflake/ml/modeling/svm/svc.py +9 -2
  191. snowflake/ml/modeling/svm/svr.py +9 -2
  192. snowflake/ml/modeling/tree/decision_tree_classifier.py +9 -2
  193. snowflake/ml/modeling/tree/decision_tree_regressor.py +9 -2
  194. snowflake/ml/modeling/tree/extra_tree_classifier.py +9 -2
  195. snowflake/ml/modeling/tree/extra_tree_regressor.py +9 -2
  196. snowflake/ml/modeling/xgboost/xgb_classifier.py +9 -2
  197. snowflake/ml/modeling/xgboost/xgb_regressor.py +9 -2
  198. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +9 -2
  199. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +9 -2
  200. snowflake/ml/registry/_manager/model_manager.py +59 -1
  201. snowflake/ml/registry/registry.py +10 -1
  202. snowflake/ml/version.py +1 -1
  203. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/METADATA +32 -4
  204. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/RECORD +207 -204
  205. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/LICENSE.txt +0 -0
  206. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/WHEEL +0 -0
  207. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/top_level.txt +0 -0
@@ -248,7 +248,7 @@ class VotingRegressor(BaseTransformer):
                 inspect.currentframe(), VotingRegressor.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
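This one-line change is applied to every autogenerated estimator in this release: the `dict()` constructor over a list of tuples is replaced with a plain dict literal. The two spellings are equivalent; a standalone illustration (not from the package):

```python
# Both expressions build the same one-entry dictionary; the literal form is
# more idiomatic and skips building an intermediate list of tuples.
old_style = dict([("autogen", True)])
new_style = {"autogen": True}
assert old_style == new_style
```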
@@ -585,7 +585,14 @@ class VotingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
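The other change repeated across the estimators rewrites `_get_output_column_names`: the one-row sample is now restricted to `self.input_cols`, and its pandas labels are reset from the Snowpark schema so the sklearn object sees the same column names, in the same order, as during fit. A minimal sketch of the renaming step with hypothetical column names (the mismatch between `to_pandas()` labels and quoted Snowflake identifiers is my reading of the comment, not spelled out in the diff):

```python
import pandas as pd

# Hypothetical: to_pandas() materialized the column under a normalized label,
# while the Snowpark schema carries the quoted identifier seen during fit.
sample_pd_df = pd.DataFrame({"MYFEATURE": [1.0]})
snowpark_column_names = ['"myFeature"']  # e.g. dataset.select(input_cols).columns
sample_pd_df.columns = snowpark_column_names  # rename/reorder to match fit
assert list(sample_pd_df.columns) == ['"myFeature"']
```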
@@ -238,7 +238,7 @@ class GenericUnivariateSelect(BaseTransformer):
                 inspect.currentframe(), GenericUnivariateSelect.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -573,7 +573,14 @@ class GenericUnivariateSelect(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectFdr(BaseTransformer):
                 inspect.currentframe(), SelectFdr.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFdr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectFpr(BaseTransformer):
                 inspect.currentframe(), SelectFpr.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFpr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectFwe(BaseTransformer):
                 inspect.currentframe(), SelectFwe.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFwe(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -235,7 +235,7 @@ class SelectKBest(BaseTransformer):
                 inspect.currentframe(), SelectKBest.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -570,7 +570,14 @@ class SelectKBest(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectPercentile(BaseTransformer):
                 inspect.currentframe(), SelectPercentile.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectPercentile(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -292,7 +292,7 @@ class SequentialFeatureSelector(BaseTransformer):
                 inspect.currentframe(), SequentialFeatureSelector.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -627,7 +627,14 @@ class SequentialFeatureSelector(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -225,7 +225,7 @@ class VarianceThreshold(BaseTransformer):
                 inspect.currentframe(), VarianceThreshold.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -560,7 +560,14 @@ class VarianceThreshold(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -16,7 +16,7 @@ from snowflake.ml._internal.exceptions import (
     exceptions,
     modeling_error_messages,
 )
-from snowflake.ml._internal.lineage import data_source, lineage_utils
+from snowflake.ml._internal.lineage import lineage_utils
 from snowflake.ml._internal.utils import identifier, parallelize
 from snowflake.ml.modeling.framework import _utils
 from snowflake.snowpark import functions as F
@@ -386,7 +386,6 @@ class BaseEstimator(Base):
         self.file_names = file_names
         self.custom_states = custom_states
         self.sample_weight_col = sample_weight_col
-        self._data_sources: Optional[List[data_source.DataSource]] = None

         self.start_time = datetime.now().strftime(_utils.DATETIME_FORMAT)[:-3]

@@ -421,18 +420,14 @@ class BaseEstimator(Base):
         """
         return []

-    def _get_data_sources(self) -> Optional[List[data_source.DataSource]]:
-        return self._data_sources
-
     @telemetry.send_api_usage_telemetry(
         project=PROJECT,
         subproject=SUBPROJECT,
     )
     def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
         """Runs universal logics for all fit implementations."""
-        self._data_sources = getattr(dataset, lineage_utils.DATA_SOURCES_ATTR, None)
-        if self._data_sources:
-            assert all(isinstance(ds, data_source.DataSource) for ds in self._data_sources)
+        data_sources = lineage_utils.get_data_sources(dataset)
+        lineage_utils.set_data_sources(self, data_sources)
         return self._fit(dataset)

     @abstractmethod
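`BaseEstimator.fit` now delegates lineage bookkeeping to `lineage_utils.get_data_sources` and `lineage_utils.set_data_sources` instead of reading `DATA_SOURCES_ATTR` off the dataset directly. The helpers' bodies are not part of this hunk (they live in `lineage_utils.py`, changed above); a plausible sketch of such attribute-backed accessors, offered purely as an assumption:

```python
from typing import Any, List, Optional

# DATA_SOURCES_ATTR exists in lineage_utils (the removed code references it);
# its string value here is an assumption for illustration.
DATA_SOURCES_ATTR = "_data_sources"

def get_data_sources(obj: Any) -> Optional[List[Any]]:
    # Return the lineage list if the object carries one, else None.
    return getattr(obj, DATA_SOURCES_ATTR, None)

def set_data_sources(obj: Any, data_sources: Optional[List[Any]]) -> None:
    # Attach the lineage list so downstream consumers (e.g. model logging)
    # can recover where the training data came from.
    if data_sources:
        setattr(obj, DATA_SOURCES_ATTR, data_sources)
```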
@@ -320,7 +320,7 @@ class GaussianProcessClassifier(BaseTransformer):
                 inspect.currentframe(), GaussianProcessClassifier.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -653,7 +653,14 @@ class GaussianProcessClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -311,7 +311,7 @@ class GaussianProcessRegressor(BaseTransformer):
                 inspect.currentframe(), GaussianProcessRegressor.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -644,7 +644,14 @@ class GaussianProcessRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -353,7 +353,7 @@ class IterativeImputer(BaseTransformer):
                 inspect.currentframe(), IterativeImputer.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -688,7 +688,14 @@ class IterativeImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -279,7 +279,7 @@ class KNNImputer(BaseTransformer):
                 inspect.currentframe(), KNNImputer.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -614,7 +614,14 @@ class KNNImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -253,7 +253,7 @@ class MissingIndicator(BaseTransformer):
                 inspect.currentframe(), MissingIndicator.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -588,7 +588,14 @@ class MissingIndicator(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -102,10 +102,14 @@ class SimpleImputer(base.BaseTransformer):
             For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when
             imputing numerical data and `missing_value` for strings and object data types.
         input_cols: Optional[Union[str, List[str]]]
-            Columns to use as inputs during fit and transform.
+            The name(s) of one or more columns in the input DataFrame containing feature(s) to be imputed. Input
+            columns must be specified before fit with this argument or after initialization with the
+            `set_input_cols` method. This argument is optional for API consistency.
         output_cols: Optional[Union[str, List[str]]]
-            A string or list of strings representing column names that will store the output of transform operation.
-            The length of `output_cols` must equal the length of `input_cols`.
+            The name(s) to assign output columns in the output DataFrame. The number of
+            output columns specified must equal the number of input columns. Output columns must be specified before
+            transform with this argument or after initialization with the `set_output_cols` method. This argument is
+            optional for API consistency.
         passthrough_cols: A string or a list of strings indicating column names to be excluded from any
             operations (such as train, transform, or inference). These specified column(s)
             will remain untouched throughout the process. This option is helpful in scenarios
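The reworded docstring makes explicit that columns can be wired up either at construction or later through setters. A usage sketch under that reading (column names and strategy are illustrative):

```python
from snowflake.ml.modeling.impute import SimpleImputer

# Passing columns at construction...
imputer = SimpleImputer(strategy="mean", input_cols=["AGE"], output_cols=["AGE_IMPUTED"])

# ...or after initialization, via the setters the docstring points to.
imputer2 = SimpleImputer(strategy="mean")
imputer2.set_input_cols(["AGE"])
imputer2.set_output_cols(["AGE_IMPUTED"])
```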
@@ -158,6 +162,7 @@ class SimpleImputer(base.BaseTransformer):

         self.fill_value = fill_value
         self.missing_values = missing_values
+        self.statistics_: Dict[str, Any] = {}
         # TODO(hayu): [SNOW-752265] Support SimpleImputer keep_empty_features.
         # Add back when `keep_empty_features` is supported.
         # self.keep_empty_features = keep_empty_features
@@ -229,8 +234,27 @@ class SimpleImputer(base.BaseTransformer):

         return input_col_datatypes

+    def _fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "SimpleImputer":
+        if isinstance(dataset, snowpark.DataFrame):
+            return self._fit_snowpark(dataset)
+        else:
+            return self._fit_sklearn(dataset)
+
+    def _fit_sklearn(self, dataset: pd.DataFrame) -> "SimpleImputer":
+        dataset = self._use_input_cols_only(dataset)
+        sklearn_simple_imputer = self._create_sklearn_object()
+        sklearn_simple_imputer = sklearn_simple_imputer.fit(dataset)
+        self._sklearn_object = sklearn_simple_imputer
+        for input_col, fill_value in zip(self.input_cols, sklearn_simple_imputer.statistics_.tolist()):
+            self.statistics_[input_col] = fill_value
+        self._sklearn_fit_dtype = sklearn_simple_imputer._fit_dtype
+        self.n_features_in_ = len(self.input_cols)
+        self.feature_names_in_ = self.input_cols
+        self._is_fitted = True
+        return self
+
     @telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT)
-    def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
+    def _fit_snowpark(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
         """
         Compute values to impute for the dataset according to the strategy.

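With the old public `fit` renamed to `_fit_snowpark` and the new `_fit` dispatcher in place, a pandas DataFrame now takes a pure-sklearn path (the base class `fit` routes through `_fit`, per the `base.py` hunk above). A brief sketch of what that enables, with made-up data:

```python
import pandas as pd
from snowflake.ml.modeling.impute import SimpleImputer

df = pd.DataFrame({"AGE": [30.0, None, 50.0]})
imputer = SimpleImputer(strategy="mean", input_cols=["AGE"], output_cols=["AGE"])
imputer.fit(df)             # pandas input dispatches to _fit_sklearn
print(imputer.statistics_)  # expected: {"AGE": 40.0}
```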
@@ -245,7 +269,6 @@ class SimpleImputer(base.BaseTransformer):
         # In order to fit, the input columns should have the same type.
         input_col_datatypes = self._get_dataset_input_col_datatypes(dataset)

-        self.statistics_: Dict[str, Any] = {}
         statement_params = telemetry.get_statement_params(base.PROJECT, _SUBPROJECT, self.__class__.__name__)

         if self.strategy == "constant":
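Moving the `statistics_` initialization out of the fit path and into `__init__` (the `@@ -158,6 +162,7 @@` hunk above) means the attribute now exists on unfitted instances instead of appearing only after fit. A small behavioral sketch (illustrative):

```python
from snowflake.ml.modeling.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean", input_cols=["AGE"], output_cols=["AGE"])
# statistics_ is an empty dict before fit rather than a missing attribute;
# fit populates it with one fill value per input column.
print(imputer.statistics_)  # {}
```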
@@ -228,7 +228,7 @@ class AdditiveChi2Sampler(BaseTransformer):
                 inspect.currentframe(), AdditiveChi2Sampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -563,7 +563,14 @@ class AdditiveChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -276,7 +276,7 @@ class Nystroem(BaseTransformer):
                 inspect.currentframe(), Nystroem.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -611,7 +611,14 @@ class Nystroem(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -252,7 +252,7 @@ class PolynomialCountSketch(BaseTransformer):
                 inspect.currentframe(), PolynomialCountSketch.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -587,7 +587,14 @@ class PolynomialCountSketch(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -239,7 +239,7 @@ class RBFSampler(BaseTransformer):
                 inspect.currentframe(), RBFSampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -574,7 +574,14 @@ class RBFSampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -237,7 +237,7 @@ class SkewedChi2Sampler(BaseTransformer):
                 inspect.currentframe(), SkewedChi2Sampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -572,7 +572,14 @@ class SkewedChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -273,7 +273,7 @@ class KernelRidge(BaseTransformer):
                 inspect.currentframe(), KernelRidge.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -606,7 +606,14 @@ class KernelRidge(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col: