PyPI - upgini - Versions diffs - 1.2.59a3818.dev1__tar.gz → 1.2.60__tar.gz - Mend

upgini 1.2.59a3818.dev1__tar.gz → 1.2.60__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (69)

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.59a3818.dev1
+Version: 1.2.60
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -30,6 +30,7 @@ Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
 Requires-Dist: numpy<=1.26.4,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
+Requires-Dist: psutil>=6.0.0
 Requires-Dist: pydantic<3.0.0,>1.0.0
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/pyproject.toml RENAMED Viewed

@@ -50,6 +50,7 @@ dependencies = [
     "xhtml2pdf>=0.2.11,<0.3.0",
     "jarowinkler>=2.0.0",
     "levenshtein>=0.25.1",
+    "psutil>=6.0.0",
 ]
 [project.urls]

upgini-1.2.60/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.2.60"

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/date.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import abc
 import json
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union
 import numpy as np
 import pandas as pd

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/vector.py RENAMED Viewed

@@ -55,7 +55,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
         ts.set_index(date.name, inplace=True)
         ts = ts[ts.index.notna()].sort_index()
         ts = (
-            ts.groupby([c.name for c in data[1:-1]], group_keys=True)
+            ts.groupby([c.name for c in data[1:-1]])
             .apply(self._shift)[data[-1].name]
             .to_frame()
             .reset_index()

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/data_source/data_source_publisher.py RENAMED Viewed

@@ -386,6 +386,7 @@ class DataSourcePublisher:
                 search_keys = [k.value.value for k in search_keys] if search_keys else None
                 request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
                 task_id = self._rest_client.upload_online(request, trace_id)
+                print(f"Uploading online task created. task_id={task_id}")
                 with Spinner():
                     status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
                     while status_response["status"] not in self.FINAL_STATUSES:

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/dataset.py RENAMED Viewed

@@ -587,15 +587,23 @@ class Dataset:  # (pd.DataFrame):
         if (
             runtime_parameters is not None
             and runtime_parameters.properties is not None
-            and "generate_features" in runtime_parameters.properties
         ):
-            generate_features = runtime_parameters.properties["generate_features"].split(",")
-            renamed_generate_features = []
-            for f in generate_features:
-                for new_column, orig_column in self.columns_renaming.items():
-                    if f == orig_column:
-                        renamed_generate_features.append(new_column)
-            runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
+            if "generate_features" in runtime_parameters.properties:
+                generate_features = runtime_parameters.properties["generate_features"].split(",")
+                renamed_generate_features = []
+                for f in generate_features:
+                    for new_column, orig_column in self.columns_renaming.items():
+                        if f == orig_column:
+                            renamed_generate_features.append(new_column)
+                runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
+            if "columns_for_online_api" in runtime_parameters.properties:
+                columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
+                renamed_columns_for_online_api = []
+                for f in columns_for_online_api:
+                    for new_column, orig_column in self.columns_renaming.items():
+                        if f == orig_column:
+                            renamed_columns_for_online_api.append(new_column)
+                runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
         return runtime_parameters

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/features_enricher.py RENAMED Viewed

@@ -112,6 +112,7 @@ try:
 except Exception:
     from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
+from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
     balance_undersample_forced,
     calculate_psi,
@@ -222,6 +223,7 @@ class FeaturesEnricher(TransformerMixin):
         loss: Optional[str] = None,
         detect_missing_search_keys: bool = True,
         generate_features: Optional[List[str]] = None,
+        columns_for_online_api: Optional[List[str]] = None,
         round_embeddings: Optional[int] = None,
         logs_enabled: bool = True,
         raise_validation_error: bool = True,
@@ -345,6 +347,9 @@ class FeaturesEnricher(TransformerMixin):
                     self.logger.error(msg)
                     raise ValidationError(msg)
                 self.runtime_parameters.properties["round_embeddings"] = round_embeddings
+        self.columns_for_online_api = columns_for_online_api
+        if columns_for_online_api is not None:
+            self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
         maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
         if maybe_downsampling_limit is not None:
             Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
@@ -1257,7 +1262,7 @@ class FeaturesEnricher(TransformerMixin):
             for feature, shap in new_shaps.items()
             if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
         }
-        self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
+        self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
         if self.features_info_display_handle is not None:
             try:
@@ -1564,9 +1569,23 @@ class FeaturesEnricher(TransformerMixin):
         fitting_eval_set_dict = {}
         fitting_x_columns = fitting_X.columns.to_list()
-        self.logger.info(f"Final list of fitting X columns: {fitting_x_columns}")
+        # Idempotently sort columns
+        fitting_x_columns = sort_columns(
+            fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
+        )
+        fitting_X = fitting_X[fitting_x_columns]
+        self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
-        self.logger.info(f"Final list of fitting enriched X columns: {fitting_enriched_x_columns}")
+        fitting_enriched_x_columns = sort_columns(
+            fitting_enriched_X,
+            enriched_y_sorted,
+            search_keys,
+            self.model_task_type,
+            sort_all_columns=True,
+            logger=self.logger,
+        )
+        fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
+        self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1730,11 +1749,15 @@ class FeaturesEnricher(TransformerMixin):
             if eval_set is not None
             else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
         )
+        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
+        # Sample after sorting by system_record_id for idempotency
+        df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
         if num_samples > sample_threshold:
             self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
             df = df.sample(n=sample_rows, random_state=self.random_state)
-        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -1873,15 +1896,12 @@ class FeaturesEnricher(TransformerMixin):
             # downsample if need to eval_set threshold
             num_samples = _num_samples(df)
-            phone_column = self._get_phone_column(self.search_keys)
             force_downsampling = (
                 not self.disable_force_downsampling
-                and self.generate_features is not None
-                and phone_column is not None
-                and self.fit_columns_renaming is not None
-                and self.fit_columns_renaming.get(phone_column) in self.generate_features
+                and self.columns_for_online_api is not None
                 and num_samples > Dataset.FORCE_SAMPLE_SIZE
             )
+            # TODO: check that system_record_id was added before this step
             if force_downsampling:
                 self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
                 df = balance_undersample_forced(
@@ -1915,6 +1935,7 @@ class FeaturesEnricher(TransformerMixin):
                 progress_bar=progress_bar,
                 progress_callback=progress_callback,
                 add_fit_system_record_id=True,
+                target_name=tmp_target_name,
             )
             if enriched_df is None:
                 return None
@@ -1948,7 +1969,28 @@ class FeaturesEnricher(TransformerMixin):
             df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
             num_samples = _num_samples(df)
-            if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            force_downsampling = (
+                not self.disable_force_downsampling
+                and self.columns_for_online_api is not None
+                and num_samples > Dataset.FORCE_SAMPLE_SIZE
+            )
+            if force_downsampling:
+                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+                df = balance_undersample_forced(
+                    df=df,
+                    target_column=TARGET,
+                    id_columns=self.id_columns,
+                    date_column=self._get_date_column(self.search_keys),
+                    task_type=self.model_task_type,
+                    cv_type=self.cv,
+                    random_state=self.random_state,
+                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                    logger=self.logger,
+                    bundle=self.bundle,
+                    warning_callback=self.__log_warning,
+                )
+            elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
                 self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
                 df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
@@ -1964,6 +2006,7 @@ class FeaturesEnricher(TransformerMixin):
                 progress_bar=progress_bar,
                 progress_callback=progress_callback,
                 add_fit_system_record_id=True,
+                target_name=tmp_target_name,
             )
             if enriched_Xy is None:
                 return None
@@ -2125,6 +2168,7 @@ if response.status_code == 200:
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         add_fit_system_record_id: bool = False,
+        target_name: Optional[str] = None,
     ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2309,8 +2353,16 @@ if response.status_code == 200:
                 and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
             ]
-            if add_fit_system_record_id:
-                df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
+            if add_fit_system_record_id and target_name is not None:
+                reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+                df = self.__add_fit_system_record_id(
+                    df,
+                    search_keys,
+                    SYSTEM_RECORD_ID,
+                    reversed_columns_renaming.get(target_name, target_name),
+                    columns_renaming,
+                    silent=True,
+                )
                 df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
                 features_not_to_pass.append(SORT_ID)
@@ -2620,17 +2672,18 @@ if response.status_code == 200:
             checked_generate_features = []
             for gen_feature in self.generate_features:
                 if gen_feature not in x_columns:
-                    if gen_feature == self._get_phone_column(self.search_keys):
-                        raise ValidationError(
-                            self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
-                        )
-                    else:
-                        self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
+                    msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
+                    self.__log_warning(msg)
                 else:
                     checked_generate_features.append(gen_feature)
             self.generate_features = checked_generate_features
             self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
+        if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
+            for column in self.columns_for_online_api:
+                if column not in validated_X.columns:
+                    raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
         if self.id_columns is not None:
             for id_column in self.id_columns:
                 if id_column not in validated_X.columns:
@@ -2754,7 +2807,9 @@ if response.status_code == 200:
             self.__log_warning(full_duplicates_warning)
         # Explode multiple search keys
-        df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
+        df = self.__add_fit_system_record_id(
+            df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
+        )
         # TODO check that this is correct for enrichment
         self.df_with_original_index = df.copy()
@@ -2836,7 +2891,9 @@ if response.status_code == 200:
         if eval_set is not None and len(eval_set) > 0:
             meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
-        df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
+        df = self.__add_fit_system_record_id(
+            df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
+        )
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -2852,9 +2909,7 @@ if response.status_code == 200:
         # Force downsampling to 7000 for API features generation
         force_downsampling = (
             not self.disable_force_downsampling
-            and self.generate_features is not None
-            and phone_column is not None
-            and self.fit_columns_renaming[phone_column] in self.generate_features
+            and self.columns_for_online_api is not None
             and len(df) > Dataset.FORCE_SAMPLE_SIZE
         )
         if force_downsampling:
@@ -3525,56 +3580,82 @@ if response.status_code == 200:
     def __add_fit_system_record_id(
         self,
         df: pd.DataFrame,
-        # meaning_types: Dict[str, FileColumnMeaningType],
         search_keys: Dict[str, SearchKey],
         id_name: str,
+        target_name: str,
+        columns_renaming: Dict[str, str],
+        silent: bool = False,
     ) -> pd.DataFrame:
-        # save original order or rows
         original_index_name = df.index.name
         index_name = df.index.name or DEFAULT_INDEX
         original_order_name = "original_order"
+        # Save original index
         df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
+        # Save original order
         df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
-        # order by date and idempotent order by other keys
-        if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
-            sort_exclude_columns = [
-                original_order_name,
-                ORIGINAL_INDEX,
-                EVAL_SET_INDEX,
-                TARGET,
-                "__target",
-                ENTITY_SYSTEM_RECORD_ID,
-            ]
-            if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-                date_column = DateTimeSearchKeyConverter.DATETIME_COL
-                sort_exclude_columns.append(self._get_date_column(search_keys))
-            else:
-                date_column = self._get_date_column(search_keys)
-            sort_columns = [date_column] if date_column is not None else []
+        # order by date and idempotent order by other keys and features
-            sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
-            sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
+        sort_exclude_columns = [
+            original_order_name,
+            ORIGINAL_INDEX,
+            EVAL_SET_INDEX,
+            TARGET,
+            "__target",
+            ENTITY_SYSTEM_RECORD_ID,
+        ]
+        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
+            date_column = DateTimeSearchKeyConverter.DATETIME_COL
+            sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
+        else:
+            date_column = FeaturesEnricher._get_date_column(search_keys)
+        sort_exclude_columns.append(date_column)
+        columns_to_sort = [date_column] if date_column is not None else []
+        do_sorting = True
+        if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
+            # Check duplicates by date and id_columns
+            reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
+            duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
+            if date_column is not None:
+                duplicate_check_columns.append(date_column)
-            other_columns = sorted(
-                [
-                    c
-                    for c in df.columns
-                    if c not in sort_columns
-                    and c not in sorted_other_keys
-                    and c not in sort_exclude_columns
-                    and df[c].nunique() > 1
-                ]
+            duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
+            if duplicates.any():
+                if not silent:
+                    self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
+                else:
+                    self.logger.warning(
+                        f"Found {duplicates.sum()} duplicate rows by date and ID columns: {duplicate_check_columns}."
+                        " Will not sort dataset"
+                    )
+                do_sorting = False
+            else:
+                columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
+                columns_to_hash = sort_columns(
+                    df[columns_to_hash],
+                    target_name,
+                    search_keys,
+                    self.model_task_type,
+                    sort_exclude_columns,
+                    logger=self.logger,
+                )
+        else:
+            columns_to_hash = sort_columns(
+                df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
             )
-            all_other_columns = sorted_other_keys + other_columns
+        if do_sorting:
             search_keys_hash = "search_keys_hash"
-            if len(all_other_columns) > 0:
-                sort_columns.append(search_keys_hash)
-                df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
-            df = df.sort_values(by=sort_columns)
+            if len(columns_to_hash) > 0:
+                factorized_df = df.copy()
+                for col in columns_to_hash:
+                    if col not in search_keys and not is_numeric_dtype(factorized_df[col]):
+                        factorized_df[col] = factorized_df[col].factorize(sort=True)[0]
+                df[search_keys_hash] = pd.util.hash_pandas_object(factorized_df[columns_to_hash], index=False)
+                columns_to_sort.append(search_keys_hash)
+            df = df.sort_values(by=columns_to_sort)
             if search_keys_hash in df.columns:
                 df.drop(columns=search_keys_hash, inplace=True)

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/metrics.py RENAMED Viewed

@@ -30,8 +30,8 @@ except ImportError:
 from sklearn.metrics._regression import (
     _check_reg_targets,
     check_consistent_length,
-    mean_squared_error,
 )
+from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import BaseCrossValidator
 from upgini.errors import ValidationError
@@ -289,9 +289,6 @@ class EstimatorWrapper:
         else:
             x, y = self._remove_empty_target_rows(x, y)
-        # Make order of columns idempotent
-        x = x[sorted(x.columns)]
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
@@ -569,7 +566,7 @@ class CatBoostWrapper(EstimatorWrapper):
             if all([isinstance(c, int) for c in estimator_cat_features]):
                 cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
                 cat_features_idx.update(estimator_cat_features)
-                self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
+                self.cat_features = [x.columns[idx] for idx in cat_features_idx]
             elif all([isinstance(c, str) for c in estimator_cat_features]):
                 self.cat_features = list(set(self.cat_features + estimator_cat_features))
             else:
@@ -940,13 +937,13 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     if (y_true < 0).any():
         raise ValidationError(bundle.get("metrics_msle_negative_target"))
-    return mean_squared_error(
+    mse = mean_squared_error(
         log1p(y_true),
         log1p(y_pred.clip(0)),
         sample_weight=sample_weight,
         multioutput=multioutput,
-        squared=squared,
     )
+    return mse if squared else np.sqrt(mse)
 def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/strings.properties RENAMED Viewed

@@ -35,6 +35,7 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
 loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
 loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
 multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
+date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
 group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
 current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
 # Errors
@@ -111,6 +112,7 @@ x_is_empty=X is empty
 y_is_empty=y is empty
 x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
 missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
+missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
 x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
 train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/datetime_utils.py RENAMED Viewed

@@ -166,6 +166,8 @@ class DateTimeSearchKeyConverter:
             # Drop intermediate columns if not needed
             df.drop(columns=["second", "minute", "hour"], inplace=True)
+        else:
+            keep_time = False
         for generated_feature in self.generated_features[:]:
             if df[generated_feature].dropna().nunique() <= 1:

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/email_utils.py RENAMED Viewed

@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
         else:
             df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
-        del self.search_keys[self.email_column]
-        if self.email_column in self.unnest_search_keys:
-            self.unnest_search_keys.remove(self.email_column)
+        # del self.search_keys[self.email_column]
+        # if self.email_column in self.unnest_search_keys:
+        #     self.unnest_search_keys.remove(self.email_column)
         one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
         df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
         self.columns_renaming[one_domain_name] = original_email_column
         self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
-        if self.email_converted_to_hem:
-            df = df.drop(columns=self.email_column)
-            del self.columns_renaming[self.email_column]
+        # if self.email_converted_to_hem:
+        #     df = df.drop(columns=self.email_column)
+        #     del self.columns_renaming[self.email_column]
         return df

upgini-1.2.60/src/upgini/utils/mstats.py ADDED Viewed

@@ -0,0 +1,177 @@
+import warnings
+from collections import namedtuple
+import numpy as np
+import numpy.ma as ma
+import scipy
+from joblib import Parallel, delayed
+from numpy import ndarray
+from psutil import cpu_count
+np.seterr(divide="ignore")
+warnings.simplefilter(action="ignore", category=RuntimeWarning)
+def _find_repeats(arr):
+    # This function assumes it may clobber its input.
+    if len(arr) == 0:
+        return np.array(0, np.float64), np.array(0, np.intp)
+    # XXX This cast was previously needed for the Fortran implementation,
+    # should we ditch it?
+    arr = np.asarray(arr, np.float64).ravel()
+    arr.sort()
+    # Taken from NumPy 1.9's np.unique.
+    change = np.concatenate(([True], arr[1:] != arr[:-1]))
+    unique = arr[change]
+    change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
+    freq = np.diff(change_idx)
+    atleast2 = freq > 1
+    return unique[atleast2], freq[atleast2]
+def find_repeats(arr):
+    # Make sure we get a copy. ma.compressed promises a "new array", but can
+    # actually return a reference.
+    compr = np.asarray(ma.compressed(arr), dtype=np.float64)
+    try:
+        need_copy = np.may_share_memory(compr, arr)
+    except AttributeError:
+        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
+        # while in numpy 1.8.2 and above it just (correctly) returns False.
+        need_copy = False
+    if need_copy:
+        compr = compr.copy()
+    return _find_repeats(compr)
+def rankdata(data, axis=None, use_missing=False):
+    def _rank1d(data, use_missing=False):
+        n = data.count()
+        rk = np.empty(data.size, dtype=float)
+        idx = data.argsort()
+        rk[idx[:n]] = np.arange(1, n + 1)
+        if use_missing:
+            rk[idx[n:]] = (n + 1) / 2.0
+        else:
+            rk[idx[n:]] = 0
+        repeats = find_repeats(data.copy())
+        for r in repeats[0]:
+            condition = (data == r).filled(False)
+            rk[condition] = rk[condition].mean()
+        return rk
+    data = ma.array(data, copy=False)
+    if axis is None:
+        if data.ndim > 1:
+            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
+        else:
+            return _rank1d(data, use_missing)
+    else:
+        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
+def _chk_asarray(a, axis):
+    # Always returns a masked array, raveled for axis=None
+    a = ma.asanyarray(a)
+    if axis is None:
+        a = ma.ravel(a)
+        outaxis = 0
+    else:
+        outaxis = axis
+    return a, outaxis
+SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
+# Taken from scipy.mstats with following tweaks:
+# 1. parallel pairwise computation
+# 2. custom masking
+def spearmanr(
+    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
+):
+    if not use_ties:
+        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
+    # Always returns a masked array, raveled if axis=None
+    x, axisout = _chk_asarray(x, axis)
+    if y is not None:
+        # Deal only with 2-D `x` case.
+        y, _ = _chk_asarray(y, axis)
+        if axisout == 0:
+            x = ma.column_stack((x, y))
+        else:
+            x = ma.row_stack((x, y))
+    if axisout == 1:
+        # To simplify the code that follow (always use `n_obs, n_vars` shape)
+        x = x.T
+    if nan_policy == "omit":
+        x = mask_fn(x)
+    def _spearmanr_2cols(x):
+        # Mask the same observations for all variables, and then drop those
+        # observations (can't leave them masked, rankdata is weird).
+        x = ma.mask_rowcols(x, axis=0)
+        x = x[~x.mask.any(axis=1), :]
+        # If either column is entirely NaN or Inf
+        if not np.any(x.data):
+            return SpearmanrResult(np.nan, np.nan)
+        m = ma.getmask(x)
+        n_obs = x.shape[0]
+        dof = n_obs - 2 - int(m.sum(axis=0)[0])
+        if dof < 0:
+            return SpearmanrResult(np.nan, np.nan)
+        # Gets the ranks and rank differences
+        x_ranked = rankdata(x, axis=0)
+        rs = ma.corrcoef(x_ranked, rowvar=False).data
+        # rs can have elements equal to 1, so avoid zero division warnings
+        with np.errstate(divide="ignore"):
+            # clip the small negative values possibly caused by rounding
+            # errors before taking the square root
+            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
+        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
+        # For backwards compatibility, return scalars when comparing 2 columns
+        if rs.shape == (2, 2):
+            return SpearmanrResult(rs[1, 0], prob[1, 0])
+        else:
+            return SpearmanrResult(rs, prob)
+    # Need to do this per pair of variables, otherwise the dropped observations
+    # in a third column mess up the result for a pair.
+    n_vars = x.shape[1]
+    if n_vars == 2:
+        return _spearmanr_2cols(x)
+    else:
+        max_cpu_cores = cpu_count(logical=False)
+        with np.errstate(divide="ignore"):
+            results = Parallel(n_jobs=max_cpu_cores)(
+                delayed(_spearmanr_2cols)(x[:, [var1, var2]])
+                for var1 in range(n_vars - 1)
+                for var2 in range(var1 + 1, n_vars)
+            )
+        rs = np.ones((n_vars, n_vars), dtype=float)
+        prob = np.zeros((n_vars, n_vars), dtype=float)
+        for var1 in range(n_vars - 1):
+            for var2 in range(var1 + 1, n_vars):
+                result = results.pop(0)
+                rs[var1, var2] = result.correlation
+                rs[var2, var1] = result.correlation
+                prob[var1, var2] = result.pvalue
+                prob[var2, var1] = result.pvalue
+        return SpearmanrResult(rs, prob)

upgini-1.2.60/src/upgini/utils/sort.py ADDED Viewed

@@ -0,0 +1,172 @@
+import hashlib
+import logging
+from typing import Any, Dict, List, Optional, Union
+import numpy as np
+import pandas as pd
+from joblib import Parallel, delayed
+from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
+from psutil import cpu_count
+from scipy.stats import skew, spearmanr
+from upgini.metadata import ModelTaskType, SearchKey
+from upgini.utils import mstats
+def sort_columns(
+    df: pd.DataFrame,
+    target_column: Union[str, pd.Series],
+    search_keys: Dict[str, SearchKey],
+    model_task_type: ModelTaskType,
+    exclude_columns: Optional[List[str]] = None,
+    sort_all_columns: bool = False,
+    logger: Optional[logging.Logger] = None,
+) -> List[str]:
+    """Return the column names of ``df`` in a deterministic, relevance-based order.
+
+    Search-key columns that exist in ``df`` and are not excluded come first,
+    ordered by the string form of their ``SearchKey``. Every other non-excluded
+    column follows, in descending order of the ``(correlation, hash)`` pair
+    produced by ``get_sort_columns_dict``; columns absent from that mapping are
+    left out of the result.
+
+    Args:
+        df: Input frame. A copy is taken, so the caller's frame is not modified.
+        target_column: Target values, or the name of the target column in ``df``.
+        search_keys: Mapping of column name to its ``SearchKey``.
+        model_task_type: Task type, forwarded to ``prepare_target``.
+        exclude_columns: Column names that must not appear in the result.
+        sort_all_columns: When true, columns with at most one distinct value
+            are considered as well; the flag is also forwarded to
+            ``get_sort_columns_dict``.
+        logger: Destination for diagnostics. When omitted, the module logger
+            is used with its level raised to ``FATAL``, i.e. silenced.
+
+    Returns:
+        The ordered list of column names.
+    """
+    if exclude_columns is None:
+        exclude_columns = []
+    if logger is None:
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.FATAL)
+    df = df.copy()  # avoid side effects
+    # Check multiple search keys
+    search_key_values = list(search_keys.values())
+    has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
+    if has_duplicate_search_keys:
+        # Report through the resolved logger: the module-level logging.warning()
+        # writes to the root logger and ignores the ``logger`` argument.
+        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
+    sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
+    sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
+    other_columns = sorted(
+        [
+            c
+            for c in df.columns
+            # Test the flag first so the per-column nunique() scan is skipped
+            # whenever it cannot change the outcome.
+            if c not in sorted_keys and c not in exclude_columns and (sort_all_columns or df[c].nunique() > 1)
+        ]
+    )
+    target = target_column if isinstance(target_column, pd.Series) else df[target_column]
+    target = prepare_target(target, model_task_type)
+    sort_dict = get_sort_columns_dict(
+        df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
+    )
+    # Keep only the columns that actually received a score.
+    other_columns = [c for c in other_columns if c in sort_dict]
+    columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
+    return columns_for_sort
+def get_sort_columns_dict(
+    df: pd.DataFrame,
+    target: pd.Series,
+    sorted_keys: List[str],
+    omit_nan: bool,
+    n_jobs: Optional[int] = None,
+    sort_all_columns: bool = False,
+) -> Dict[str, Any]:
+    """Compute a sort score for every rankable column of ``df``.
+
+    Numeric columns outside ``sorted_keys`` are always scored. Non-numeric
+    columns outside ``sorted_keys`` are factorized to integer codes (written
+    back into ``df``) and scored only when the remaining columns contain
+    duplicate rows or ``sort_all_columns`` is set.
+
+    Args:
+        df: Frame holding the key columns and the candidate columns.
+        target: Target values, forwarded to ``get_sort_columns_correlations``.
+        sorted_keys: Key columns; they are never scored.
+        omit_nan: Forwarded to ``get_sort_columns_correlations``.
+        n_jobs: Forwarded to ``get_sort_columns_correlations``.
+        sort_all_columns: Force scoring of the non-numeric columns.
+
+    Returns:
+        Mapping of column name to ``(correlation, hash)``; empty when no
+        column is rankable.
+    """
+    non_numeric = [
+        name for name in df.select_dtypes(exclude=[np.number]).columns if name not in sorted_keys
+    ]
+    skipped = sorted_keys + non_numeric
+    candidates = [name for name in df.columns if name not in skipped]
+    if non_numeric:
+        distinct_rows = len(df.drop(columns=non_numeric).drop_duplicates())
+        if len(df) > distinct_rows or sort_all_columns:
+            # factorize string features
+            for name in non_numeric:
+                codes = df[name].factorize(sort=True)[0]
+                df.loc[:, name] = pd.Series(codes, index=df.index, dtype="int")
+            candidates.extend(non_numeric)
+    if not candidates:
+        return {}
+    df = df[candidates]
+    # The hash is taken before the float32 conversion and acts as the second
+    # element of the sort key.
+    column_hashes = [hash_series(df[name]) for name in candidates]
+    values = np.asarray(df, dtype=np.float32)
+    correlations = get_sort_columns_correlations(values, target, omit_nan, n_jobs)
+    return {name: (corr, digest) for name, corr, digest in zip(candidates, correlations, column_hashes)}
+def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None):
+    """Return, for each column of ``df``, the largest score from ``get_target_correlations``.
+
+    The scores are requested with ``precision=7`` and reduced with a maximum
+    over the first axis, giving one value per column.
+    """
+    per_method_scores = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
+    best_per_column = np.max(per_method_scores, axis=0)
+    return best_per_column
+def get_target_correlations(
+    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None, precision: int = 15
+):
+    """Score every column of ``df`` against ``target`` with two correlation measures.
+
+    Row 0 of the result holds the values returned by
+    ``calculate_spearman_corr_with_target``; row 1 holds the absolute
+    ``np.corrcoef`` (Pearson) correlation of each column with ``target``.
+    NaN scores are replaced by 0 and every score is truncated (not rounded)
+    to ``precision`` decimal places.
+
+    Returns:
+        Array of shape ``(2, number_of_columns)``.
+    """
+    features = np.asarray(df, dtype=np.float32)
+    scores = np.zeros((2, features.shape[1]))
+    spearman = calculate_spearman_corr_with_target(features, target, omit_nan, n_jobs)
+    scores[0, :] = np.nan_to_num(spearman, copy=False)
+    # The last row of the joint matrix is the target against every feature;
+    # its final entry (target against itself) is dropped.
+    joint = np.corrcoef(features.T, target.T, rowvar=True)
+    scores[1, :] = np.nan_to_num(np.abs(joint[-1, :-1]))
+    scale = 10**precision
+    return np.trunc(scores * scale) / scale
+def calculate_spearman_corr_with_target(
+    X: Union[pd.DataFrame, np.ndarray], y: pd.Series, omit_nan: bool = False, n_jobs: Optional[int] = None
+) -> np.ndarray:
+    """Return the absolute Spearman correlation of each column of ``X`` with ``y``.
+
+    Columns whose values all equal their first value are never computed and
+    keep NaN. With ``omit_nan`` false, columns containing any NaN are skipped
+    as well and also keep NaN.
+
+    Args:
+        X: Feature matrix; a DataFrame is converted to a float32 array.
+        y: Target values.
+        omit_nan: When true, each column is scored on its own through
+            ``mstats.spearmanr`` with ``nan_policy="omit"``, in parallel jobs.
+        n_jobs: Number of parallel jobs; defaults to the physical core count
+            reported by ``cpu_count(logical=False)``.
+
+    Returns:
+        One score per column of ``X``; an empty array when ``X`` has no elements.
+    """
+    if isinstance(X, pd.DataFrame):
+        X = np.asarray(X, dtype=np.float32)
+    if X.size == 0:
+        return np.empty(0)
+    # Columns that are never computed keep NaN as their score.
+    scores = np.full(X.shape[1], np.nan)
+    varying = [column.size > 0 and not (column == column[0]).all() for column in X.T]
+    selected = np.flatnonzero(varying)
+    if omit_nan:
+        workers = n_jobs or cpu_count(logical=False)
+        jobs = (
+            delayed(mstats.spearmanr)(
+                X[:, idx],
+                y,
+                nan_policy="omit",
+                axis=0,
+            )
+            for idx in selected
+        )
+        computed = np.array([abs(outcome.correlation) for outcome in Parallel(n_jobs=workers)(jobs)])
+    else:
+        nan_free = ~np.isnan(X[:, selected]).any(axis=0)
+        selected = selected[nan_free]
+        raw = calculate_spearman(X[:, selected], y, nan_policy="raise")
+        if isinstance(raw, float):
+            # A bare float comes back when only one pairing was evaluated.
+            computed = np.abs([raw])
+        else:
+            # Otherwise a matrix: the last row pairs y with every column and
+            # its final entry (y with itself) is dropped.
+            computed = np.abs(raw)[-1, :-1]
+    scores[selected] = computed
+    return scores
+def calculate_spearman(X: np.ndarray, y: Optional[pd.Series], nan_policy: str):
+    """Return the Spearman correlation of the columns of ``X`` (plus ``y`` when given).
+
+    When fewer than two variables are available in total, ``1.0`` is returned
+    without calling ``spearmanr``; otherwise the ``correlation`` attribute of
+    the ``spearmanr`` result is returned unchanged.
+    """
+    variable_count = X.shape[1] + (0 if y is None else 1)
+    if variable_count >= 2:
+        return spearmanr(X, y, nan_policy=nan_policy).correlation
+    return 1.0
+def hash_series(series: pd.Series) -> int:
+    """Return an integer digest of ``series`` covering both its values and its index.
+
+    The per-row hashes from ``pd.util.hash_pandas_object`` are fed to SHA-256
+    and the hexadecimal digest is interpreted as an integer.
+    """
+    row_hashes = pd.util.hash_pandas_object(series, index=True).values
+    hex_digest = hashlib.sha256(row_hashes).hexdigest()
+    return int(hex_digest, 16)
+def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
+    """Convert ``target`` into a numeric series suitable for correlation scoring.
+
+    For any task other than regression, or when the target is neither numeric
+    nor datetime, the values are cast to strings and replaced by their
+    category codes. For a regression target that is non-negative with absolute
+    skewness (rounded to two decimals) of at least 0.9, ``log1p`` is applied.
+    The original series name is preserved.
+    """
+    original_name = target.name
+    encode_as_categories = model_task_type != ModelTaskType.REGRESSION or not (
+        is_numeric_dtype(target) or is_datetime64_any_dtype(target)
+    )
+    if encode_as_categories:
+        target = target.astype(str).astype("category").cat.codes
+    else:
+        # NOTE(review): datetime targets also reach this branch - confirm that
+        # skew() and np.log1p() behave as intended for datetime64 data.
+        skewness = round(abs(skew(target)), 2)
+        if target.min() >= 0 and skewness >= 0.9:
+            target = np.log1p(target)
+    return pd.Series(target, name=original_name)

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/target_utils.py RENAMED Viewed

@@ -1,4 +1,3 @@
-import itertools
 import logging
 from typing import Callable, List, Optional, Union
@@ -207,7 +206,7 @@ def balance_undersample_forced(
     id_columns: List[str],
     date_column: str,
     task_type: ModelTaskType,
-    cv_type: CVType | None,
+    cv_type: Optional[CVType],
     random_state: int,
     sample_size: int = 7000,
     logger: Optional[logging.Logger] = None,
@@ -319,7 +318,8 @@ def balance_undersample_time_series(
     if len(id_counts) < min_different_ids:
         if logger is not None:
             logger.info(
-                f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
+                f"Different ids count {len(id_counts)} for sample size {sample_size}"
+                f" is less than min different ids {min_different_ids}, sampling time window"
             )
         date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
         ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index

upgini-1.2.59a3818.dev1/src/upgini/__about__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- __version__ = "1.2.59a3818.dev1"

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/.gitignore RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/LICENSE RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/README.md RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/ads.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/ads_management/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/ads_management/ads_manager.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/all_operands.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/binary.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/feature.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/groupby.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/operand.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/unary.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/data_source/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/errors.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/http.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/lazy_import.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/mdc/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/mdc/context.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/metadata.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/normalizer/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/normalizer/normalize_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/exceptions.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/strings_widget.properties RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/base.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/random_under_sampler.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/search_task.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/spinner.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/Roboto-Regular.ttf RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/__init__.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/base_search_key_detector.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/blocked_time_series.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/country_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/custom_loss_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/cv_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/deduplicate_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/display_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/fallback_progress_bar.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/feature_info.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/features_validator.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/format.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/ip_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/phone_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/postal_code_utils.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/progress_bar.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/sklearn_ext.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/track_info.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/warning_counter.py RENAMED Viewed

File without changes

{upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/version_validator.py RENAMED Viewed

File without changes

upgini 1.2.59a3818.dev1__tar.gz → 1.2.60__tar.gz

Potentially problematic release.

upgini 1.2.59a3818.dev1__tar.gz → 1.2.60__tar.gz