PyPI - upgini - Versions diffs - 1.2.71a3832.dev11__py3-none-any.whl → 1.2.71a3832.dev13__py3-none-any.whl - Mend

upgini 1.2.71a3832.dev11__py3-none-any.whl → 1.2.71a3832.dev13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.71a3832.dev11"
1	+ __version__ = "1.2.71a3832.dev13"

upgini/features_enricher.py CHANGED Viewed

@@ -12,6 +12,7 @@ import tempfile
 import time
 import uuid
 from collections import Counter
+from copy import deepcopy
 from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -3812,6 +3813,7 @@ if response.status_code == 200:
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         df = df.rename(columns=original_names_dict)
@@ -3854,6 +3856,7 @@ if response.status_code == 200:
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3869,15 +3872,23 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+        for feature_meta in features_meta:
             is_client_feature = feature_meta.name in df.columns
             # TODO make a decision about selected features based on special flag from mlb
@@ -3889,7 +3900,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
@@ -4039,11 +4050,11 @@ if response.status_code == 200:
         if len(filtered_importances) == 0:
             return []
-        filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
         if importance_threshold is not None:
             filtered_importances = [
-                (name, importance) for name, importance in filtered_importances if importance > importance_threshold
+                (name, importance)
+                for name, importance in filtered_importances.items()
+                if importance > importance_threshold
             ]
         if max_features is not None:
             filtered_importances = list(filtered_importances)[:max_features]

upgini/metrics.py CHANGED Viewed

@@ -14,7 +14,9 @@ from lightgbm import LGBMClassifier, LGBMRegressor
 import lightgbm as lgb
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
+# from sklearn.calibration import LabelEncoder
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from sklearn.preprocessing import OrdinalEncoder
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -754,6 +756,8 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        # self.cat_features_encoders = dict()
+        self.cat_encoder = None
         self.n_classes = None
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -765,9 +769,22 @@ class LightGBMWrapper(EstimatorWrapper):
         self.cat_features = _get_cat_features(x)
         if self.cat_features:
             params["categorical_feature"] = self.cat_features
-        x = fill_na_cat_features(x, self.cat_features)
-        for feature in self.cat_features:
-            x[feature] = x[feature].astype("category").cat.codes
+            # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features] Works
+            # params["categorical_feature"] = "notauto"
+            # params["categorical_feature"] = "name:" + ",".join(self.cat_features)  # Doesn't work
+            # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features] Doesn't work
+            # params["categorical_feature"] = ",".join(cat_indices)
+            pass
+            x = fill_na_cat_features(x, self.cat_features)
+            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+            encoded = encoder.fit_transform(x[self.cat_features], y_numpy)
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        # for feature in self.cat_features:
+        #     encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+        #     x[feature] = encoder.fit_transform(x[feature])
+        #     self.cat_features_encoders[feature] = encoder
+            # x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y_numpy):
             y_numpy = correct_string_target(y_numpy)
@@ -776,9 +793,19 @@ class LightGBMWrapper(EstimatorWrapper):
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
+            params["categorical_feature"] = self.cat_features
+            # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features]
+            # params["categorical_feature"] = "notauto"
+            # params["categorical_feature"] = "name:" + ",".join(self.cat_features)  # Doesn't work
+            # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features]
+            # params["categorical_feature"] = ",".join(cat_indices)
             x = fill_na_cat_features(x, self.cat_features)
-            for feature in self.cat_features:
-                x[feature] = x[feature].astype("category").cat.codes
+            if self.cat_encoder is not None:
+                x[self.cat_features] = self.cat_encoder.transform(x[self.cat_features])
+            # for feature in self.cat_features:
+            #     encoder = self.cat_features_encoders[feature]
+            #     x[feature] = encoder.transform(x[feature])
+                # x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
             y_numpy = correct_string_target(y_numpy)
         return x, y_numpy, params

upgini/utils/target_utils.py CHANGED Viewed

@@ -204,7 +204,7 @@ def balance_undersample(
 def balance_undersample_forced(
     df: pd.DataFrame,
     target_column: str,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
 def balance_undersample_time_series_trunc(
     df: pd.DataFrame,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     sample_size: int,
     random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
     **kwargs,
 ):
     # Convert date column to datetime
+    if id_columns is None:
+        id_columns = [date_column]
     dates_df = df[id_columns + [date_column]].copy()
     dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")

{upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3832.dev11
+Version: 1.2.71a3832.dev13
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=MPYFg9v0SOhqTxe0IfYh4m6Nh3TlmyfHR9sua58WXBM,34
+upgini/__about__.py,sha256=buorll9F2OX4EgV8VmlIrj09nqmsSmqAG8T8p6hRCls,34
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=oYOBaHIyPjm-EEZvJT9pU35_DW8bArEQKymZyhW8LbE,206592
+upgini/features_enricher.py,sha256=lk80Bx9U36lva6T4lPHBFk88ivrpZ-2uwwMwQg0LglE,207023
 upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=9AaQi7Yb22ZNnycUOAUpcP7TWF5Pfy_NGACcDj10aMs,38820
+upgini/metrics.py,sha256=ot6AhxfRRTzM-dNApWTvmteLBAmGjD9OyAuKmtUTprE,40630
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
-upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
+upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.71a3832.dev11.dist-info/METADATA,sha256=QuI4m49RjcWmDJ74fXMWfNqBKPXGKDsKGhhO_wR1Kfw,49102
-upgini-1.2.71a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.71a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.71a3832.dev11.dist-info/RECORD,,
+upgini-1.2.71a3832.dev13.dist-info/METADATA,sha256=JdRugxJAMW4KLyRuz7yIX_PqSz_nObynmhkW5-g_lVs,49102
+upgini-1.2.71a3832.dev13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.71a3832.dev13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.71a3832.dev13.dist-info/RECORD,,

{upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.71a3832.dev11__py3-none-any.whl → 1.2.71a3832.dev13__py3-none-any.whl

upgini 1.2.71a3832.dev11__py3-none-any.whl → 1.2.71a3832.dev13__py3-none-any.whl