upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72a3659.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/vector.py +23 -1
- upgini/features_enricher.py +15 -8
- upgini/metrics.py +16 -9
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/RECORD +9 -9
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.71a3832.dev12"
+__version__ = "1.2.72a3659.dev1"
upgini/autofe/vector.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 import pandas as pd
 
@@ -22,3 +22,25 @@ class Sum(PandasOperator, VectorizableMixin):
 
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
+
+
+class OnnxModel(PandasOperator):
+    name: str = "onnx"
+    is_vector: bool = True
+    output_type: Optional[str] = "float"
+    model_name: str
+
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "model_name": self.model_name,
+            }
+        )
+        return res
+
+    # def load_model(self):
+    #     ...
+
+    # def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+    #     ...
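For orientation, a minimal sketch (not part of the diff) of how the new OnnxModel operator exposes its parameters, assuming PandasOperator fields are pydantic-style and using a made-up model name:

    from upgini.autofe.vector import OnnxModel

    # model_name is a required field of the operator; the value here is hypothetical
    op = OnnxModel(model_name="example_onnx_model")

    # get_params() extends the base operator parameters with the model reference,
    # so downstream code can tell which ONNX model the operator refers to
    params = op.get_params()
    print(params["model_name"])  # -> "example_onnx_model"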
upgini/features_enricher.py
CHANGED
@@ -3250,8 +3250,7 @@ if response.status_code == 200:
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair
 
         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
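A tiny illustration (not taken from the package) of what the unpacking fix above changes: the old assignment bound eval_X to the whole tuple instead of its first element, so later checks inspected the pair rather than the eval features.

    import pandas as pd

    eval_pair = (pd.DataFrame({"f": [1, 2]}), pd.Series([0, 1]))  # hypothetical (X, y) pair

    eval_X_old = eval_pair          # old behavior: eval_X was the tuple itself
    eval_X, eval_y = eval_pair      # fixed behavior: both elements are unpacked
    assert isinstance(eval_X, pd.DataFrame) and isinstance(eval_y, pd.Series)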
@@ -3872,15 +3871,23 @@ if response.status_code == 200:
 
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
 
+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        for feature_meta in features_meta:
+
             is_client_feature = feature_meta.name in df.columns
 
             # TODO make a decision about selected features based on special flag from mlb
@@ -3892,7 +3899,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
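A small illustration (names invented, not the enricher's code) of why the SHAP update above now runs after internal names are mapped back to the original ones: updated_shaps is keyed by the original client-facing names, so a lookup by the internal name would miss and fall back to 0.0 with a warning.

    # assumption: updated_shaps is keyed by original feature names
    updated_shaps = {"feature_a": 0.42}
    original_names_dict = {"f_autofe_feature_a": "feature_a"}

    internal_name = "f_autofe_feature_a"
    original_name = original_names_dict.get(internal_name, internal_name)

    print(updated_shaps.get(original_name))  # 0.42 -> becomes the new shap_value
    print(updated_shaps.get(internal_name))  # None -> warning is logged, shap_value set to 0.0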
upgini/metrics.py
CHANGED
@@ -8,13 +8,14 @@ from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+import lightgbm as lgb
 import numpy as np
 import pandas as pd
 from lightgbm import LGBMClassifier, LGBMRegressor
-import lightgbm as lgb
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -125,7 +126,7 @@ LIGHTGBM_MULTICLASS_PARAMS = {
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
-    "cat_l2"
+    "cat_l2": 8,
     "objective": "multiclass",
     "class_weight": "balanced",
     "use_quantized_grad": "true",
@@ -146,7 +147,7 @@ LIGHTGBM_BINARY_PARAMS = {
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
-    "cat_l2"
+    "cat_l2": 8,
     "verbosity": -1,
 }
 
@@ -754,6 +755,7 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.cat_encoder = None
         self.n_classes = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -764,10 +766,13 @@ class LightGBMWrapper(EstimatorWrapper):
         params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
         if self.cat_features:
-
-
-
-
+            x = fill_na_cat_features(x, self.cat_features)
+            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+            encoded = pd.DataFrame(
+                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+            )
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
         if not is_numeric_dtype(y_numpy):
             y_numpy = correct_string_target(y_numpy)
 
@@ -777,8 +782,10 @@ class LightGBMWrapper(EstimatorWrapper):
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
-
-            x[
+            if self.cat_encoder is not None:
+                x[self.cat_features] = pd.DataFrame(
+                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+                )
         if not is_numeric_dtype(y):
             y_numpy = correct_string_target(y_numpy)
         return x, y_numpy, params
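To make the new categorical handling concrete, here is a standalone sketch of the pattern the wrapper now applies (column names and data are illustrative, not from upgini): categories are ordinal-encoded with unknowns mapped to -1, cast to the pandas category dtype so LightGBM treats them as categorical, and the fitted encoder is reused at scoring time.

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    cat_features = ["city"]
    train = pd.DataFrame({"city": ["Berlin", "Paris", "Paris"]})
    test = pd.DataFrame({"city": ["Berlin", "Madrid"]})  # "Madrid" was never seen at fit time

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    train[cat_features] = pd.DataFrame(
        encoder.fit_transform(train[cat_features]), columns=cat_features, dtype="category"
    )
    # the same fitted encoder is applied at predict time, so unseen categories become -1 instead of raising
    test[cat_features] = pd.DataFrame(
        encoder.transform(test[cat_features]), columns=cat_features, dtype="category"
    )
    print(test["city"].tolist())  # [0.0, -1.0]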
upgini/utils/target_utils.py
CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
 def balance_undersample_forced(
     df: pd.DataFrame,
     target_column: str,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
 
 def balance_undersample_time_series_trunc(
     df: pd.DataFrame,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     sample_size: int,
     random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
     **kwargs,
 ):
     # Convert date column to datetime
+    if id_columns is None:
+        id_columns = [date_column]
     dates_df = df[id_columns + [date_column]].copy()
     dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
 
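A quick illustration (function and column names invented) of the new default above: when id_columns is not provided, the time-series truncation falls back to grouping by the date column alone, so df[id_columns + [date_column]] no longer fails on None.

    from typing import List, Optional

    def resolve_id_columns(id_columns: Optional[List[str]], date_column: str) -> List[str]:
        # mirrors the guard added in balance_undersample_time_series_trunc
        return [date_column] if id_columns is None else id_columns

    print(resolve_id_columns(None, "event_date"))         # ['event_date']
    print(resolve_id_columns(["user_id"], "event_date"))  # ['user_id']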
{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3832.dev12
+Version: 1.2.72a3659.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=n3Di7UqdUYABUquK0tXIme5xiFjO7fpJ3AKGXnT-Jec,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
 upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=jobZL_Hg7guufDYH2XdanxgbyJTuC9ZAMZodeptE3I4,39177
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -21,7 +21,7 @@ upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
 upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
-upgini/autofe/vector.py,sha256=
+upgini/autofe/vector.py,sha256=-aLI4cA5HI2p42Skj4Sfb3XAPAFfbcu7FjukWsxVFdM,1161
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
 upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
 upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
-upgini/utils/target_utils.py,sha256=
+upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
+upgini-1.2.72a3659.dev1.dist-info/METADATA,sha256=tuv9DtWEtwHVjoIMPK4LKOvrmaQ3suMZS43JeEcEDiY,49101
+upgini-1.2.72a3659.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.72a3659.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.72a3659.dev1.dist-info/RECORD,,
{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/WHEEL
File without changes

{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/licenses/LICENSE
File without changes