PyPI - upgini - Versions diffs - 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72__py3-none-any.whl - Mend

upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.71a3832.dev12"
1	+ __version__ = "1.2.72"

upgini/features_enricher.py CHANGED Viewed

@@ -3250,8 +3250,7 @@ if response.status_code == 200:
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair[0]
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair
         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3872,15 +3871,23 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+        for feature_meta in features_meta:
             is_client_feature = feature_meta.name in df.columns
             # TODO make a decision about selected features based on special flag from mlb
@@ -3892,7 +3899,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):

upgini/metrics.py CHANGED Viewed

@@ -8,16 +8,18 @@ from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import lightgbm as lgb
 import numpy as np
 import pandas as pd
 from lightgbm import LGBMClassifier, LGBMRegressor
-import lightgbm as lgb
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from sklearn.preprocessing import OrdinalEncoder
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 try:
     from sklearn.metrics import get_scorer_names
@@ -29,7 +31,7 @@ except ImportError:
     available_scorers = SCORERS
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import BaseCrossValidator
+from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
@@ -83,22 +85,6 @@ CATBOOST_MULTICLASS_PARAMS = {
     "auto_class_weights": "Balanced",
 }
-LIGHTGBM_PARAMS = {
-    "random_state": DEFAULT_RANDOM_STATE,
-    # "num_leaves": 16,
-    # "n_estimators": 150,
-    # "min_child_weight": 1,
-    "max_depth": 4,
-    "max_cat_threshold": 80,
-    "min_data_per_group": 25,
-    "num_boost_round": 150,
-    "cat_l2": 10,
-    "cat_smooth": 12,
-    "learning_rate": 0.05,
-    "feature_fraction": 1.0,
-    "min_sum_hessian_in_leaf": 0.01,
-}
 LIGHTGBM_REGRESSION_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
     "deterministic": True,
@@ -125,9 +111,9 @@ LIGHTGBM_MULTICLASS_PARAMS = {
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
-    "cat_l2" : 8,
+    "cat_l2": 8,
     "objective": "multiclass",
-    "class_weight": "balanced",
+    # "class_weight": "balanced",
     "use_quantized_grad": "true",
     "num_grad_quant_bins": "8",
     "stochastic_rounding": "true",
@@ -141,12 +127,12 @@ LIGHTGBM_BINARY_PARAMS = {
     "max_depth": 5,
     "learning_rate": 0.05,
     "objective": "binary",
-    "class_weight": "balanced",
+    # "class_weight": "balanced",
     "deterministic": True,
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
-    "cat_l2" : 8,
+    "cat_l2": 8,
     "verbosity": -1,
 }
@@ -505,7 +491,8 @@ class EstimatorWrapper:
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
+                if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+                    params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
@@ -754,6 +741,7 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.cat_encoder = None
         self.n_classes = None
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -764,10 +752,13 @@ class LightGBMWrapper(EstimatorWrapper):
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
         if self.cat_features:
-            params["categorical_feature"] = self.cat_features
-        x = fill_na_cat_features(x, self.cat_features)
-        for feature in self.cat_features:
-            x[feature] = x[feature].astype("category").cat.codes
+            x = fill_na_cat_features(x, self.cat_features)
+            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+            encoded = pd.DataFrame(
+                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+            )
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
         if not is_numeric_dtype(y_numpy):
             y_numpy = correct_string_target(y_numpy)
@@ -777,8 +768,10 @@ class LightGBMWrapper(EstimatorWrapper):
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
-            for feature in self.cat_features:
-                x[feature] = x[feature].astype("category").cat.codes
+            if self.cat_encoder is not None:
+                x[self.cat_features] = pd.DataFrame(
+                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+                )
         if not is_numeric_dtype(y):
             y_numpy = correct_string_target(y_numpy)
         return x, y_numpy, params

upgini/utils/target_utils.py CHANGED Viewed

@@ -204,7 +204,7 @@ def balance_undersample(
 def balance_undersample_forced(
     df: pd.DataFrame,
     target_column: str,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
 def balance_undersample_time_series_trunc(
     df: pd.DataFrame,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     sample_size: int,
     random_state: int = 42,
@@ -297,6 +297,8 @@ def balance_undersample_time_series_trunc(
     time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
     **kwargs,
 ):
+    if id_columns is None:
+        id_columns = []
     # Convert date column to datetime
     dates_df = df[id_columns + [date_column]].copy()
     dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")

{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3832.dev12
+Version: 1.2.72
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=okx02f-XOrtKlpdtJnV-aHreGFvFkxZ5NQhd5zxvhMk,34
+upgini/__about__.py,sha256=-EK4ypqJTIRrg6g1P6PtLXT9vC4Vq7zblqFi389VgwA,23
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=DgWboHEhr5BQT87MaAo2iUtrhapP3iqczLeZtWLRkDs,206664
+upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
 upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=9AaQi7Yb22ZNnycUOAUpcP7TWF5Pfy_NGACcDj10aMs,38820
+upgini/metrics.py,sha256=a0bY4oTMb-MgB1yC1IuTcEtotKZxAxjgV_QV2Z4V8u4,38988
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
-upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
+upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,16650
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.71a3832.dev12.dist-info/METADATA,sha256=8jmuNEDPwjc-Wa6Bds0FjYqYgqf3LFMYyRGUDy5DME8,49102
-upgini-1.2.71a3832.dev12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.71a3832.dev12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.71a3832.dev12.dist-info/RECORD,,
+upgini-1.2.72.dist-info/METADATA,sha256=OpaT2gblO8qGzEJBNf36-dPwbedHPP93bX0fPAOMl38,49091
+upgini-1.2.72.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.72.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.72.dist-info/RECORD,,

{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72__py3-none-any.whl

upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72__py3-none-any.whl