upgini-1.2.68a3832.dev5-py3-none-any.whl → upgini-1.2.68a3832.dev7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini might be problematic.
- upgini/__about__.py +1 -1
- upgini/metrics.py +72 -14
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/METADATA +1 -1
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/RECORD +6 -6
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/WHEEL +0 -0
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.68a3832.dev5"
+__version__ = "1.2.68a3832.dev7"
upgini/metrics.py
CHANGED
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 import inspect
 import logging
 import re
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -26,11 +26,8 @@ except ImportError:
     from sklearn.metrics._scorer import SCORERS
 
     available_scorers = SCORERS
-from sklearn.metrics._regression import (
-    _check_reg_targets,
-    check_consistent_length,
-)
 from sklearn.metrics import mean_squared_error
+from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
 from sklearn.model_selection import BaseCrossValidator
 
 from upgini.errors import ValidationError
@@ -101,6 +98,58 @@ LIGHTGBM_PARAMS = {
     "min_sum_hessian_in_leaf": 0.01,
 }
 
+LIGHTGBM_REGRESSION_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "huber",
+    "verbosity": -1,
+}
+
+LIGHTGBM_MULTICLASS_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 3,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.25,  # CatBoost 0.25
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "softmax",
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "verbosity": -1,
+}
+
+LIGHTGBM_BINARY_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "binary",
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "verbosity": -1,
+}
+
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
 
@@ -220,6 +269,7 @@ SUPPORTED_CATBOOST_METRICS = {
 def is_catboost_estimator(estimator):
     try:
         from catboost import CatBoostClassifier, CatBoostRegressor
+
         return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
     except ImportError:
         return False
@@ -441,28 +491,27 @@ class EstimatorWrapper:
         }
         if estimator is None:
             params = {}
-            params["has_time"] = has_date
             # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
             #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
             if target_type == ModelTaskType.MULTICLASS:
                 # params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
                 # params = _get_add_params(params, add_params)
                 # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
                 # params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
                 # params = _get_add_params(params, add_params)
                 # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
                 # params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
                 # params = _get_add_params(params, add_params)
                 # estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
@@ -474,15 +523,14 @@ class EstimatorWrapper:
             estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
             if is_catboost_estimator(estimator):
+                params["has_time"] = has_date
                 if cat_features is not None:
                     for cat_feature in cat_features:
                         if cat_feature not in x.columns:
                             logger.error(
                                 f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                             )
-                    estimator_copy.set_params(
-                        cat_features=cat_features
-                    )
+                    estimator_copy.set_params(cat_features=cat_features)
                 estimator = CatBoostWrapper(**kwargs)
             else:
                 if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -530,8 +578,9 @@ class CatBoostWrapper(EstimatorWrapper):
         x, y, groups, params = super()._prepare_to_fit(x, y)
 
         # Find embeddings
-        from catboost import CatBoostClassifier
         import catboost
+        from catboost import CatBoostClassifier
+
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -655,6 +704,7 @@ class CatBoostWrapper(EstimatorWrapper):
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
             from catboost import Pool
+
             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
                 x,
@@ -712,6 +762,14 @@ class LightGBMWrapper(EstimatorWrapper):
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
+        if self.target_type == ModelTaskType.MULTICLASS:
+            params["num_class"] = y.nunique()
+        emb_pattern = r"(.+)_emb\d+"
+        emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
+        max_bin_by_feature_type = {
+            feature: 63 if feature in emb_features else 255 for feature in x.columns
+        }
+        params["max_bin_by_feature_type"] = max_bin_by_feature_type
         self.cat_features = _get_cat_features(x)
         x = fill_na_cat_features(x, self.cat_features)
         for feature in self.cat_features:
@@ -733,8 +791,8 @@ class LightGBMWrapper(EstimatorWrapper):
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
-            import shap
             import lightgbm as lgb
+            import shap
 
             if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
                 return None
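For context, here is a minimal sketch (not upgini's actual API) of how a per-task parameter dictionary such as the LIGHTGBM_BINARY_PARAMS added above would typically be combined with caller overrides and handed to a LightGBM estimator, mirroring the _get_add_params(params, add_params) pattern in the hunks. The DEFAULT_RANDOM_STATE value and the glue code are illustrative assumptions, and the dictionary is abbreviated from the diff.

# Illustrative sketch only; requires lightgbm and scikit-learn.
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

DEFAULT_RANDOM_STATE = 42  # assumption: the real constant is defined in upgini.metrics

# Abbreviated copy of the LIGHTGBM_BINARY_PARAMS dict introduced in this release.
LIGHTGBM_BINARY_PARAMS = {
    "random_state": DEFAULT_RANDOM_STATE,
    "n_estimators": 275,
    "max_depth": 5,
    "learning_rate": 0.05,
    "objective": "binary",
    "class_weight": "balanced",
    "verbosity": -1,
}

# Caller-supplied overrides win over the defaults, as in
# params = _get_add_params(params, add_params) above.
add_params = {"n_estimators": 100}
params = {**LIGHTGBM_BINARY_PARAMS, **add_params}

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
model = LGBMClassifier(**params)  # unknown kwargs are passed through to LightGBM as booster params
model.fit(X, y)
print(model.predict(X[:5]))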
{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.68a3832.dev5
+Version: 1.2.68a3832.dev7
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=CR4sN9ZhYNXzf0xJ61KtPk3O9k_pn-EljmzdfIfJyoM,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=1rb6BzyuiQFGVCTDmKL2wox3UFRNjtNaIJOwQnZ801A,34956
@@ -7,7 +7,7 @@ upgini/features_enricher.py,sha256=GXXx14jwf3F26_KrfJ6O40Vcu1hRx5iBjUB_jxy3Xvg,2
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=onr-wFpP0idy0SH3Wxv2xnqxT5H5MiP70o44f1PhfFw,38808
 upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.68a3832.
-upgini-1.2.68a3832.
-upgini-1.2.68a3832.
-upgini-1.2.68a3832.
+upgini-1.2.68a3832.dev7.dist-info/METADATA,sha256=LZ6mg6092FrqWu_yNWdMKTMgOC9lYclcPCQFLyMQAW0,49149
+upgini-1.2.68a3832.dev7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.68a3832.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.68a3832.dev7.dist-info/RECORD,,
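For reference, each RECORD entry has the form path,sha256=<digest>,<size-in-bytes>, where the digest is the urlsafe-base64-encoded SHA-256 hash of the file with the trailing '=' padding stripped. A short sketch (the helper name is hypothetical):

# Sketch of how a wheel RECORD line is derived from file contents.
import base64
import hashlib

def record_entry(path: str, data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{len(data)}"

# If upgini/__about__.py contains exactly this line, the result should
# reproduce the new RECORD entry above, including the 33-byte size.
print(record_entry("upgini/__about__.py", b'__version__ = "1.2.68a3832.dev7"\n'))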
{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/WHEEL
File without changes

{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/licenses/LICENSE
File without changes