upgini 1.2.68a3832.dev6__py3-none-any.whl → 1.2.68a3832.dev7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.68a3832.dev6"
1
+ __version__ = "1.2.68a3832.dev7"
upgini/metrics.py CHANGED
@@ -1,11 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
4
3
  import inspect
5
4
  import logging
6
5
  import re
7
6
  from collections import defaultdict
8
7
  from copy import deepcopy
8
+ from dataclasses import dataclass
9
9
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
10
 
11
11
  import numpy as np
@@ -26,11 +26,8 @@ except ImportError:
26
26
  from sklearn.metrics._scorer import SCORERS
27
27
 
28
28
  available_scorers = SCORERS
29
- from sklearn.metrics._regression import (
30
- _check_reg_targets,
31
- check_consistent_length,
32
- )
33
29
  from sklearn.metrics import mean_squared_error
30
+ from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
34
31
  from sklearn.model_selection import BaseCrossValidator
35
32
 
36
33
  from upgini.errors import ValidationError
@@ -102,37 +99,43 @@ LIGHTGBM_PARAMS = {
102
99
  }
103
100
 
104
101
  LIGHTGBM_REGRESSION_PARAMS = {
105
- "random_state": DEFAULT_RANDOM_STATE,
106
- "n_estimators": 275,
107
- "max_depth": 5,
108
- "max_cat_threshold": 80,
109
- "min_data_per_group": 25,
110
- "cat_l2": 10,
111
- "cat_smooth": 12,
112
- "learning_rate": 0.05,
113
- "feature_fraction": 1.0,
114
- "min_sum_hessian_in_leaf": 0.01,
115
- "objective": "huber",
116
- "verbosity": 0,
102
+ "random_state": DEFAULT_RANDOM_STATE,
103
+ "deterministic": True,
104
+ "min_gain_to_split": 0.001,
105
+ "n_estimators": 275,
106
+ "max_depth": 5,
107
+ "max_cat_threshold": 80,
108
+ "min_data_per_group": 25,
109
+ "cat_l2": 10,
110
+ "cat_smooth": 12,
111
+ "learning_rate": 0.05,
112
+ "feature_fraction": 1.0,
113
+ "min_sum_hessian_in_leaf": 0.01,
114
+ "objective": "huber",
115
+ "verbosity": -1,
117
116
  }
118
117
 
119
118
  LIGHTGBM_MULTICLASS_PARAMS = {
120
119
  "random_state": DEFAULT_RANDOM_STATE,
120
+ "deterministic": True,
121
+ "min_gain_to_split": 0.001,
121
122
  "n_estimators": 275,
122
123
  "max_depth": 3,
123
124
  "max_cat_threshold": 80,
124
125
  "min_data_per_group": 25,
125
126
  "cat_l2": 10,
126
127
  "cat_smooth": 12,
127
- "learning_rate": 0.25, # CatBoost 0.25
128
+ "learning_rate": 0.25, # CatBoost 0.25
128
129
  "min_sum_hessian_in_leaf": 0.01,
129
- "objective": "multiclass",
130
- "class_weight": "balanced",
131
- "verbosity": 0,
130
+ "objective": "softmax",
131
+ "class_weight": "balanced", # TODO pass dict with weights for each class
132
+ "verbosity": -1,
132
133
  }
133
134
 
134
135
  LIGHTGBM_BINARY_PARAMS = {
135
136
  "random_state": DEFAULT_RANDOM_STATE,
137
+ "deterministic": True,
138
+ "min_gain_to_split": 0.001,
136
139
  "n_estimators": 275,
137
140
  "max_depth": 5,
138
141
  "max_cat_threshold": 80,
@@ -143,8 +146,8 @@ LIGHTGBM_BINARY_PARAMS = {
143
146
  "feature_fraction": 1.0,
144
147
  "min_sum_hessian_in_leaf": 0.01,
145
148
  "objective": "binary",
146
- "class_weight": "balanced",
147
- "verbosity": 0,
149
+ "class_weight": "balanced", # TODO pass dict with weights for each class
150
+ "verbosity": -1,
148
151
  }
149
152
 
150
153
  N_FOLDS = 5
@@ -266,6 +269,7 @@ SUPPORTED_CATBOOST_METRICS = {
266
269
  def is_catboost_estimator(estimator):
267
270
  try:
268
271
  from catboost import CatBoostClassifier, CatBoostRegressor
272
+
269
273
  return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
270
274
  except ImportError:
271
275
  return False
@@ -526,9 +530,7 @@ class EstimatorWrapper:
526
530
  logger.error(
527
531
  f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
528
532
  )
529
- estimator_copy.set_params(
530
- cat_features=cat_features
531
- )
533
+ estimator_copy.set_params(cat_features=cat_features)
532
534
  estimator = CatBoostWrapper(**kwargs)
533
535
  else:
534
536
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -576,8 +578,9 @@ class CatBoostWrapper(EstimatorWrapper):
576
578
  x, y, groups, params = super()._prepare_to_fit(x, y)
577
579
 
578
580
  # Find embeddings
579
- from catboost import CatBoostClassifier
580
581
  import catboost
582
+ from catboost import CatBoostClassifier
583
+
581
584
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
582
585
  emb_pattern = r"(.+)_emb\d+"
583
586
  self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -701,6 +704,7 @@ class CatBoostWrapper(EstimatorWrapper):
701
704
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
702
705
  try:
703
706
  from catboost import Pool
707
+
704
708
  # Create Pool for fold data, if need (for example, when categorical features are present)
705
709
  fold_pool = Pool(
706
710
  x,
@@ -760,6 +764,12 @@ class LightGBMWrapper(EstimatorWrapper):
760
764
  x, y, groups, params = super()._prepare_to_fit(x, y)
761
765
  if self.target_type == ModelTaskType.MULTICLASS:
762
766
  params["num_class"] = y.nunique()
767
+ emb_pattern = r"(.+)_emb\d+"
768
+ emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
769
+ max_bin_by_feature_type = {
770
+ feature: 63 if feature in emb_features else 255 for feature in x.columns
771
+ }
772
+ params["max_bin_by_feature_type"] = max_bin_by_feature_type
763
773
  self.cat_features = _get_cat_features(x)
764
774
  x = fill_na_cat_features(x, self.cat_features)
765
775
  for feature in self.cat_features:
@@ -781,8 +791,8 @@ class LightGBMWrapper(EstimatorWrapper):
781
791
 
782
792
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
783
793
  try:
784
- import shap
785
794
  import lightgbm as lgb
795
+ import shap
786
796
 
787
797
  if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
788
798
  return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.68a3832.dev6
3
+ Version: 1.2.68a3832.dev7
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=8CoP2d6NQy3RuFamWKCHcwiF2GYkyj5rtk6FIpBm0rI,33
1
+ upgini/__about__.py,sha256=CR4sN9ZhYNXzf0xJ61KtPk3O9k_pn-EljmzdfIfJyoM,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=1rb6BzyuiQFGVCTDmKL2wox3UFRNjtNaIJOwQnZ801A,34956
@@ -7,7 +7,7 @@ upgini/features_enricher.py,sha256=GXXx14jwf3F26_KrfJ6O40Vcu1hRx5iBjUB_jxy3Xvg,2
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
10
- upgini/metrics.py,sha256=ZBAjInLCm15BBYWNi9kz6IJs8R0WrF2PkrLnLAodR1Y,38246
10
+ upgini/metrics.py,sha256=onr-wFpP0idy0SH3Wxv2xnqxT5H5MiP70o44f1PhfFw,38808
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.68a3832.dev6.dist-info/METADATA,sha256=UWgAnrn5D5mQT6Js-sXVBCA1wPW7YZU-JEEcnRdUCHU,49149
74
- upgini-1.2.68a3832.dev6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.68a3832.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.68a3832.dev6.dist-info/RECORD,,
73
+ upgini-1.2.68a3832.dev7.dist-info/METADATA,sha256=LZ6mg6092FrqWu_yNWdMKTMgOC9lYclcPCQFLyMQAW0,49149
74
+ upgini-1.2.68a3832.dev7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.68a3832.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.68a3832.dev7.dist-info/RECORD,,