upgini 1.2.68a3832.dev5__py3-none-any.whl → 1.2.68a3832.dev7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.68a3832.dev5"
1
+ __version__ = "1.2.68a3832.dev7"
upgini/metrics.py CHANGED
@@ -1,11 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
4
3
  import inspect
5
4
  import logging
6
5
  import re
7
6
  from collections import defaultdict
8
7
  from copy import deepcopy
8
+ from dataclasses import dataclass
9
9
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
10
 
11
11
  import numpy as np
@@ -26,11 +26,8 @@ except ImportError:
26
26
  from sklearn.metrics._scorer import SCORERS
27
27
 
28
28
  available_scorers = SCORERS
29
- from sklearn.metrics._regression import (
30
- _check_reg_targets,
31
- check_consistent_length,
32
- )
33
29
  from sklearn.metrics import mean_squared_error
30
+ from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
34
31
  from sklearn.model_selection import BaseCrossValidator
35
32
 
36
33
  from upgini.errors import ValidationError
@@ -101,6 +98,58 @@ LIGHTGBM_PARAMS = {
101
98
  "min_sum_hessian_in_leaf": 0.01,
102
99
  }
103
100
 
101
+ LIGHTGBM_REGRESSION_PARAMS = {
102
+ "random_state": DEFAULT_RANDOM_STATE,
103
+ "deterministic": True,
104
+ "min_gain_to_split": 0.001,
105
+ "n_estimators": 275,
106
+ "max_depth": 5,
107
+ "max_cat_threshold": 80,
108
+ "min_data_per_group": 25,
109
+ "cat_l2": 10,
110
+ "cat_smooth": 12,
111
+ "learning_rate": 0.05,
112
+ "feature_fraction": 1.0,
113
+ "min_sum_hessian_in_leaf": 0.01,
114
+ "objective": "huber",
115
+ "verbosity": -1,
116
+ }
117
+
118
+ LIGHTGBM_MULTICLASS_PARAMS = {
119
+ "random_state": DEFAULT_RANDOM_STATE,
120
+ "deterministic": True,
121
+ "min_gain_to_split": 0.001,
122
+ "n_estimators": 275,
123
+ "max_depth": 3,
124
+ "max_cat_threshold": 80,
125
+ "min_data_per_group": 25,
126
+ "cat_l2": 10,
127
+ "cat_smooth": 12,
128
+ "learning_rate": 0.25, # CatBoost 0.25
129
+ "min_sum_hessian_in_leaf": 0.01,
130
+ "objective": "softmax",
131
+ "class_weight": "balanced", # TODO pass dict with weights for each class
132
+ "verbosity": -1,
133
+ }
134
+
135
+ LIGHTGBM_BINARY_PARAMS = {
136
+ "random_state": DEFAULT_RANDOM_STATE,
137
+ "deterministic": True,
138
+ "min_gain_to_split": 0.001,
139
+ "n_estimators": 275,
140
+ "max_depth": 5,
141
+ "max_cat_threshold": 80,
142
+ "min_data_per_group": 25,
143
+ "cat_l2": 10,
144
+ "cat_smooth": 12,
145
+ "learning_rate": 0.05,
146
+ "feature_fraction": 1.0,
147
+ "min_sum_hessian_in_leaf": 0.01,
148
+ "objective": "binary",
149
+ "class_weight": "balanced", # TODO pass dict with weights for each class
150
+ "verbosity": -1,
151
+ }
152
+
104
153
  N_FOLDS = 5
105
154
  BLOCKED_TS_TEST_SIZE = 0.2
106
155
 
@@ -220,6 +269,7 @@ SUPPORTED_CATBOOST_METRICS = {
220
269
  def is_catboost_estimator(estimator):
221
270
  try:
222
271
  from catboost import CatBoostClassifier, CatBoostRegressor
272
+
223
273
  return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
224
274
  except ImportError:
225
275
  return False
@@ -441,28 +491,27 @@ class EstimatorWrapper:
441
491
  }
442
492
  if estimator is None:
443
493
  params = {}
444
- params["has_time"] = has_date
445
494
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
446
495
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
447
496
  if target_type == ModelTaskType.MULTICLASS:
448
497
  # params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
449
498
  # params = _get_add_params(params, add_params)
450
499
  # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
451
- params = _get_add_params(params, LIGHTGBM_PARAMS)
500
+ params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
452
501
  params = _get_add_params(params, add_params)
453
502
  estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
454
503
  elif target_type == ModelTaskType.BINARY:
455
504
  # params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
456
505
  # params = _get_add_params(params, add_params)
457
506
  # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
458
- params = _get_add_params(params, LIGHTGBM_PARAMS)
507
+ params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
459
508
  params = _get_add_params(params, add_params)
460
509
  estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
461
510
  elif target_type == ModelTaskType.REGRESSION:
462
511
  # params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
463
512
  # params = _get_add_params(params, add_params)
464
513
  # estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
465
- params = _get_add_params(params, LIGHTGBM_PARAMS)
514
+ params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
466
515
  params = _get_add_params(params, add_params)
467
516
  estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
468
517
  else:
@@ -474,15 +523,14 @@ class EstimatorWrapper:
474
523
  estimator_copy = deepcopy(estimator)
475
524
  kwargs["estimator"] = estimator_copy
476
525
  if is_catboost_estimator(estimator):
526
+ params["has_time"] = has_date
477
527
  if cat_features is not None:
478
528
  for cat_feature in cat_features:
479
529
  if cat_feature not in x.columns:
480
530
  logger.error(
481
531
  f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
482
532
  )
483
- estimator_copy.set_params(
484
- cat_features=cat_features
485
- )
533
+ estimator_copy.set_params(cat_features=cat_features)
486
534
  estimator = CatBoostWrapper(**kwargs)
487
535
  else:
488
536
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -530,8 +578,9 @@ class CatBoostWrapper(EstimatorWrapper):
530
578
  x, y, groups, params = super()._prepare_to_fit(x, y)
531
579
 
532
580
  # Find embeddings
533
- from catboost import CatBoostClassifier
534
581
  import catboost
582
+ from catboost import CatBoostClassifier
583
+
535
584
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
536
585
  emb_pattern = r"(.+)_emb\d+"
537
586
  self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -655,6 +704,7 @@ class CatBoostWrapper(EstimatorWrapper):
655
704
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
656
705
  try:
657
706
  from catboost import Pool
707
+
658
708
  # Create Pool for fold data, if need (for example, when categorical features are present)
659
709
  fold_pool = Pool(
660
710
  x,
@@ -712,6 +762,14 @@ class LightGBMWrapper(EstimatorWrapper):
712
762
 
713
763
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
714
764
  x, y, groups, params = super()._prepare_to_fit(x, y)
765
+ if self.target_type == ModelTaskType.MULTICLASS:
766
+ params["num_class"] = y.nunique()
767
+ emb_pattern = r"(.+)_emb\d+"
768
+ emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
769
+ max_bin_by_feature_type = {
770
+ feature: 63 if feature in emb_features else 255 for feature in x.columns
771
+ }
772
+ params["max_bin_by_feature_type"] = max_bin_by_feature_type
715
773
  self.cat_features = _get_cat_features(x)
716
774
  x = fill_na_cat_features(x, self.cat_features)
717
775
  for feature in self.cat_features:
@@ -733,8 +791,8 @@ class LightGBMWrapper(EstimatorWrapper):
733
791
 
734
792
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
735
793
  try:
736
- import shap
737
794
  import lightgbm as lgb
795
+ import shap
738
796
 
739
797
  if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
740
798
  return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.68a3832.dev5
3
+ Version: 1.2.68a3832.dev7
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=7DKSsnFO8h8_6mNOcY1H-BxP7lm1gyUPvtuFwHwu1x8,33
1
+ upgini/__about__.py,sha256=CR4sN9ZhYNXzf0xJ61KtPk3O9k_pn-EljmzdfIfJyoM,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=1rb6BzyuiQFGVCTDmKL2wox3UFRNjtNaIJOwQnZ801A,34956
@@ -7,7 +7,7 @@ upgini/features_enricher.py,sha256=GXXx14jwf3F26_KrfJ6O40Vcu1hRx5iBjUB_jxy3Xvg,2
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
10
- upgini/metrics.py,sha256=0WIe1IQx9vzUK0pVGv3hODBrOL3zaLDybXbs5S_ntvQ,36991
10
+ upgini/metrics.py,sha256=onr-wFpP0idy0SH3Wxv2xnqxT5H5MiP70o44f1PhfFw,38808
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.68a3832.dev5.dist-info/METADATA,sha256=DYYPHgDqV3PqCiz7WUUXZs4xnkC3Zh89hX-q5NKsFzk,49149
74
- upgini-1.2.68a3832.dev5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.68a3832.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.68a3832.dev5.dist-info/RECORD,,
73
+ upgini-1.2.68a3832.dev7.dist-info/METADATA,sha256=LZ6mg6092FrqWu_yNWdMKTMgOC9lYclcPCQFLyMQAW0,49149
74
+ upgini-1.2.68a3832.dev7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.68a3832.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.68a3832.dev7.dist-info/RECORD,,