upgini 1.2.71a3832.dev13__py3-none-any.whl → 1.2.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.71a3832.dev13"
1
+ __version__ = "1.2.72"
upgini/features_enricher.py CHANGED
@@ -3250,8 +3250,7 @@ if response.status_code == 200:
3250
3250
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
3251
3251
  if len(eval_pair) != 2:
3252
3252
  raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
3253
- eval_X = eval_pair[0]
3254
- eval_y = eval_pair[1]
3253
+ eval_X, eval_y = eval_pair
3255
3254
 
3256
3255
  if _num_samples(eval_X) == 0:
3257
3256
  raise ValidationError(self.bundle.get("eval_x_is_empty"))
upgini/metrics.py CHANGED
@@ -8,18 +8,18 @@ from copy import deepcopy
8
8
  from dataclasses import dataclass
9
9
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
10
 
11
+ import lightgbm as lgb
11
12
  import numpy as np
12
13
  import pandas as pd
13
14
  from lightgbm import LGBMClassifier, LGBMRegressor
14
- import lightgbm as lgb
15
15
  from numpy import log1p
16
16
  from pandas.api.types import is_numeric_dtype
17
- # from sklearn.calibration import LabelEncoder
18
17
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
19
18
  from sklearn.preprocessing import OrdinalEncoder
20
19
 
21
20
  from upgini.utils.features_validator import FeaturesValidator
22
21
  from upgini.utils.sklearn_ext import cross_validate
22
+ from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
23
23
 
24
24
  try:
25
25
  from sklearn.metrics import get_scorer_names
@@ -31,7 +31,7 @@ except ImportError:
31
31
  available_scorers = SCORERS
32
32
  from sklearn.metrics import mean_squared_error
33
33
  from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
34
- from sklearn.model_selection import BaseCrossValidator
34
+ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
35
35
 
36
36
  from upgini.errors import ValidationError
37
37
  from upgini.metadata import ModelTaskType
@@ -85,22 +85,6 @@ CATBOOST_MULTICLASS_PARAMS = {
85
85
  "auto_class_weights": "Balanced",
86
86
  }
87
87
 
88
- LIGHTGBM_PARAMS = {
89
- "random_state": DEFAULT_RANDOM_STATE,
90
- # "num_leaves": 16,
91
- # "n_estimators": 150,
92
- # "min_child_weight": 1,
93
- "max_depth": 4,
94
- "max_cat_threshold": 80,
95
- "min_data_per_group": 25,
96
- "num_boost_round": 150,
97
- "cat_l2": 10,
98
- "cat_smooth": 12,
99
- "learning_rate": 0.05,
100
- "feature_fraction": 1.0,
101
- "min_sum_hessian_in_leaf": 0.01,
102
- }
103
-
104
88
  LIGHTGBM_REGRESSION_PARAMS = {
105
89
  "random_state": DEFAULT_RANDOM_STATE,
106
90
  "deterministic": True,
@@ -127,9 +111,9 @@ LIGHTGBM_MULTICLASS_PARAMS = {
127
111
  "max_cat_threshold": 80,
128
112
  "min_data_per_group": 20,
129
113
  "cat_smooth": 18,
130
- "cat_l2" : 8,
114
+ "cat_l2": 8,
131
115
  "objective": "multiclass",
132
- "class_weight": "balanced",
116
+ # "class_weight": "balanced",
133
117
  "use_quantized_grad": "true",
134
118
  "num_grad_quant_bins": "8",
135
119
  "stochastic_rounding": "true",
@@ -143,12 +127,12 @@ LIGHTGBM_BINARY_PARAMS = {
143
127
  "max_depth": 5,
144
128
  "learning_rate": 0.05,
145
129
  "objective": "binary",
146
- "class_weight": "balanced",
130
+ # "class_weight": "balanced",
147
131
  "deterministic": True,
148
132
  "max_cat_threshold": 80,
149
133
  "min_data_per_group": 20,
150
134
  "cat_smooth": 18,
151
- "cat_l2" : 8,
135
+ "cat_l2": 8,
152
136
  "verbosity": -1,
153
137
  }
154
138
 
@@ -507,7 +491,8 @@ class EstimatorWrapper:
507
491
  params = _get_add_params(params, add_params)
508
492
  estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
509
493
  elif target_type == ModelTaskType.REGRESSION:
510
- params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
494
+ if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
495
+ params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
511
496
  params = _get_add_params(params, add_params)
512
497
  estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
513
498
  else:
@@ -756,7 +741,6 @@ class LightGBMWrapper(EstimatorWrapper):
756
741
  logger=logger,
757
742
  )
758
743
  self.cat_features = None
759
- # self.cat_features_encoders = dict()
760
744
  self.cat_encoder = None
761
745
  self.n_classes = None
762
746
 
@@ -768,23 +752,13 @@ class LightGBMWrapper(EstimatorWrapper):
768
752
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
769
753
  self.cat_features = _get_cat_features(x)
770
754
  if self.cat_features:
771
- params["categorical_feature"] = self.cat_features
772
- # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features] Works
773
- # params["categorical_feature"] = "notauto"
774
- # params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
775
- # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features] Doesn't work
776
- # params["categorical_feature"] = ",".join(cat_indices)
777
- pass
778
755
  x = fill_na_cat_features(x, self.cat_features)
779
756
  encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
780
- encoded = encoder.fit_transform(x[self.cat_features], y_numpy)
757
+ encoded = pd.DataFrame(
758
+ encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
759
+ )
781
760
  x[self.cat_features] = encoded
782
761
  self.cat_encoder = encoder
783
- # for feature in self.cat_features:
784
- # encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
785
- # x[feature] = encoder.fit_transform(x[feature])
786
- # self.cat_features_encoders[feature] = encoder
787
- # x[feature] = x[feature].astype("category").cat.codes
788
762
  if not is_numeric_dtype(y_numpy):
789
763
  y_numpy = correct_string_target(y_numpy)
790
764
 
@@ -793,19 +767,11 @@ class LightGBMWrapper(EstimatorWrapper):
793
767
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
794
768
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
795
769
  if self.cat_features is not None:
796
- params["categorical_feature"] = self.cat_features
797
- # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features]
798
- # params["categorical_feature"] = "notauto"
799
- # params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
800
- # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features]
801
- # params["categorical_feature"] = ",".join(cat_indices)
802
770
  x = fill_na_cat_features(x, self.cat_features)
803
771
  if self.cat_encoder is not None:
804
- x[self.cat_features] = self.cat_encoder.transform(x[self.cat_features])
805
- # for feature in self.cat_features:
806
- # encoder = self.cat_features_encoders[feature]
807
- # x[feature] = encoder.transform(x[feature])
808
- # x[feature] = x[feature].astype("category").cat.codes
772
+ x[self.cat_features] = pd.DataFrame(
773
+ self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
774
+ )
809
775
  if not is_numeric_dtype(y):
810
776
  y_numpy = correct_string_target(y_numpy)
811
777
  return x, y_numpy, params
upgini/utils/target_utils.py CHANGED
@@ -297,9 +297,9 @@ def balance_undersample_time_series_trunc(
297
297
  time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
298
298
  **kwargs,
299
299
  ):
300
- # Convert date column to datetime
301
300
  if id_columns is None:
302
- id_columns = [date_column]
301
+ id_columns = []
302
+ # Convert date column to datetime
303
303
  dates_df = df[id_columns + [date_column]].copy()
304
304
  dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
305
305
 
{upgini-1.2.71a3832.dev13.dist-info → upgini-1.2.72.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3832.dev13
3
+ Version: 1.2.72
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
{upgini-1.2.71a3832.dev13.dist-info → upgini-1.2.72.dist-info}/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=buorll9F2OX4EgV8VmlIrj09nqmsSmqAG8T8p6hRCls,34
1
+ upgini/__about__.py,sha256=-EK4ypqJTIRrg6g1P6PtLXT9vC4Vq7zblqFi389VgwA,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=lk80Bx9U36lva6T4lPHBFk88ivrpZ-2uwwMwQg0LglE,207023
6
+ upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
7
7
  upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=ot6AhxfRRTzM-dNApWTvmteLBAmGjD9OyAuKmtUTprE,40630
9
+ upgini/metrics.py,sha256=a0bY4oTMb-MgB1yC1IuTcEtotKZxAxjgV_QV2Z4V8u4,38988
10
10
  upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
67
  upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
68
68
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
69
- upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
69
+ upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,16650
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.71a3832.dev13.dist-info/METADATA,sha256=JdRugxJAMW4KLyRuz7yIX_PqSz_nObynmhkW5-g_lVs,49102
74
- upgini-1.2.71a3832.dev13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.71a3832.dev13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.71a3832.dev13.dist-info/RECORD,,
73
+ upgini-1.2.72.dist-info/METADATA,sha256=OpaT2gblO8qGzEJBNf36-dPwbedHPP93bX0fPAOMl38,49091
74
+ upgini-1.2.72.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.72.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.72.dist-info/RECORD,,