upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72a3659.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.71a3832.dev12"
1
+ __version__ = "1.2.72a3659.dev1"
upgini/autofe/vector.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import Dict, List, Optional
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -22,3 +22,25 @@ class Sum(PandasOperator, VectorizableMixin):
22
22
 
23
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
25
+
26
+
27
+ class OnnxModel(PandasOperator):
28
+ name: str = "onnx"
29
+ is_vector: bool = True
30
+ output_type: Optional[str] = "float"
31
+ model_name: str
32
+
33
+ def get_params(self) -> Dict[str, Optional[str]]:
34
+ res = super().get_params()
35
+ res.update(
36
+ {
37
+ "model_name": self.model_name,
38
+ }
39
+ )
40
+ return res
41
+
42
+ # def load_model(self):
43
+ # ...
44
+
45
+ # def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
46
+ # ...
@@ -3250,8 +3250,7 @@ if response.status_code == 200:
3250
3250
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
3251
3251
  if len(eval_pair) != 2:
3252
3252
  raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
3253
- eval_X = eval_pair[0]
3254
- eval_y = eval_pair[1]
3253
+ eval_X, eval_y = eval_pair
3255
3254
 
3256
3255
  if _num_samples(eval_X) == 0:
3257
3256
  raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3872,15 +3871,23 @@ if response.status_code == 200:
3872
3871
 
3873
3872
  original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
3874
3873
 
3875
- if updated_shaps is not None:
3876
- for fm in features_meta:
3877
- fm.shap_value = updated_shaps.get(fm.name, 0.0)
3878
-
3879
- features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3880
3874
  for feature_meta in features_meta:
3881
3875
  if feature_meta.name in original_names_dict.keys():
3882
3876
  feature_meta.name = original_names_dict[feature_meta.name]
3883
3877
 
3878
+ if updated_shaps is not None:
3879
+ updating_shap = updated_shaps.get(feature_meta.name)
3880
+ if updating_shap is None:
3881
+ self.logger.warning(
3882
+ f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
3883
+ )
3884
+ updating_shap = 0.0
3885
+ feature_meta.shap_value = updating_shap
3886
+
3887
+ features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3888
+
3889
+ for feature_meta in features_meta:
3890
+
3884
3891
  is_client_feature = feature_meta.name in df.columns
3885
3892
 
3886
3893
  # TODO make a decision about selected features based on special flag from mlb
@@ -3892,7 +3899,7 @@ if response.status_code == 200:
3892
3899
  # Use only important features
3893
3900
  if (
3894
3901
  # feature_meta.name in self.fit_generated_features or
3895
- feature_meta.name == COUNTRY
3902
+ feature_meta.name == COUNTRY # constant synthetic column
3896
3903
  # In select_features mode we select also from etalon features and need to show them
3897
3904
  or (not self.fit_select_features and is_client_feature)
3898
3905
  ):
upgini/metrics.py CHANGED
@@ -8,13 +8,14 @@ from copy import deepcopy
8
8
  from dataclasses import dataclass
9
9
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
10
 
11
+ import lightgbm as lgb
11
12
  import numpy as np
12
13
  import pandas as pd
13
14
  from lightgbm import LGBMClassifier, LGBMRegressor
14
- import lightgbm as lgb
15
15
  from numpy import log1p
16
16
  from pandas.api.types import is_numeric_dtype
17
17
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
18
+ from sklearn.preprocessing import OrdinalEncoder
18
19
 
19
20
  from upgini.utils.features_validator import FeaturesValidator
20
21
  from upgini.utils.sklearn_ext import cross_validate
@@ -125,7 +126,7 @@ LIGHTGBM_MULTICLASS_PARAMS = {
125
126
  "max_cat_threshold": 80,
126
127
  "min_data_per_group": 20,
127
128
  "cat_smooth": 18,
128
- "cat_l2" : 8,
129
+ "cat_l2": 8,
129
130
  "objective": "multiclass",
130
131
  "class_weight": "balanced",
131
132
  "use_quantized_grad": "true",
@@ -146,7 +147,7 @@ LIGHTGBM_BINARY_PARAMS = {
146
147
  "max_cat_threshold": 80,
147
148
  "min_data_per_group": 20,
148
149
  "cat_smooth": 18,
149
- "cat_l2" : 8,
150
+ "cat_l2": 8,
150
151
  "verbosity": -1,
151
152
  }
152
153
 
@@ -754,6 +755,7 @@ class LightGBMWrapper(EstimatorWrapper):
754
755
  logger=logger,
755
756
  )
756
757
  self.cat_features = None
758
+ self.cat_encoder = None
757
759
  self.n_classes = None
758
760
 
759
761
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -764,10 +766,13 @@ class LightGBMWrapper(EstimatorWrapper):
764
766
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
765
767
  self.cat_features = _get_cat_features(x)
766
768
  if self.cat_features:
767
- params["categorical_feature"] = self.cat_features
768
- x = fill_na_cat_features(x, self.cat_features)
769
- for feature in self.cat_features:
770
- x[feature] = x[feature].astype("category").cat.codes
769
+ x = fill_na_cat_features(x, self.cat_features)
770
+ encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
771
+ encoded = pd.DataFrame(
772
+ encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
773
+ )
774
+ x[self.cat_features] = encoded
775
+ self.cat_encoder = encoder
771
776
  if not is_numeric_dtype(y_numpy):
772
777
  y_numpy = correct_string_target(y_numpy)
773
778
 
@@ -777,8 +782,10 @@ class LightGBMWrapper(EstimatorWrapper):
777
782
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
778
783
  if self.cat_features is not None:
779
784
  x = fill_na_cat_features(x, self.cat_features)
780
- for feature in self.cat_features:
781
- x[feature] = x[feature].astype("category").cat.codes
785
+ if self.cat_encoder is not None:
786
+ x[self.cat_features] = pd.DataFrame(
787
+ self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
788
+ )
782
789
  if not is_numeric_dtype(y):
783
790
  y_numpy = correct_string_target(y_numpy)
784
791
  return x, y_numpy, params
@@ -204,7 +204,7 @@ def balance_undersample(
204
204
  def balance_undersample_forced(
205
205
  df: pd.DataFrame,
206
206
  target_column: str,
207
- id_columns: List[str],
207
+ id_columns: Optional[List[str]],
208
208
  date_column: str,
209
209
  task_type: ModelTaskType,
210
210
  cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
287
287
 
288
288
  def balance_undersample_time_series_trunc(
289
289
  df: pd.DataFrame,
290
- id_columns: List[str],
290
+ id_columns: Optional[List[str]],
291
291
  date_column: str,
292
292
  sample_size: int,
293
293
  random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
298
298
  **kwargs,
299
299
  ):
300
300
  # Convert date column to datetime
301
+ if id_columns is None:
302
+ id_columns = [date_column]
301
303
  dates_df = df[id_columns + [date_column]].copy()
302
304
  dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
303
305
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3832.dev12
3
+ Version: 1.2.72a3659.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=okx02f-XOrtKlpdtJnV-aHreGFvFkxZ5NQhd5zxvhMk,34
1
+ upgini/__about__.py,sha256=n3Di7UqdUYABUquK0tXIme5xiFjO7fpJ3AKGXnT-Jec,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=DgWboHEhr5BQT87MaAo2iUtrhapP3iqczLeZtWLRkDs,206664
6
+ upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
7
7
  upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=9AaQi7Yb22ZNnycUOAUpcP7TWF5Pfy_NGACcDj10aMs,38820
9
+ upgini/metrics.py,sha256=jobZL_Hg7guufDYH2XdanxgbyJTuC9ZAMZodeptE3I4,39177
10
10
  upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -21,7 +21,7 @@ upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
21
21
  upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
22
22
  upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
23
23
  upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
24
- upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
24
+ upgini/autofe/vector.py,sha256=-aLI4cA5HI2p42Skj4Sfb3XAPAFfbcu7FjukWsxVFdM,1161
25
25
  upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
26
26
  upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
27
27
  upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
67
  upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
68
68
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
69
- upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
69
+ upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.71a3832.dev12.dist-info/METADATA,sha256=8jmuNEDPwjc-Wa6Bds0FjYqYgqf3LFMYyRGUDy5DME8,49102
74
- upgini-1.2.71a3832.dev12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.71a3832.dev12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.71a3832.dev12.dist-info/RECORD,,
73
+ upgini-1.2.72a3659.dev1.dist-info/METADATA,sha256=tuv9DtWEtwHVjoIMPK4LKOvrmaQ3suMZS43JeEcEDiY,49101
74
+ upgini-1.2.72a3659.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.72a3659.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.72a3659.dev1.dist-info/RECORD,,