upgini 1.2.73a3659.dev2__py3-none-any.whl → 1.2.75__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.73a3659.dev2"
1
+ __version__ = "1.2.75"
upgini/autofe/date.py CHANGED
@@ -187,16 +187,21 @@ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
187
187
  class DateListDiffBounded(DateListDiff, ParametrizedOperator):
188
188
  lower_bound: Optional[int] = None
189
189
  upper_bound: Optional[int] = None
190
+ normalize: Optional[bool] = None
190
191
 
191
192
  def to_formula(self) -> str:
192
193
  lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
193
194
  upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
194
- return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}"
195
+ norm = "_norm" if self.normalize else ""
196
+ return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}{norm}"
195
197
 
196
198
  @classmethod
197
199
  def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
198
200
  import re
199
201
 
202
+ normalize = formula.endswith("_norm")
203
+ formula = formula.replace("_norm", "")
204
+
200
205
  pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
201
206
  match = re.match(pattern, formula)
202
207
 
@@ -207,8 +212,13 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
207
212
  lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
208
213
  upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
209
214
  aggregation = match.group(6)
210
-
211
- return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
215
+ return cls(
216
+ diff_unit=diff_unit,
217
+ lower_bound=lower_bound,
218
+ upper_bound=upper_bound,
219
+ aggregation=aggregation,
220
+ normalize=normalize,
221
+ )
212
222
 
213
223
  def get_params(self) -> Dict[str, Optional[str]]:
214
224
  res = super().get_params()
@@ -216,14 +226,20 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
216
226
  res["lower_bound"] = str(self.lower_bound)
217
227
  if self.upper_bound is not None:
218
228
  res["upper_bound"] = str(self.upper_bound)
229
+ if self.normalize is not None:
230
+ res["normalize"] = str(self.normalize)
219
231
  return res
220
232
 
221
233
  def _agg(self, x):
234
+ orig_len = len(x)
222
235
  x = x[
223
236
  (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
224
237
  & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
225
238
  ]
226
- return super()._agg(x)
239
+ agg_res = super()._agg(x)
240
+ if self.normalize and orig_len > 0:
241
+ return agg_res / orig_len
242
+ return agg_res
227
243
 
228
244
 
229
245
  class DatePercentileBase(PandasOperator, abc.ABC):
upgini/autofe/feature.py CHANGED
@@ -154,24 +154,34 @@ class Feature:
154
154
  for child in self.children:
155
155
  child.delete_data()
156
156
 
157
- def get_op_display_name(self) -> str:
158
- return (self.op.alias or self.op.to_formula()).lower()
157
+ def get_op_display_name(self, use_alias: bool = True) -> str:
158
+ return (self.op.alias or self.op.to_formula()).lower() if use_alias else self.op.to_formula()
159
159
 
160
- def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
160
+ def get_display_name(self, cache: bool = True, shorten: bool = False, use_op_alias: bool = True, **kwargs) -> str:
161
161
  if self.cached_display_name is not None and cache:
162
162
  return self.cached_display_name
163
163
 
164
164
  should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
165
- prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
165
+ components = []
166
166
 
167
167
  if self.alias:
168
- components = ["f_autofe", self.alias]
169
- elif shorten and (not self.op.is_unary or should_stack_op):
170
- components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
171
- else:
172
- components = (
173
- ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
168
+ components.extend(["f_autofe", self.alias])
169
+ elif should_stack_op:
170
+ components.extend(
171
+ [
172
+ self.children[0].get_display_name(
173
+ cache=cache, shorten=shorten, use_op_alias=use_op_alias, **kwargs
174
+ ),
175
+ self.get_op_display_name(use_alias=use_op_alias),
176
+ ]
174
177
  )
178
+ elif shorten and not self.op.is_unary:
179
+ components.extend(["f_autofe", self.get_op_display_name(use_alias=use_op_alias)])
180
+ else:
181
+ components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
182
+ "autofe",
183
+ self.get_op_display_name(use_alias=use_op_alias),
184
+ ]
175
185
  components.extend([str(self.display_index)] if self.display_index is not None else [])
176
186
  display_name = "_".join(components)
177
187
 
upgini/autofe/unary.py CHANGED
@@ -1,8 +1,10 @@
1
- from typing import Dict, Optional
1
+ import json
2
+ from typing import Dict, List, Optional
2
3
  import numpy as np
3
4
  import pandas as pd
4
5
 
5
6
  from upgini.autofe.operator import PandasOperator, VectorizableMixin
7
+ from upgini.autofe.utils import pydantic_validator
6
8
 
7
9
 
8
10
  class Abs(PandasOperator, VectorizableMixin):
@@ -153,3 +155,38 @@ class Embeddings(PandasOperator):
153
155
  is_unary: bool = True
154
156
  input_type: Optional[str] = "string"
155
157
  output_type: Optional[str] = "vector"
158
+
159
+
160
+ class Bin(PandasOperator):
161
+ name: str = "bin"
162
+ is_unary: bool = True
163
+ output_type: Optional[str] = "category"
164
+ bin_bounds: List[int] = []
165
+ is_categorical: bool = True
166
+
167
+ def calculate_unary(self, data: pd.Series) -> pd.Series:
168
+ return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype("category")
169
+
170
+ def _bin(self, f, bounds):
171
+ if f is None or np.isnan(f):
172
+ return np.nan
173
+ hit = np.where(f >= np.array(bounds))[0]
174
+ if hit.size > 0:
175
+ return np.max(hit) + 1
176
+ else:
177
+ return np.nan
178
+
179
+ def get_params(self) -> Dict[str, Optional[str]]:
180
+ res = super().get_params()
181
+ res.update(
182
+ {
183
+ "bin_bounds": json.dumps(self.bin_bounds),
184
+ }
185
+ )
186
+ return res
187
+
188
+ @pydantic_validator("bin_bounds", mode="before")
189
+ def parse_bin_bounds(cls, value):
190
+ if isinstance(value, str):
191
+ return json.loads(value)
192
+ return value
upgini/autofe/vector.py CHANGED
@@ -1,8 +1,8 @@
1
- from typing import Dict, List, Optional
1
+ from typing import List, Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operator import OperatorRegistry, PandasOperator, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperator, VectorizableMixin
6
6
 
7
7
 
8
8
  class Mean(PandasOperator, VectorizableMixin):
@@ -24,23 +24,10 @@ class Sum(PandasOperator, VectorizableMixin):
24
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
25
25
 
26
26
 
27
- class OnnxModel(PandasOperator, metaclass=OperatorRegistry):
28
- name: str = "onnx"
27
+ class Vectorize(PandasOperator, VectorizableMixin):
28
+ name: str = "vectorize"
29
29
  is_vector: bool = True
30
- output_type: Optional[str] = "float"
31
- model_name: str = ""
32
-
33
- def get_params(self) -> Dict[str, Optional[str]]:
34
- res = super().get_params()
35
- res.update(
36
- {
37
- "model_name": self.model_name,
38
- }
39
- )
40
- return res
41
-
42
- # def load_model(self):
43
- # ...
44
-
45
- # def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
46
- # ...
30
+ group_index: int = 0
31
+
32
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
33
+ return pd.DataFrame(data).T.apply(lambda x: x.to_list(), axis=1)
upgini/metrics.py CHANGED
@@ -326,7 +326,7 @@ class EstimatorWrapper:
326
326
  for c in x.columns:
327
327
  if is_numeric_dtype(x[c]):
328
328
  x[c] = x[c].astype(float)
329
- else:
329
+ elif not x[c].dtype == "category":
330
330
  x[c] = x[c].astype(str)
331
331
 
332
332
  if not isinstance(y, pd.Series):
@@ -481,7 +481,7 @@ class EstimatorWrapper:
481
481
  "logger": logger,
482
482
  }
483
483
  if estimator is None:
484
- params = {}
484
+ params = {"random_state": DEFAULT_RANDOM_STATE}
485
485
  if target_type == ModelTaskType.MULTICLASS:
486
486
  params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
487
487
  params = _get_add_params(params, add_params)
@@ -749,6 +749,8 @@ class LightGBMWrapper(EstimatorWrapper):
749
749
  if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
750
750
  self.n_classes = len(np.unique(y_numpy))
751
751
  if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
752
+ if self.target_type == ModelTaskType.BINARY:
753
+ params["eval_metric"] = "auc"
752
754
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
753
755
  self.cat_features = _get_cat_features(x)
754
756
  if self.cat_features:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.73a3659.dev2
3
+ Version: 1.2.75
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=Vn3aojC64D6rn5ZFKIFRFVE3tY8D8CLC3Y0V5pbn2Jo,33
1
+ upgini/__about__.py,sha256=xPczHfrMrTuUNz8xC9lgCjhkHVDmW9TFPuLq9_c_Ms8,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
@@ -6,7 +6,7 @@ upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
6
  upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
7
7
  upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=a0bY4oTMb-MgB1yC1IuTcEtotKZxAxjgV_QV2Z4V8u4,38988
9
+ upgini/metrics.py,sha256=pFRKBKyAri7xfe5pkNxcx241HQH95rV9afebgg8Tdiw,39156
10
10
  upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -15,13 +15,13 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
15
15
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
17
17
  upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
18
- upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
19
- upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
18
+ upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
19
+ upgini/autofe/feature.py,sha256=G_YgnsauIoaMgByx9JXDPiKc4nqs0pwWZUfvoIGMKxY,15305
20
20
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
21
21
  upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
22
- upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
22
+ upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
23
23
  upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
24
- upgini/autofe/vector.py,sha256=w7ipoFRvR0BcTYcvJR9EbKc_ycIn9cJ94RLgrgIi4Uc,1212
24
+ upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
25
25
  upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
26
26
  upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
27
27
  upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.73a3659.dev2.dist-info/METADATA,sha256=WImhNzA5wn2I_HyEYKvKAcUfpIWbQ0spUAI7tgu-fiQ,49101
74
- upgini-1.2.73a3659.dev2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.73a3659.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.73a3659.dev2.dist-info/RECORD,,
73
+ upgini-1.2.75.dist-info/METADATA,sha256=jUP3dTEC71e0OcENot-gdjVx1gxqUPVPWufkY-vRv60,49091
74
+ upgini-1.2.75.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.75.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.75.dist-info/RECORD,,