upgini 1.2.73a3659.dev2__py3-none-any.whl → 1.2.75__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +20 -4
- upgini/autofe/feature.py +20 -10
- upgini/autofe/unary.py +38 -1
- upgini/autofe/vector.py +8 -21
- upgini/metrics.py +4 -2
- {upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/METADATA +1 -1
- {upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/RECORD +10 -10
- {upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/WHEEL +0 -0
- {upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.73a3659.dev2"
|
1
|
+
__version__ = "1.2.75"
|
upgini/autofe/date.py
CHANGED
@@ -187,16 +187,21 @@ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
|
|
187
187
|
class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
188
188
|
lower_bound: Optional[int] = None
|
189
189
|
upper_bound: Optional[int] = None
|
190
|
+
normalize: Optional[bool] = None
|
190
191
|
|
191
192
|
def to_formula(self) -> str:
|
192
193
|
lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
|
193
194
|
upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
|
194
|
-
|
195
|
+
norm = "_norm" if self.normalize else ""
|
196
|
+
return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}{norm}"
|
195
197
|
|
196
198
|
@classmethod
|
197
199
|
def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
|
198
200
|
import re
|
199
201
|
|
202
|
+
normalize = formula.endswith("_norm")
|
203
|
+
formula = formula.replace("_norm", "")
|
204
|
+
|
200
205
|
pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
|
201
206
|
match = re.match(pattern, formula)
|
202
207
|
|
@@ -207,8 +212,13 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
207
212
|
lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
|
208
213
|
upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
|
209
214
|
aggregation = match.group(6)
|
210
|
-
|
211
|
-
|
215
|
+
return cls(
|
216
|
+
diff_unit=diff_unit,
|
217
|
+
lower_bound=lower_bound,
|
218
|
+
upper_bound=upper_bound,
|
219
|
+
aggregation=aggregation,
|
220
|
+
normalize=normalize,
|
221
|
+
)
|
212
222
|
|
213
223
|
def get_params(self) -> Dict[str, Optional[str]]:
|
214
224
|
res = super().get_params()
|
@@ -216,14 +226,20 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
216
226
|
res["lower_bound"] = str(self.lower_bound)
|
217
227
|
if self.upper_bound is not None:
|
218
228
|
res["upper_bound"] = str(self.upper_bound)
|
229
|
+
if self.normalize is not None:
|
230
|
+
res["normalize"] = str(self.normalize)
|
219
231
|
return res
|
220
232
|
|
221
233
|
def _agg(self, x):
|
234
|
+
orig_len = len(x)
|
222
235
|
x = x[
|
223
236
|
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
224
237
|
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
225
238
|
]
|
226
|
-
|
239
|
+
agg_res = super()._agg(x)
|
240
|
+
if self.normalize and orig_len > 0:
|
241
|
+
return agg_res / orig_len
|
242
|
+
return agg_res
|
227
243
|
|
228
244
|
|
229
245
|
class DatePercentileBase(PandasOperator, abc.ABC):
|
upgini/autofe/feature.py
CHANGED
@@ -154,24 +154,34 @@ class Feature:
|
|
154
154
|
for child in self.children:
|
155
155
|
child.delete_data()
|
156
156
|
|
157
|
-
def get_op_display_name(self) -> str:
|
158
|
-
return (self.op.alias or self.op.to_formula()).lower()
|
157
|
+
def get_op_display_name(self, use_alias: bool = True) -> str:
|
158
|
+
return (self.op.alias or self.op.to_formula()).lower() if use_alias else self.op.to_formula()
|
159
159
|
|
160
|
-
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
160
|
+
def get_display_name(self, cache: bool = True, shorten: bool = False, use_op_alias: bool = True, **kwargs) -> str:
|
161
161
|
if self.cached_display_name is not None and cache:
|
162
162
|
return self.cached_display_name
|
163
163
|
|
164
164
|
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
165
|
-
|
165
|
+
components = []
|
166
166
|
|
167
167
|
if self.alias:
|
168
|
-
components
|
169
|
-
elif
|
170
|
-
components
|
171
|
-
|
172
|
-
|
173
|
-
|
168
|
+
components.extend(["f_autofe", self.alias])
|
169
|
+
elif should_stack_op:
|
170
|
+
components.extend(
|
171
|
+
[
|
172
|
+
self.children[0].get_display_name(
|
173
|
+
cache=cache, shorten=shorten, use_op_alias=use_op_alias, **kwargs
|
174
|
+
),
|
175
|
+
self.get_op_display_name(use_alias=use_op_alias),
|
176
|
+
]
|
174
177
|
)
|
178
|
+
elif shorten and not self.op.is_unary:
|
179
|
+
components.extend(["f_autofe", self.get_op_display_name(use_alias=use_op_alias)])
|
180
|
+
else:
|
181
|
+
components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
|
182
|
+
"autofe",
|
183
|
+
self.get_op_display_name(use_alias=use_op_alias),
|
184
|
+
]
|
175
185
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
176
186
|
display_name = "_".join(components)
|
177
187
|
|
upgini/autofe/unary.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
-
|
1
|
+
import json
|
2
|
+
from typing import Dict, List, Optional
|
2
3
|
import numpy as np
|
3
4
|
import pandas as pd
|
4
5
|
|
5
6
|
from upgini.autofe.operator import PandasOperator, VectorizableMixin
|
7
|
+
from upgini.autofe.utils import pydantic_validator
|
6
8
|
|
7
9
|
|
8
10
|
class Abs(PandasOperator, VectorizableMixin):
|
@@ -153,3 +155,38 @@ class Embeddings(PandasOperator):
|
|
153
155
|
is_unary: bool = True
|
154
156
|
input_type: Optional[str] = "string"
|
155
157
|
output_type: Optional[str] = "vector"
|
158
|
+
|
159
|
+
|
160
|
+
class Bin(PandasOperator):
|
161
|
+
name: str = "bin"
|
162
|
+
is_unary: bool = True
|
163
|
+
output_type: Optional[str] = "category"
|
164
|
+
bin_bounds: List[int] = []
|
165
|
+
is_categorical: bool = True
|
166
|
+
|
167
|
+
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
168
|
+
return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype("category")
|
169
|
+
|
170
|
+
def _bin(self, f, bounds):
|
171
|
+
if f is None or np.isnan(f):
|
172
|
+
return np.nan
|
173
|
+
hit = np.where(f >= np.array(bounds))[0]
|
174
|
+
if hit.size > 0:
|
175
|
+
return np.max(hit) + 1
|
176
|
+
else:
|
177
|
+
return np.nan
|
178
|
+
|
179
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
180
|
+
res = super().get_params()
|
181
|
+
res.update(
|
182
|
+
{
|
183
|
+
"bin_bounds": json.dumps(self.bin_bounds),
|
184
|
+
}
|
185
|
+
)
|
186
|
+
return res
|
187
|
+
|
188
|
+
@pydantic_validator("bin_bounds", mode="before")
|
189
|
+
def parse_bin_bounds(cls, value):
|
190
|
+
if isinstance(value, str):
|
191
|
+
return json.loads(value)
|
192
|
+
return value
|
upgini/autofe/vector.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
from typing import
|
1
|
+
from typing import List, Optional
|
2
2
|
|
3
3
|
import pandas as pd
|
4
4
|
|
5
|
-
from upgini.autofe.operator import
|
5
|
+
from upgini.autofe.operator import PandasOperator, VectorizableMixin
|
6
6
|
|
7
7
|
|
8
8
|
class Mean(PandasOperator, VectorizableMixin):
|
@@ -24,23 +24,10 @@ class Sum(PandasOperator, VectorizableMixin):
|
|
24
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
25
25
|
|
26
26
|
|
27
|
-
class
|
28
|
-
name: str = "
|
27
|
+
class Vectorize(PandasOperator, VectorizableMixin):
|
28
|
+
name: str = "vectorize"
|
29
29
|
is_vector: bool = True
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
res = super().get_params()
|
35
|
-
res.update(
|
36
|
-
{
|
37
|
-
"model_name": self.model_name,
|
38
|
-
}
|
39
|
-
)
|
40
|
-
return res
|
41
|
-
|
42
|
-
# def load_model(self):
|
43
|
-
# ...
|
44
|
-
|
45
|
-
# def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
46
|
-
# ...
|
30
|
+
group_index: int = 0
|
31
|
+
|
32
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
33
|
+
return pd.DataFrame(data).T.apply(lambda x: x.to_list(), axis=1)
|
upgini/metrics.py
CHANGED
@@ -326,7 +326,7 @@ class EstimatorWrapper:
|
|
326
326
|
for c in x.columns:
|
327
327
|
if is_numeric_dtype(x[c]):
|
328
328
|
x[c] = x[c].astype(float)
|
329
|
-
|
329
|
+
elif not x[c].dtype == "category":
|
330
330
|
x[c] = x[c].astype(str)
|
331
331
|
|
332
332
|
if not isinstance(y, pd.Series):
|
@@ -481,7 +481,7 @@ class EstimatorWrapper:
|
|
481
481
|
"logger": logger,
|
482
482
|
}
|
483
483
|
if estimator is None:
|
484
|
-
params = {}
|
484
|
+
params = {"random_state": DEFAULT_RANDOM_STATE}
|
485
485
|
if target_type == ModelTaskType.MULTICLASS:
|
486
486
|
params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
|
487
487
|
params = _get_add_params(params, add_params)
|
@@ -749,6 +749,8 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
749
749
|
if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
|
750
750
|
self.n_classes = len(np.unique(y_numpy))
|
751
751
|
if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
|
752
|
+
if self.target_type == ModelTaskType.BINARY:
|
753
|
+
params["eval_metric"] = "auc"
|
752
754
|
params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
|
753
755
|
self.cat_features = _get_cat_features(x)
|
754
756
|
if self.cat_features:
|
@@ -1,4 +1,4 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=xPczHfrMrTuUNz8xC9lgCjhkHVDmW9TFPuLq9_c_Ms8,23
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
@@ -6,7 +6,7 @@ upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
6
|
upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
|
7
7
|
upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=pFRKBKyAri7xfe5pkNxcx241HQH95rV9afebgg8Tdiw,39156
|
10
10
|
upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -15,13 +15,13 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
|
|
15
15
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
16
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
17
17
|
upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
|
18
|
-
upgini/autofe/date.py,sha256=
|
19
|
-
upgini/autofe/feature.py,sha256=
|
18
|
+
upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
|
19
|
+
upgini/autofe/feature.py,sha256=G_YgnsauIoaMgByx9JXDPiKc4nqs0pwWZUfvoIGMKxY,15305
|
20
20
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
21
21
|
upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
|
22
|
-
upgini/autofe/unary.py,sha256=
|
22
|
+
upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
|
23
23
|
upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
|
24
|
-
upgini/autofe/vector.py,sha256=
|
24
|
+
upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
|
25
25
|
upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
|
26
26
|
upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
|
27
27
|
upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
73
|
+
upgini-1.2.75.dist-info/METADATA,sha256=jUP3dTEC71e0OcENot-gdjVx1gxqUPVPWufkY-vRv60,49091
|
74
|
+
upgini-1.2.75.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.75.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.75.dist-info/RECORD,,
|
File without changes
|
File without changes
|