upgini 1.2.33__py3-none-any.whl → 1.2.34a3657.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +7 -84
- upgini/autofe/date.py +39 -23
- upgini/autofe/feature.py +7 -7
- upgini/autofe/groupby.py +20 -10
- upgini/autofe/operand.py +57 -2
- upgini/autofe/vector.py +120 -2
- {upgini-1.2.33.dist-info → upgini-1.2.34a3657.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.33.dist-info → upgini-1.2.34a3657.dev1.dist-info}/RECORD +11 -11
- {upgini-1.2.33.dist-info → upgini-1.2.34a3657.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.33.dist-info → upgini-1.2.34a3657.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.33"
|
|
1
|
+
__version__ = "1.2.34a3657.dev1"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,87 +1,10 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
-
from upgini.autofe.
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
CombineThenFreq,
|
|
8
|
-
Distance,
|
|
9
|
-
Divide,
|
|
10
|
-
JaroWinklerSim1,
|
|
11
|
-
JaroWinklerSim2,
|
|
12
|
-
LevenshteinSim,
|
|
13
|
-
Max,
|
|
14
|
-
Min,
|
|
15
|
-
Multiply,
|
|
16
|
-
Sim,
|
|
17
|
-
Subtract,
|
|
18
|
-
)
|
|
19
|
-
from upgini.autofe.date import (
|
|
20
|
-
DateDiff,
|
|
21
|
-
DateDiffType2,
|
|
22
|
-
DateListDiff,
|
|
23
|
-
DateListDiffBounded,
|
|
24
|
-
DatePercentile,
|
|
25
|
-
DatePercentileMethod2,
|
|
26
|
-
)
|
|
27
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
28
|
-
from upgini.autofe.operand import Operand
|
|
29
|
-
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
30
|
-
from upgini.autofe.vector import Mean, Sum
|
|
31
|
-
|
|
32
|
-
ALL_OPERANDS: Dict[str, Operand] = {
|
|
33
|
-
op.name: op
|
|
34
|
-
for op in [
|
|
35
|
-
Freq(),
|
|
36
|
-
Mean(),
|
|
37
|
-
Sum(),
|
|
38
|
-
Abs(),
|
|
39
|
-
Log(),
|
|
40
|
-
Sqrt(),
|
|
41
|
-
Square(),
|
|
42
|
-
Sigmoid(),
|
|
43
|
-
Floor(),
|
|
44
|
-
Residual(),
|
|
45
|
-
Min(),
|
|
46
|
-
Max(),
|
|
47
|
-
Add(),
|
|
48
|
-
Subtract(),
|
|
49
|
-
Multiply(),
|
|
50
|
-
Divide(),
|
|
51
|
-
GroupByThenAgg(name="GroupByThenMin", agg="min"),
|
|
52
|
-
GroupByThenAgg(name="GroupByThenMax", agg="max"),
|
|
53
|
-
GroupByThenAgg(name="GroupByThenMean", agg="mean"),
|
|
54
|
-
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
55
|
-
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
56
|
-
GroupByThenRank(),
|
|
57
|
-
Combine(),
|
|
58
|
-
CombineThenFreq(),
|
|
59
|
-
GroupByThenNUnique(),
|
|
60
|
-
GroupByThenFreq(),
|
|
61
|
-
Sim(),
|
|
62
|
-
DateDiff(),
|
|
63
|
-
DateDiffType2(),
|
|
64
|
-
DateListDiff(aggregation="min"),
|
|
65
|
-
DateListDiff(aggregation="max"),
|
|
66
|
-
DateListDiff(aggregation="mean"),
|
|
67
|
-
DateListDiff(aggregation="nunique"),
|
|
68
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
|
|
69
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
|
|
70
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
|
|
71
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
72
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
73
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
74
|
-
DatePercentile(),
|
|
75
|
-
DatePercentileMethod2(),
|
|
76
|
-
Norm(),
|
|
77
|
-
JaroWinklerSim1(),
|
|
78
|
-
JaroWinklerSim2(),
|
|
79
|
-
LevenshteinSim(),
|
|
80
|
-
Distance(),
|
|
81
|
-
Embeddings(),
|
|
82
|
-
]
|
|
83
|
-
}
|
|
1
|
+
from upgini.autofe.operand import OperandRegistry
|
|
2
|
+
from upgini.autofe.unary import * # noqa
|
|
3
|
+
from upgini.autofe.binary import * # noqa
|
|
4
|
+
from upgini.autofe.groupby import * # noqa
|
|
5
|
+
from upgini.autofe.date import * # noqa
|
|
6
|
+
from upgini.autofe.vector import * # noqa
|
|
84
7
|
|
|
85
8
|
|
|
86
9
|
def find_op(name):
|
|
87
|
-
return
|
|
10
|
+
return OperandRegistry.get_operand(name)
|
upgini/autofe/date.py
CHANGED
|
@@ -7,11 +7,11 @@ import pandas as pd
|
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
|
-
from upgini.autofe.operand import PandasOperand
|
|
10
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def get_pydantic_version():
|
|
14
|
-
major_version = int(pydantic_version.split(
|
|
14
|
+
major_version = int(pydantic_version.split(".")[0])
|
|
15
15
|
return major_version
|
|
16
16
|
|
|
17
17
|
|
|
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
109
109
|
_count_aggregations = ["nunique", "count"]
|
|
110
110
|
|
|
111
111
|
|
|
112
|
-
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
112
|
+
class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
113
113
|
is_binary: bool = True
|
|
114
114
|
has_symmetry_importance: bool = True
|
|
115
115
|
|
|
@@ -129,10 +129,17 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
129
129
|
)
|
|
130
130
|
return res
|
|
131
131
|
|
|
132
|
-
def
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
132
|
+
def to_formula(self) -> str:
|
|
133
|
+
return f"date_diff_{self.aggregation}"
|
|
134
|
+
|
|
135
|
+
@classmethod
|
|
136
|
+
def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
|
|
137
|
+
if not formula.startswith("date_diff_"):
|
|
138
|
+
return None
|
|
139
|
+
aggregation = formula.replace("date_diff_", "")
|
|
140
|
+
if "_" in aggregation:
|
|
141
|
+
return None
|
|
142
|
+
return cls(aggregation=aggregation)
|
|
136
143
|
|
|
137
144
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
138
145
|
left = self._convert_to_date(left, self.left_unit)
|
|
@@ -170,23 +177,31 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
170
177
|
return method(x) if len(x) > 0 else default
|
|
171
178
|
|
|
172
179
|
|
|
173
|
-
class DateListDiffBounded(DateListDiff):
|
|
180
|
+
class DateListDiffBounded(DateListDiff, ParametrizedOperand):
|
|
174
181
|
lower_bound: Optional[int] = None
|
|
175
182
|
upper_bound: Optional[int] = None
|
|
176
183
|
|
|
177
|
-
def
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
184
|
+
def to_formula(self) -> str:
|
|
185
|
+
lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
|
|
186
|
+
upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
|
|
187
|
+
return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}"
|
|
188
|
+
|
|
189
|
+
@classmethod
|
|
190
|
+
def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
|
|
191
|
+
import re
|
|
192
|
+
|
|
193
|
+
pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
|
|
194
|
+
match = re.match(pattern, formula)
|
|
195
|
+
|
|
196
|
+
if not match:
|
|
197
|
+
return None
|
|
198
|
+
|
|
199
|
+
diff_unit = match.group(1)
|
|
200
|
+
lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
|
|
201
|
+
upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
|
|
202
|
+
aggregation = match.group(6)
|
|
203
|
+
|
|
204
|
+
return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
|
|
190
205
|
|
|
191
206
|
def _agg(self, x):
|
|
192
207
|
x = x[
|
|
@@ -257,16 +272,17 @@ class DatePercentile(DatePercentileBase):
|
|
|
257
272
|
# Use @field_validator for Pydantic 2.x
|
|
258
273
|
from pydantic import field_validator
|
|
259
274
|
|
|
260
|
-
@field_validator(
|
|
275
|
+
@field_validator("zero_bounds", mode="before")
|
|
261
276
|
def parse_zero_bounds(cls, value):
|
|
262
277
|
if isinstance(value, str):
|
|
263
278
|
return json.loads(value)
|
|
264
279
|
return value
|
|
280
|
+
|
|
265
281
|
else:
|
|
266
282
|
# Use @validator for Pydantic 1.x
|
|
267
283
|
from pydantic import validator
|
|
268
284
|
|
|
269
|
-
@validator(
|
|
285
|
+
@validator("zero_bounds", pre=True)
|
|
270
286
|
def parse_zero_bounds(cls, value):
|
|
271
287
|
if isinstance(value, str):
|
|
272
288
|
return json.loads(value)
|
upgini/autofe/feature.py
CHANGED
|
@@ -121,7 +121,7 @@ class Feature:
|
|
|
121
121
|
|
|
122
122
|
def get_hash(self) -> str:
|
|
123
123
|
return hashlib.sha256(
|
|
124
|
-
"_".join([self.op.
|
|
124
|
+
"_".join([self.op.to_formula()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
|
|
125
125
|
).hexdigest()[:8]
|
|
126
126
|
|
|
127
127
|
def set_alias(self, alias: str) -> "Feature":
|
|
@@ -129,7 +129,7 @@ class Feature:
|
|
|
129
129
|
return self
|
|
130
130
|
|
|
131
131
|
def get_all_operand_names(self) -> Set[str]:
|
|
132
|
-
return {self.op.
|
|
132
|
+
return {self.op.to_formula()}.union(
|
|
133
133
|
{n for f in self.children if isinstance(f, Feature) for n in f.get_all_operand_names()}
|
|
134
134
|
)
|
|
135
135
|
|
|
@@ -160,7 +160,7 @@ class Feature:
|
|
|
160
160
|
child.delete_data()
|
|
161
161
|
|
|
162
162
|
def get_op_display_name(self) -> str:
|
|
163
|
-
return self.op.alias or self.op.
|
|
163
|
+
return (self.op.alias or self.op.to_formula()).lower()
|
|
164
164
|
|
|
165
165
|
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
|
166
166
|
if self.cached_display_name is not None and cache:
|
|
@@ -239,9 +239,9 @@ class Feature:
|
|
|
239
239
|
if self.op.name in ["+", "-", "*", "/"]:
|
|
240
240
|
left = self.children[0].to_formula(**kwargs)
|
|
241
241
|
right = self.children[1].to_formula(**kwargs)
|
|
242
|
-
return f"({left}{self.op.
|
|
242
|
+
return f"({left}{self.op.to_formula()}{right})"
|
|
243
243
|
else:
|
|
244
|
-
result = [self.op.
|
|
244
|
+
result = [self.op.to_formula(), "("]
|
|
245
245
|
for i in range(len(self.children)):
|
|
246
246
|
string_i = self.children[i].to_formula(**kwargs)
|
|
247
247
|
result.append(string_i)
|
|
@@ -254,9 +254,9 @@ class Feature:
|
|
|
254
254
|
if self.op.name in ["+", "-", "*", "/"]:
|
|
255
255
|
left = self.children[0].to_pretty_formula()
|
|
256
256
|
right = self.children[1].to_pretty_formula()
|
|
257
|
-
return f"{left} {self.op.
|
|
257
|
+
return f"{left} {self.op.to_formula()} {right}"
|
|
258
258
|
else:
|
|
259
|
-
result = [self.op.
|
|
259
|
+
result = [self.op.to_formula(), "("]
|
|
260
260
|
for i in range(len(self.children)):
|
|
261
261
|
string_i = self.children[i].to_pretty_formula()
|
|
262
262
|
result.append(string_i)
|
upgini/autofe/groupby.py
CHANGED
|
@@ -2,33 +2,43 @@ from typing import Optional
|
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class GroupByThenAgg(
|
|
8
|
+
class GroupByThenAgg(
|
|
9
|
+
PandasOperand,
|
|
10
|
+
VectorizableMixin,
|
|
11
|
+
ParametrizedOperand,
|
|
12
|
+
):
|
|
9
13
|
agg: Optional[str]
|
|
10
14
|
is_vectorizable: bool = True
|
|
11
15
|
is_grouping: bool = True
|
|
12
16
|
is_distribution_dependent: bool = True
|
|
13
17
|
|
|
18
|
+
def to_formula(self) -> str:
|
|
19
|
+
return f"GroupByThen{self.agg}"
|
|
20
|
+
|
|
21
|
+
@classmethod
|
|
22
|
+
def from_formula(cls, formula: str) -> Optional["GroupByThenAgg"]:
|
|
23
|
+
if not formula.startswith("GroupByThen"):
|
|
24
|
+
return None
|
|
25
|
+
agg = formula[len("GroupByThen") :]
|
|
26
|
+
if agg.lower() in ["rank", "nunique", "freq"]: # other implementation
|
|
27
|
+
return None
|
|
28
|
+
return cls(agg=agg)
|
|
29
|
+
|
|
14
30
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
|
-
temp = left.groupby(right).agg(self.agg)
|
|
31
|
+
temp = left.groupby(right).agg(self.agg.lower())
|
|
16
32
|
return self._loc(right, temp)
|
|
17
33
|
|
|
18
34
|
def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
|
|
19
35
|
group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
|
|
20
36
|
d1 = data[value_columns]
|
|
21
37
|
d2 = data[group_column]
|
|
22
|
-
temp = d1.groupby(d2).agg(self.agg)
|
|
38
|
+
temp = d1.groupby(d2).agg(self.agg.lower())
|
|
23
39
|
return temp.merge(d2, how="right", on=[group_column])[value_columns]
|
|
24
40
|
|
|
25
41
|
|
|
26
|
-
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name: str = "GroupByThenMedian"
|
|
28
|
-
pandas_agg: str = "median"
|
|
29
|
-
is_distribution_dependent: bool = True
|
|
30
|
-
|
|
31
|
-
|
|
32
42
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
43
|
name: str = "GroupByThenRank"
|
|
34
44
|
is_vectorizable: bool = True
|
upgini/autofe/operand.py
CHANGED
|
@@ -6,8 +6,48 @@ import pandas as pd
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class
|
|
10
|
-
|
|
9
|
+
class OperandRegistry(type(BaseModel)):
|
|
10
|
+
_registry = {}
|
|
11
|
+
_parametrized_registry = []
|
|
12
|
+
|
|
13
|
+
def __new__(cls, name, bases, attrs):
|
|
14
|
+
new_class = super().__new__(cls, name, bases, attrs)
|
|
15
|
+
# Only register if it's a concrete class that inherits from Operand
|
|
16
|
+
base_classes = [b for b in bases]
|
|
17
|
+
base_names = {b.__name__ for b in bases}
|
|
18
|
+
while base_classes:
|
|
19
|
+
base = base_classes.pop()
|
|
20
|
+
base_names.update(b.__name__ for b in base.__bases__)
|
|
21
|
+
base_classes.extend(base.__bases__)
|
|
22
|
+
|
|
23
|
+
if "Operand" in base_names:
|
|
24
|
+
# Track parametrized operands separately
|
|
25
|
+
if "ParametrizedOperand" in base_names:
|
|
26
|
+
cls._parametrized_registry.append(new_class)
|
|
27
|
+
else:
|
|
28
|
+
try:
|
|
29
|
+
instance = new_class()
|
|
30
|
+
cls._registry[instance.name] = new_class
|
|
31
|
+
except Exception:
|
|
32
|
+
pass
|
|
33
|
+
return new_class
|
|
34
|
+
|
|
35
|
+
@classmethod
|
|
36
|
+
def get_operand(cls, name: str) -> Optional["Operand"]:
|
|
37
|
+
# First try to resolve as a parametrized operand formula
|
|
38
|
+
for operand_cls in cls._parametrized_registry:
|
|
39
|
+
resolved = operand_cls.from_formula(name)
|
|
40
|
+
if resolved is not None:
|
|
41
|
+
return resolved
|
|
42
|
+
# Fall back to direct registry lookup
|
|
43
|
+
non_parametrized = cls._registry.get(name)
|
|
44
|
+
if non_parametrized is not None:
|
|
45
|
+
return non_parametrized()
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Operand(BaseModel, metaclass=OperandRegistry):
|
|
50
|
+
name: Optional[str] = None
|
|
11
51
|
alias: Optional[str] = None
|
|
12
52
|
is_unary: bool = False
|
|
13
53
|
is_symmetrical: bool = False
|
|
@@ -31,6 +71,21 @@ class Operand(BaseModel):
|
|
|
31
71
|
res.update(self.params or {})
|
|
32
72
|
return res
|
|
33
73
|
|
|
74
|
+
def to_formula(self) -> str:
|
|
75
|
+
return self.name
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ParametrizedOperand(Operand, abc.ABC):
|
|
79
|
+
|
|
80
|
+
@abc.abstractmethod
|
|
81
|
+
def to_formula(self) -> str:
|
|
82
|
+
pass
|
|
83
|
+
|
|
84
|
+
@classmethod
|
|
85
|
+
@abc.abstractmethod
|
|
86
|
+
def from_formula(cls, formula: str) -> Optional["Operand"]:
|
|
87
|
+
pass
|
|
88
|
+
|
|
34
89
|
|
|
35
90
|
MAIN_COLUMN = "main_column"
|
|
36
91
|
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Dict, List, Optional
|
|
2
3
|
|
|
3
4
|
import pandas as pd
|
|
5
|
+
from pydantic import validator
|
|
4
6
|
|
|
5
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
6
8
|
|
|
7
9
|
|
|
8
10
|
class Mean(PandasOperand, VectorizableMixin):
|
|
@@ -22,3 +24,119 @@ class Sum(PandasOperand, VectorizableMixin):
|
|
|
22
24
|
|
|
23
25
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
26
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
30
|
+
is_vector: bool = True
|
|
31
|
+
date_unit: Optional[str] = None
|
|
32
|
+
|
|
33
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
34
|
+
res = super().get_params()
|
|
35
|
+
res.update(
|
|
36
|
+
{
|
|
37
|
+
"date_unit": self.date_unit,
|
|
38
|
+
}
|
|
39
|
+
)
|
|
40
|
+
return res
|
|
41
|
+
|
|
42
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
43
|
+
# assuming first is date, last is value, rest is group columns
|
|
44
|
+
date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
|
|
45
|
+
ts = pd.concat([date] + data[1:], axis=1)
|
|
46
|
+
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
47
|
+
ts.set_index(date.name, inplace=True)
|
|
48
|
+
ts = ts[ts.index.notna()].sort_index()
|
|
49
|
+
ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
|
|
50
|
+
ts = self._aggregate(ts)
|
|
51
|
+
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
52
|
+
|
|
53
|
+
return ts.iloc[:, -1]
|
|
54
|
+
|
|
55
|
+
@abc.abstractmethod
|
|
56
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
64
|
+
aggregation: str
|
|
65
|
+
window_size: int = 1
|
|
66
|
+
window_unit: str = "D"
|
|
67
|
+
|
|
68
|
+
@validator("window_unit")
|
|
69
|
+
def validate_window_unit(cls, v: str) -> str:
|
|
70
|
+
try:
|
|
71
|
+
pd.tseries.frequencies.to_offset(v)
|
|
72
|
+
return v
|
|
73
|
+
except ValueError:
|
|
74
|
+
raise ValueError(
|
|
75
|
+
f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
def to_formula(self) -> str:
|
|
79
|
+
return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
|
|
80
|
+
|
|
81
|
+
@classmethod
|
|
82
|
+
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
83
|
+
import re
|
|
84
|
+
|
|
85
|
+
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
86
|
+
match = re.match(pattern, formula)
|
|
87
|
+
|
|
88
|
+
if not match:
|
|
89
|
+
return None
|
|
90
|
+
|
|
91
|
+
window_size = int(match.group(1))
|
|
92
|
+
window_unit = match.group(2)
|
|
93
|
+
aggregation = match.group(3)
|
|
94
|
+
|
|
95
|
+
return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
|
|
96
|
+
|
|
97
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
98
|
+
res = super().get_params()
|
|
99
|
+
res.update(
|
|
100
|
+
{
|
|
101
|
+
"window_size": self.window_size,
|
|
102
|
+
"window_unit": self.window_unit,
|
|
103
|
+
"aggregation": self.aggregation,
|
|
104
|
+
}
|
|
105
|
+
)
|
|
106
|
+
return res
|
|
107
|
+
|
|
108
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
109
|
+
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
|
|
110
|
+
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
115
|
+
lag_size: int
|
|
116
|
+
lag_unit: str = "D"
|
|
117
|
+
|
|
118
|
+
def to_formula(self) -> str:
|
|
119
|
+
return f"lag_{self.lag_size}{self.lag_unit}"
|
|
120
|
+
|
|
121
|
+
@classmethod
|
|
122
|
+
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
123
|
+
import re
|
|
124
|
+
|
|
125
|
+
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
126
|
+
match = re.match(pattern, formula)
|
|
127
|
+
|
|
128
|
+
if not match:
|
|
129
|
+
return None
|
|
130
|
+
|
|
131
|
+
lag_size = int(match.group(1))
|
|
132
|
+
lag_unit = match.group(2)
|
|
133
|
+
|
|
134
|
+
return cls(lag_size=lag_size, lag_unit=lag_unit)
|
|
135
|
+
|
|
136
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
137
|
+
res = super().get_params()
|
|
138
|
+
return res
|
|
139
|
+
|
|
140
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
141
|
+
lag_window = self.lag_size + 1
|
|
142
|
+
return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=JBP_tvOiBuuOyLx7mNqZYU1UEW5bf82plZzE0AvVsfI,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=KnkqV7Nnx3kxfQ89giDao3bmCm4MFJWqJUrONy85E-k,32030
|
|
@@ -14,14 +14,14 @@ upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
|
|
18
18
|
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
21
|
-
upgini/autofe/groupby.py,sha256=
|
|
22
|
-
upgini/autofe/operand.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=Sd1Bm_uby9liSgsUkxsFgnCFaHxmj9MLX0ymR9DLQuQ,10401
|
|
20
|
+
upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,15140
|
|
21
|
+
upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
|
|
22
|
+
upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
|
|
23
23
|
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
24
|
+
upgini/autofe/vector.py,sha256=MyNPuqZ5J2vqRSn2UQcKp0ekXWv-d6lImEwqfU3pbCM,4328
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=Ed5IXkPjV9AfAZQAwCYksAmKaPGQliplvDYS_yeWdfk,11330
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.34a3657.dev1.dist-info/METADATA,sha256=marFhP2NoGmDk3lYZemMPRXcBRCB6jr_3tgx-I7fhIE,48597
|
|
63
|
+
upgini-1.2.34a3657.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.34a3657.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.34a3657.dev1.dist-info/RECORD,,
|
|
File without changes
|