upgini 1.2.32__py3-none-any.whl → 1.2.34a3657.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +7 -84
- upgini/autofe/date.py +39 -23
- upgini/autofe/feature.py +7 -7
- upgini/autofe/groupby.py +20 -10
- upgini/autofe/operand.py +57 -2
- upgini/autofe/vector.py +120 -2
- upgini/features_enricher.py +47 -6
- upgini/metadata.py +1 -0
- {upgini-1.2.32.dist-info → upgini-1.2.34a3657.dev1.dist-info}/METADATA +2 -2
- {upgini-1.2.32.dist-info → upgini-1.2.34a3657.dev1.dist-info}/RECORD +13 -13
- {upgini-1.2.32.dist-info → upgini-1.2.34a3657.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.32.dist-info → upgini-1.2.34a3657.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.34a3657.dev1"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,87 +1,10 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
-
from upgini.autofe.
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
CombineThenFreq,
|
|
8
|
-
Distance,
|
|
9
|
-
Divide,
|
|
10
|
-
JaroWinklerSim1,
|
|
11
|
-
JaroWinklerSim2,
|
|
12
|
-
LevenshteinSim,
|
|
13
|
-
Max,
|
|
14
|
-
Min,
|
|
15
|
-
Multiply,
|
|
16
|
-
Sim,
|
|
17
|
-
Subtract,
|
|
18
|
-
)
|
|
19
|
-
from upgini.autofe.date import (
|
|
20
|
-
DateDiff,
|
|
21
|
-
DateDiffType2,
|
|
22
|
-
DateListDiff,
|
|
23
|
-
DateListDiffBounded,
|
|
24
|
-
DatePercentile,
|
|
25
|
-
DatePercentileMethod2,
|
|
26
|
-
)
|
|
27
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
28
|
-
from upgini.autofe.operand import Operand
|
|
29
|
-
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
30
|
-
from upgini.autofe.vector import Mean, Sum
|
|
31
|
-
|
|
32
|
-
ALL_OPERANDS: Dict[str, Operand] = {
|
|
33
|
-
op.name: op
|
|
34
|
-
for op in [
|
|
35
|
-
Freq(),
|
|
36
|
-
Mean(),
|
|
37
|
-
Sum(),
|
|
38
|
-
Abs(),
|
|
39
|
-
Log(),
|
|
40
|
-
Sqrt(),
|
|
41
|
-
Square(),
|
|
42
|
-
Sigmoid(),
|
|
43
|
-
Floor(),
|
|
44
|
-
Residual(),
|
|
45
|
-
Min(),
|
|
46
|
-
Max(),
|
|
47
|
-
Add(),
|
|
48
|
-
Subtract(),
|
|
49
|
-
Multiply(),
|
|
50
|
-
Divide(),
|
|
51
|
-
GroupByThenAgg(name="GroupByThenMin", agg="min"),
|
|
52
|
-
GroupByThenAgg(name="GroupByThenMax", agg="max"),
|
|
53
|
-
GroupByThenAgg(name="GroupByThenMean", agg="mean"),
|
|
54
|
-
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
55
|
-
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
56
|
-
GroupByThenRank(),
|
|
57
|
-
Combine(),
|
|
58
|
-
CombineThenFreq(),
|
|
59
|
-
GroupByThenNUnique(),
|
|
60
|
-
GroupByThenFreq(),
|
|
61
|
-
Sim(),
|
|
62
|
-
DateDiff(),
|
|
63
|
-
DateDiffType2(),
|
|
64
|
-
DateListDiff(aggregation="min"),
|
|
65
|
-
DateListDiff(aggregation="max"),
|
|
66
|
-
DateListDiff(aggregation="mean"),
|
|
67
|
-
DateListDiff(aggregation="nunique"),
|
|
68
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
|
|
69
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
|
|
70
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
|
|
71
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
72
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
73
|
-
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
74
|
-
DatePercentile(),
|
|
75
|
-
DatePercentileMethod2(),
|
|
76
|
-
Norm(),
|
|
77
|
-
JaroWinklerSim1(),
|
|
78
|
-
JaroWinklerSim2(),
|
|
79
|
-
LevenshteinSim(),
|
|
80
|
-
Distance(),
|
|
81
|
-
Embeddings(),
|
|
82
|
-
]
|
|
83
|
-
}
|
|
1
|
+
from upgini.autofe.operand import OperandRegistry
|
|
2
|
+
from upgini.autofe.unary import * # noqa
|
|
3
|
+
from upgini.autofe.binary import * # noqa
|
|
4
|
+
from upgini.autofe.groupby import * # noqa
|
|
5
|
+
from upgini.autofe.date import * # noqa
|
|
6
|
+
from upgini.autofe.vector import * # noqa
|
|
84
7
|
|
|
85
8
|
|
|
86
9
|
def find_op(name):
|
|
87
|
-
return
|
|
10
|
+
return OperandRegistry.get_operand(name)
|
upgini/autofe/date.py
CHANGED
|
@@ -7,11 +7,11 @@ import pandas as pd
|
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
|
-
from upgini.autofe.operand import PandasOperand
|
|
10
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def get_pydantic_version():
|
|
14
|
-
major_version = int(pydantic_version.split(
|
|
14
|
+
major_version = int(pydantic_version.split(".")[0])
|
|
15
15
|
return major_version
|
|
16
16
|
|
|
17
17
|
|
|
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
109
109
|
_count_aggregations = ["nunique", "count"]
|
|
110
110
|
|
|
111
111
|
|
|
112
|
-
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
112
|
+
class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
113
113
|
is_binary: bool = True
|
|
114
114
|
has_symmetry_importance: bool = True
|
|
115
115
|
|
|
@@ -129,10 +129,17 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
129
129
|
)
|
|
130
130
|
return res
|
|
131
131
|
|
|
132
|
-
def
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
132
|
+
def to_formula(self) -> str:
|
|
133
|
+
return f"date_diff_{self.aggregation}"
|
|
134
|
+
|
|
135
|
+
@classmethod
|
|
136
|
+
def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
|
|
137
|
+
if not formula.startswith("date_diff_"):
|
|
138
|
+
return None
|
|
139
|
+
aggregation = formula.replace("date_diff_", "")
|
|
140
|
+
if "_" in aggregation:
|
|
141
|
+
return None
|
|
142
|
+
return cls(aggregation=aggregation)
|
|
136
143
|
|
|
137
144
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
138
145
|
left = self._convert_to_date(left, self.left_unit)
|
|
@@ -170,23 +177,31 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
170
177
|
return method(x) if len(x) > 0 else default
|
|
171
178
|
|
|
172
179
|
|
|
173
|
-
class DateListDiffBounded(DateListDiff):
|
|
180
|
+
class DateListDiffBounded(DateListDiff, ParametrizedOperand):
|
|
174
181
|
lower_bound: Optional[int] = None
|
|
175
182
|
upper_bound: Optional[int] = None
|
|
176
183
|
|
|
177
|
-
def
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
184
|
+
def to_formula(self) -> str:
|
|
185
|
+
lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
|
|
186
|
+
upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
|
|
187
|
+
return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}"
|
|
188
|
+
|
|
189
|
+
@classmethod
|
|
190
|
+
def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
|
|
191
|
+
import re
|
|
192
|
+
|
|
193
|
+
pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
|
|
194
|
+
match = re.match(pattern, formula)
|
|
195
|
+
|
|
196
|
+
if not match:
|
|
197
|
+
return None
|
|
198
|
+
|
|
199
|
+
diff_unit = match.group(1)
|
|
200
|
+
lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
|
|
201
|
+
upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
|
|
202
|
+
aggregation = match.group(6)
|
|
203
|
+
|
|
204
|
+
return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
|
|
190
205
|
|
|
191
206
|
def _agg(self, x):
|
|
192
207
|
x = x[
|
|
@@ -257,16 +272,17 @@ class DatePercentile(DatePercentileBase):
|
|
|
257
272
|
# Use @field_validator for Pydantic 2.x
|
|
258
273
|
from pydantic import field_validator
|
|
259
274
|
|
|
260
|
-
@field_validator(
|
|
275
|
+
@field_validator("zero_bounds", mode="before")
|
|
261
276
|
def parse_zero_bounds(cls, value):
|
|
262
277
|
if isinstance(value, str):
|
|
263
278
|
return json.loads(value)
|
|
264
279
|
return value
|
|
280
|
+
|
|
265
281
|
else:
|
|
266
282
|
# Use @validator for Pydantic 1.x
|
|
267
283
|
from pydantic import validator
|
|
268
284
|
|
|
269
|
-
@validator(
|
|
285
|
+
@validator("zero_bounds", pre=True)
|
|
270
286
|
def parse_zero_bounds(cls, value):
|
|
271
287
|
if isinstance(value, str):
|
|
272
288
|
return json.loads(value)
|
upgini/autofe/feature.py
CHANGED
|
@@ -121,7 +121,7 @@ class Feature:
|
|
|
121
121
|
|
|
122
122
|
def get_hash(self) -> str:
|
|
123
123
|
return hashlib.sha256(
|
|
124
|
-
"_".join([self.op.
|
|
124
|
+
"_".join([self.op.to_formula()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
|
|
125
125
|
).hexdigest()[:8]
|
|
126
126
|
|
|
127
127
|
def set_alias(self, alias: str) -> "Feature":
|
|
@@ -129,7 +129,7 @@ class Feature:
|
|
|
129
129
|
return self
|
|
130
130
|
|
|
131
131
|
def get_all_operand_names(self) -> Set[str]:
|
|
132
|
-
return {self.op.
|
|
132
|
+
return {self.op.to_formula()}.union(
|
|
133
133
|
{n for f in self.children if isinstance(f, Feature) for n in f.get_all_operand_names()}
|
|
134
134
|
)
|
|
135
135
|
|
|
@@ -160,7 +160,7 @@ class Feature:
|
|
|
160
160
|
child.delete_data()
|
|
161
161
|
|
|
162
162
|
def get_op_display_name(self) -> str:
|
|
163
|
-
return self.op.alias or self.op.
|
|
163
|
+
return (self.op.alias or self.op.to_formula()).lower()
|
|
164
164
|
|
|
165
165
|
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
|
166
166
|
if self.cached_display_name is not None and cache:
|
|
@@ -239,9 +239,9 @@ class Feature:
|
|
|
239
239
|
if self.op.name in ["+", "-", "*", "/"]:
|
|
240
240
|
left = self.children[0].to_formula(**kwargs)
|
|
241
241
|
right = self.children[1].to_formula(**kwargs)
|
|
242
|
-
return f"({left}{self.op.
|
|
242
|
+
return f"({left}{self.op.to_formula()}{right})"
|
|
243
243
|
else:
|
|
244
|
-
result = [self.op.
|
|
244
|
+
result = [self.op.to_formula(), "("]
|
|
245
245
|
for i in range(len(self.children)):
|
|
246
246
|
string_i = self.children[i].to_formula(**kwargs)
|
|
247
247
|
result.append(string_i)
|
|
@@ -254,9 +254,9 @@ class Feature:
|
|
|
254
254
|
if self.op.name in ["+", "-", "*", "/"]:
|
|
255
255
|
left = self.children[0].to_pretty_formula()
|
|
256
256
|
right = self.children[1].to_pretty_formula()
|
|
257
|
-
return f"{left} {self.op.
|
|
257
|
+
return f"{left} {self.op.to_formula()} {right}"
|
|
258
258
|
else:
|
|
259
|
-
result = [self.op.
|
|
259
|
+
result = [self.op.to_formula(), "("]
|
|
260
260
|
for i in range(len(self.children)):
|
|
261
261
|
string_i = self.children[i].to_pretty_formula()
|
|
262
262
|
result.append(string_i)
|
upgini/autofe/groupby.py
CHANGED
|
@@ -2,33 +2,43 @@ from typing import Optional
|
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class GroupByThenAgg(
|
|
8
|
+
class GroupByThenAgg(
|
|
9
|
+
PandasOperand,
|
|
10
|
+
VectorizableMixin,
|
|
11
|
+
ParametrizedOperand,
|
|
12
|
+
):
|
|
9
13
|
agg: Optional[str]
|
|
10
14
|
is_vectorizable: bool = True
|
|
11
15
|
is_grouping: bool = True
|
|
12
16
|
is_distribution_dependent: bool = True
|
|
13
17
|
|
|
18
|
+
def to_formula(self) -> str:
|
|
19
|
+
return f"GroupByThen{self.agg}"
|
|
20
|
+
|
|
21
|
+
@classmethod
|
|
22
|
+
def from_formula(cls, formula: str) -> Optional["GroupByThenAgg"]:
|
|
23
|
+
if not formula.startswith("GroupByThen"):
|
|
24
|
+
return None
|
|
25
|
+
agg = formula[len("GroupByThen") :]
|
|
26
|
+
if agg.lower() in ["rank", "nunique", "freq"]: # other implementation
|
|
27
|
+
return None
|
|
28
|
+
return cls(agg=agg)
|
|
29
|
+
|
|
14
30
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
15
|
-
temp = left.groupby(right).agg(self.agg)
|
|
31
|
+
temp = left.groupby(right).agg(self.agg.lower())
|
|
16
32
|
return self._loc(right, temp)
|
|
17
33
|
|
|
18
34
|
def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
|
|
19
35
|
group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
|
|
20
36
|
d1 = data[value_columns]
|
|
21
37
|
d2 = data[group_column]
|
|
22
|
-
temp = d1.groupby(d2).agg(self.agg)
|
|
38
|
+
temp = d1.groupby(d2).agg(self.agg.lower())
|
|
23
39
|
return temp.merge(d2, how="right", on=[group_column])[value_columns]
|
|
24
40
|
|
|
25
41
|
|
|
26
|
-
class GroupByThenMedian(GroupByThenAgg):
|
|
27
|
-
name: str = "GroupByThenMedian"
|
|
28
|
-
pandas_agg: str = "median"
|
|
29
|
-
is_distribution_dependent: bool = True
|
|
30
|
-
|
|
31
|
-
|
|
32
42
|
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
33
43
|
name: str = "GroupByThenRank"
|
|
34
44
|
is_vectorizable: bool = True
|
upgini/autofe/operand.py
CHANGED
|
@@ -6,8 +6,48 @@ import pandas as pd
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class
|
|
10
|
-
|
|
9
|
+
class OperandRegistry(type(BaseModel)):
|
|
10
|
+
_registry = {}
|
|
11
|
+
_parametrized_registry = []
|
|
12
|
+
|
|
13
|
+
def __new__(cls, name, bases, attrs):
|
|
14
|
+
new_class = super().__new__(cls, name, bases, attrs)
|
|
15
|
+
# Only register if it's a concrete class that inherits from Operand
|
|
16
|
+
base_classes = [b for b in bases]
|
|
17
|
+
base_names = {b.__name__ for b in bases}
|
|
18
|
+
while base_classes:
|
|
19
|
+
base = base_classes.pop()
|
|
20
|
+
base_names.update(b.__name__ for b in base.__bases__)
|
|
21
|
+
base_classes.extend(base.__bases__)
|
|
22
|
+
|
|
23
|
+
if "Operand" in base_names:
|
|
24
|
+
# Track parametrized operands separately
|
|
25
|
+
if "ParametrizedOperand" in base_names:
|
|
26
|
+
cls._parametrized_registry.append(new_class)
|
|
27
|
+
else:
|
|
28
|
+
try:
|
|
29
|
+
instance = new_class()
|
|
30
|
+
cls._registry[instance.name] = new_class
|
|
31
|
+
except Exception:
|
|
32
|
+
pass
|
|
33
|
+
return new_class
|
|
34
|
+
|
|
35
|
+
@classmethod
|
|
36
|
+
def get_operand(cls, name: str) -> Optional["Operand"]:
|
|
37
|
+
# First try to resolve as a parametrized operand formula
|
|
38
|
+
for operand_cls in cls._parametrized_registry:
|
|
39
|
+
resolved = operand_cls.from_formula(name)
|
|
40
|
+
if resolved is not None:
|
|
41
|
+
return resolved
|
|
42
|
+
# Fall back to direct registry lookup
|
|
43
|
+
non_parametrized = cls._registry.get(name)
|
|
44
|
+
if non_parametrized is not None:
|
|
45
|
+
return non_parametrized()
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Operand(BaseModel, metaclass=OperandRegistry):
|
|
50
|
+
name: Optional[str] = None
|
|
11
51
|
alias: Optional[str] = None
|
|
12
52
|
is_unary: bool = False
|
|
13
53
|
is_symmetrical: bool = False
|
|
@@ -31,6 +71,21 @@ class Operand(BaseModel):
|
|
|
31
71
|
res.update(self.params or {})
|
|
32
72
|
return res
|
|
33
73
|
|
|
74
|
+
def to_formula(self) -> str:
|
|
75
|
+
return self.name
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ParametrizedOperand(Operand, abc.ABC):
|
|
79
|
+
|
|
80
|
+
@abc.abstractmethod
|
|
81
|
+
def to_formula(self) -> str:
|
|
82
|
+
pass
|
|
83
|
+
|
|
84
|
+
@classmethod
|
|
85
|
+
@abc.abstractmethod
|
|
86
|
+
def from_formula(cls, formula: str) -> Optional["Operand"]:
|
|
87
|
+
pass
|
|
88
|
+
|
|
34
89
|
|
|
35
90
|
MAIN_COLUMN = "main_column"
|
|
36
91
|
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Dict, List, Optional
|
|
2
3
|
|
|
3
4
|
import pandas as pd
|
|
5
|
+
from pydantic import validator
|
|
4
6
|
|
|
5
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
6
8
|
|
|
7
9
|
|
|
8
10
|
class Mean(PandasOperand, VectorizableMixin):
|
|
@@ -22,3 +24,119 @@ class Sum(PandasOperand, VectorizableMixin):
|
|
|
22
24
|
|
|
23
25
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
26
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
30
|
+
is_vector: bool = True
|
|
31
|
+
date_unit: Optional[str] = None
|
|
32
|
+
|
|
33
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
34
|
+
res = super().get_params()
|
|
35
|
+
res.update(
|
|
36
|
+
{
|
|
37
|
+
"date_unit": self.date_unit,
|
|
38
|
+
}
|
|
39
|
+
)
|
|
40
|
+
return res
|
|
41
|
+
|
|
42
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
43
|
+
# assuming first is date, last is value, rest is group columns
|
|
44
|
+
date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
|
|
45
|
+
ts = pd.concat([date] + data[1:], axis=1)
|
|
46
|
+
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
47
|
+
ts.set_index(date.name, inplace=True)
|
|
48
|
+
ts = ts[ts.index.notna()].sort_index()
|
|
49
|
+
ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
|
|
50
|
+
ts = self._aggregate(ts)
|
|
51
|
+
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
52
|
+
|
|
53
|
+
return ts.iloc[:, -1]
|
|
54
|
+
|
|
55
|
+
@abc.abstractmethod
|
|
56
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
64
|
+
aggregation: str
|
|
65
|
+
window_size: int = 1
|
|
66
|
+
window_unit: str = "D"
|
|
67
|
+
|
|
68
|
+
@validator("window_unit")
|
|
69
|
+
def validate_window_unit(cls, v: str) -> str:
|
|
70
|
+
try:
|
|
71
|
+
pd.tseries.frequencies.to_offset(v)
|
|
72
|
+
return v
|
|
73
|
+
except ValueError:
|
|
74
|
+
raise ValueError(
|
|
75
|
+
f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
def to_formula(self) -> str:
|
|
79
|
+
return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
|
|
80
|
+
|
|
81
|
+
@classmethod
|
|
82
|
+
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
83
|
+
import re
|
|
84
|
+
|
|
85
|
+
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
86
|
+
match = re.match(pattern, formula)
|
|
87
|
+
|
|
88
|
+
if not match:
|
|
89
|
+
return None
|
|
90
|
+
|
|
91
|
+
window_size = int(match.group(1))
|
|
92
|
+
window_unit = match.group(2)
|
|
93
|
+
aggregation = match.group(3)
|
|
94
|
+
|
|
95
|
+
return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
|
|
96
|
+
|
|
97
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
98
|
+
res = super().get_params()
|
|
99
|
+
res.update(
|
|
100
|
+
{
|
|
101
|
+
"window_size": self.window_size,
|
|
102
|
+
"window_unit": self.window_unit,
|
|
103
|
+
"aggregation": self.aggregation,
|
|
104
|
+
}
|
|
105
|
+
)
|
|
106
|
+
return res
|
|
107
|
+
|
|
108
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
109
|
+
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
|
|
110
|
+
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
115
|
+
lag_size: int
|
|
116
|
+
lag_unit: str = "D"
|
|
117
|
+
|
|
118
|
+
def to_formula(self) -> str:
|
|
119
|
+
return f"lag_{self.lag_size}{self.lag_unit}"
|
|
120
|
+
|
|
121
|
+
@classmethod
|
|
122
|
+
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
123
|
+
import re
|
|
124
|
+
|
|
125
|
+
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
126
|
+
match = re.match(pattern, formula)
|
|
127
|
+
|
|
128
|
+
if not match:
|
|
129
|
+
return None
|
|
130
|
+
|
|
131
|
+
lag_size = int(match.group(1))
|
|
132
|
+
lag_unit = match.group(2)
|
|
133
|
+
|
|
134
|
+
return cls(lag_size=lag_size, lag_unit=lag_unit)
|
|
135
|
+
|
|
136
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
137
|
+
res = super().get_params()
|
|
138
|
+
return res
|
|
139
|
+
|
|
140
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
141
|
+
lag_window = self.lag_size + 1
|
|
142
|
+
return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
|
upgini/features_enricher.py
CHANGED
|
@@ -111,7 +111,11 @@ try:
|
|
|
111
111
|
except Exception:
|
|
112
112
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
113
113
|
|
|
114
|
-
from upgini.utils.target_utils import
|
|
114
|
+
from upgini.utils.target_utils import (
|
|
115
|
+
balance_undersample_forced,
|
|
116
|
+
calculate_psi,
|
|
117
|
+
define_task,
|
|
118
|
+
)
|
|
115
119
|
from upgini.utils.warning_counter import WarningCounter
|
|
116
120
|
from upgini.version_validator import validate_version
|
|
117
121
|
|
|
@@ -967,6 +971,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
967
971
|
self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
|
|
968
972
|
return None
|
|
969
973
|
|
|
974
|
+
maybe_phone_column = self._get_phone_column(self.search_keys)
|
|
975
|
+
text_features = (
|
|
976
|
+
[f for f in self.generate_features if f != maybe_phone_column]
|
|
977
|
+
if self.generate_features is not None
|
|
978
|
+
else None
|
|
979
|
+
)
|
|
980
|
+
|
|
970
981
|
print(self.bundle.get("metrics_start"))
|
|
971
982
|
with Spinner():
|
|
972
983
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
@@ -982,7 +993,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
982
993
|
fitting_enriched_X,
|
|
983
994
|
scoring,
|
|
984
995
|
groups=groups,
|
|
985
|
-
text_features=
|
|
996
|
+
text_features=text_features,
|
|
986
997
|
has_date=has_date,
|
|
987
998
|
)
|
|
988
999
|
metric = wrapper.metric_name
|
|
@@ -1009,7 +1020,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1009
1020
|
cat_features,
|
|
1010
1021
|
add_params=custom_loss_add_params,
|
|
1011
1022
|
groups=groups,
|
|
1012
|
-
text_features=
|
|
1023
|
+
text_features=text_features,
|
|
1013
1024
|
has_date=has_date,
|
|
1014
1025
|
)
|
|
1015
1026
|
etalon_cv_result = baseline_estimator.cross_val_predict(
|
|
@@ -1044,7 +1055,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1044
1055
|
cat_features,
|
|
1045
1056
|
add_params=custom_loss_add_params,
|
|
1046
1057
|
groups=groups,
|
|
1047
|
-
text_features=
|
|
1058
|
+
text_features=text_features,
|
|
1048
1059
|
has_date=has_date,
|
|
1049
1060
|
)
|
|
1050
1061
|
enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
@@ -1827,7 +1838,27 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1827
1838
|
|
|
1828
1839
|
# downsample if need to eval_set threshold
|
|
1829
1840
|
num_samples = _num_samples(df)
|
|
1830
|
-
|
|
1841
|
+
phone_column = self._get_phone_column(self.search_keys)
|
|
1842
|
+
force_downsampling = (
|
|
1843
|
+
not self.disable_force_downsampling
|
|
1844
|
+
and self.generate_features is not None
|
|
1845
|
+
and phone_column is not None
|
|
1846
|
+
and self.fit_columns_renaming[phone_column] in self.generate_features
|
|
1847
|
+
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1848
|
+
)
|
|
1849
|
+
if force_downsampling:
|
|
1850
|
+
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1851
|
+
df = balance_undersample_forced(
|
|
1852
|
+
df=df,
|
|
1853
|
+
target_column=TARGET,
|
|
1854
|
+
task_type=self.model_task_type,
|
|
1855
|
+
random_state=self.random_state,
|
|
1856
|
+
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1857
|
+
logger=self.logger,
|
|
1858
|
+
bundle=self.bundle,
|
|
1859
|
+
warning_callback=self.__log_warning,
|
|
1860
|
+
)
|
|
1861
|
+
elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
|
|
1831
1862
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1832
1863
|
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1833
1864
|
|
|
@@ -2063,6 +2094,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2063
2094
|
self.__display_support_link(msg)
|
|
2064
2095
|
return None, {c: c for c in X.columns}, []
|
|
2065
2096
|
|
|
2097
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
2098
|
+
online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
|
|
2099
|
+
if len(online_api_features) > 0:
|
|
2100
|
+
self.logger.warning(
|
|
2101
|
+
f"There are important features for transform, that generated by online API: {online_api_features}"
|
|
2102
|
+
)
|
|
2103
|
+
# TODO
|
|
2104
|
+
raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
|
|
2105
|
+
|
|
2066
2106
|
if not metrics_calculation:
|
|
2067
2107
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
2068
2108
|
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
@@ -2708,8 +2748,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2708
2748
|
and self.generate_features is not None
|
|
2709
2749
|
and phone_column is not None
|
|
2710
2750
|
and self.fit_columns_renaming[phone_column] in self.generate_features
|
|
2751
|
+
and len(df) > Dataset.FORCE_SAMPLE_SIZE
|
|
2711
2752
|
)
|
|
2712
|
-
if force_downsampling
|
|
2753
|
+
if force_downsampling:
|
|
2713
2754
|
runtime_parameters.properties["fast_fit"] = True
|
|
2714
2755
|
|
|
2715
2756
|
dataset = Dataset(
|
upgini/metadata.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.34a3657.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -110,7 +110,7 @@ Description-Content-Type: text/markdown
|
|
|
110
110
|
</tr>
|
|
111
111
|
</table>
|
|
112
112
|
|
|
113
|
-
⭐️ [Simple Drag & Drop Search UI](https://upgini.com/
|
|
113
|
+
⭐️ [Simple Drag & Drop Search UI](https://www.upgini.com/data-search-widget):
|
|
114
114
|
<a href="https://upgini.com/upgini-widget">
|
|
115
115
|
<img width="710" alt="Drag & Drop Search UI" src="https://github.com/upgini/upgini/assets/95645411/36b6460c-51f3-400e-9f04-445b938bf45e">
|
|
116
116
|
</a>
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=JBP_tvOiBuuOyLx7mNqZYU1UEW5bf82plZzE0AvVsfI,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=KnkqV7Nnx3kxfQ89giDao3bmCm4MFJWqJUrONy85E-k,32030
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=q11aMFPlCJy1m4sOFfGZFfb4vdG3-hdd0wgm2BXgs9A,194748
|
|
7
7
|
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=ACzIQQwCHCFHlUqXqKpxd3IQ4bBAaVvy8UaCGTqLGQs,11278
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -14,14 +14,14 @@ upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
|
|
18
18
|
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
21
|
-
upgini/autofe/groupby.py,sha256=
|
|
22
|
-
upgini/autofe/operand.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=Sd1Bm_uby9liSgsUkxsFgnCFaHxmj9MLX0ymR9DLQuQ,10401
|
|
20
|
+
upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,15140
|
|
21
|
+
upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
|
|
22
|
+
upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
|
|
23
23
|
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
24
|
+
upgini/autofe/vector.py,sha256=MyNPuqZ5J2vqRSn2UQcKp0ekXWv-d6lImEwqfU3pbCM,4328
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=Ed5IXkPjV9AfAZQAwCYksAmKaPGQliplvDYS_yeWdfk,11330
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.34a3657.dev1.dist-info/METADATA,sha256=marFhP2NoGmDk3lYZemMPRXcBRCB6jr_3tgx-I7fhIE,48597
|
|
63
|
+
upgini-1.2.34a3657.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.34a3657.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.34a3657.dev1.dist-info/RECORD,,
|
|
File without changes
|