upgini 1.2.61__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +2 -2
- upgini/autofe/binary.py +1 -1
- upgini/autofe/date.py +1 -1
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +1 -1
- upgini/autofe/{operand.py → operator.py} +2 -2
- upgini/autofe/timeseries.py +200 -0
- upgini/autofe/unary.py +1 -1
- upgini/autofe/vector.py +2 -198
- upgini/dataset.py +17 -7
- upgini/features_enricher.py +1 -1
- upgini/utils/target_utils.py +54 -1
- upgini/utils/ts_utils.py +41 -0
- {upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/RECORD +18 -16
- {upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.61"
|
|
1
|
+
__version__ = "1.2.62a3818.dev1"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from upgini.autofe.operand import OperandRegistry
|
|
1
|
+
from upgini.autofe.operator import OperatorRegistry
|
|
2
2
|
from upgini.autofe.unary import * # noqa
|
|
3
3
|
from upgini.autofe.binary import * # noqa
|
|
4
4
|
from upgini.autofe.groupby import * # noqa
|
|
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def find_op(name):
|
|
10
|
-
return OperandRegistry.get_operand(name)
|
|
10
|
+
return OperatorRegistry.get_operand(name)
|
upgini/autofe/binary.py
CHANGED
upgini/autofe/date.py
CHANGED
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand
|
|
10
|
+
from upgini.autofe.operator import PandasOperand, ParametrizedOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def get_pydantic_version():
|
upgini/autofe/feature.py
CHANGED
upgini/autofe/groupby.py
CHANGED
upgini/autofe/{operand.py → operator.py}
CHANGED
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class OperandRegistry(type(BaseModel)):
|
|
9
|
+
class OperatorRegistry(type(BaseModel)):
|
|
10
10
|
_registry = {}
|
|
11
11
|
_parametrized_registry = []
|
|
12
12
|
|
|
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
|
|
|
46
46
|
return None
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
class Operand(BaseModel, metaclass=OperandRegistry):
|
|
49
|
+
class Operand(BaseModel, metaclass=OperatorRegistry):
|
|
50
50
|
name: Optional[str] = None
|
|
51
51
|
alias: Optional[str] = None
|
|
52
52
|
is_unary: bool = False
|
|
upgini/autofe/timeseries.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, ParametrizedOperand
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from pydantic import field_validator as validator # V2
|
|
9
|
+
except ImportError:
|
|
10
|
+
from pydantic import validator # V1
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
14
|
+
is_vector: bool = True
|
|
15
|
+
date_unit: Optional[str] = None
|
|
16
|
+
offset_size: int = 0
|
|
17
|
+
offset_unit: str = "D"
|
|
18
|
+
|
|
19
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
20
|
+
res = super().get_params()
|
|
21
|
+
res.update(
|
|
22
|
+
{
|
|
23
|
+
"date_unit": self.date_unit,
|
|
24
|
+
"offset_size": self.offset_size,
|
|
25
|
+
"offset_unit": self.offset_unit,
|
|
26
|
+
}
|
|
27
|
+
)
|
|
28
|
+
return res
|
|
29
|
+
|
|
30
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
31
|
+
# assuming first is date, last is value, rest is group columns
|
|
32
|
+
date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
|
|
33
|
+
ts = pd.concat([date] + data[1:], axis=1)
|
|
34
|
+
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
35
|
+
ts.set_index(date.name, inplace=True)
|
|
36
|
+
ts = ts[ts.index.notna()].sort_index()
|
|
37
|
+
ts = (
|
|
38
|
+
ts.groupby([c.name for c in data[1:-1]], group_keys=True)
|
|
39
|
+
.apply(self._shift)[data[-1].name]
|
|
40
|
+
.to_frame()
|
|
41
|
+
.reset_index()
|
|
42
|
+
.set_index(date.name)
|
|
43
|
+
.groupby([c.name for c in data[1:-1]])
|
|
44
|
+
if len(data) > 2
|
|
45
|
+
else self._shift(ts)
|
|
46
|
+
)
|
|
47
|
+
ts = self._aggregate(ts)
|
|
48
|
+
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
49
|
+
ts.index = date.index
|
|
50
|
+
|
|
51
|
+
return ts.iloc[:, -1]
|
|
52
|
+
|
|
53
|
+
def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
54
|
+
if self.offset_size > 0:
|
|
55
|
+
return ts.iloc[:, :-1].merge(
|
|
56
|
+
ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
|
|
57
|
+
left_index=True,
|
|
58
|
+
right_index=True,
|
|
59
|
+
)
|
|
60
|
+
return ts
|
|
61
|
+
|
|
62
|
+
@abc.abstractmethod
|
|
63
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
71
|
+
aggregation: str
|
|
72
|
+
window_size: int = 1
|
|
73
|
+
window_unit: str = "D"
|
|
74
|
+
|
|
75
|
+
@validator("window_unit")
|
|
76
|
+
@classmethod
|
|
77
|
+
def validate_window_unit(cls, v: str) -> str:
|
|
78
|
+
try:
|
|
79
|
+
pd.tseries.frequencies.to_offset(v)
|
|
80
|
+
return v
|
|
81
|
+
except ValueError:
|
|
82
|
+
raise ValueError(
|
|
83
|
+
f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
def to_formula(self) -> str:
|
|
87
|
+
roll_component = f"roll_{self.window_size}{self.window_unit}"
|
|
88
|
+
if self.offset_size > 0:
|
|
89
|
+
roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
90
|
+
return f"{roll_component}_{self.aggregation}"
|
|
91
|
+
|
|
92
|
+
@classmethod
|
|
93
|
+
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
94
|
+
import re
|
|
95
|
+
|
|
96
|
+
# Try matching pattern with offset first
|
|
97
|
+
pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
|
|
98
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
99
|
+
|
|
100
|
+
if match_with_offset:
|
|
101
|
+
window_size = int(match_with_offset.group(1))
|
|
102
|
+
window_unit = match_with_offset.group(2)
|
|
103
|
+
offset_size = int(match_with_offset.group(3))
|
|
104
|
+
offset_unit = match_with_offset.group(4)
|
|
105
|
+
aggregation = match_with_offset.group(5)
|
|
106
|
+
|
|
107
|
+
return cls(
|
|
108
|
+
window_size=window_size,
|
|
109
|
+
window_unit=window_unit,
|
|
110
|
+
offset_size=offset_size,
|
|
111
|
+
offset_unit=offset_unit,
|
|
112
|
+
aggregation=aggregation,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
# If no offset pattern found, try basic pattern
|
|
116
|
+
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
117
|
+
match = re.match(pattern, formula)
|
|
118
|
+
|
|
119
|
+
if not match:
|
|
120
|
+
return None
|
|
121
|
+
|
|
122
|
+
window_size = int(match.group(1))
|
|
123
|
+
window_unit = match.group(2)
|
|
124
|
+
aggregation = match.group(3)
|
|
125
|
+
|
|
126
|
+
return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
|
|
127
|
+
|
|
128
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
129
|
+
res = super().get_params()
|
|
130
|
+
res.update(
|
|
131
|
+
{
|
|
132
|
+
"window_size": self.window_size,
|
|
133
|
+
"window_unit": self.window_unit,
|
|
134
|
+
"aggregation": self.aggregation,
|
|
135
|
+
}
|
|
136
|
+
)
|
|
137
|
+
return res
|
|
138
|
+
|
|
139
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
140
|
+
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
|
|
141
|
+
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
146
|
+
lag_size: int
|
|
147
|
+
lag_unit: str = "D"
|
|
148
|
+
|
|
149
|
+
def to_formula(self) -> str:
|
|
150
|
+
lag_component = f"lag_{self.lag_size}{self.lag_unit}"
|
|
151
|
+
if self.offset_size > 0:
|
|
152
|
+
lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
153
|
+
return lag_component
|
|
154
|
+
|
|
155
|
+
@classmethod
|
|
156
|
+
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
157
|
+
import re
|
|
158
|
+
|
|
159
|
+
# Try matching pattern with offset first
|
|
160
|
+
pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
|
|
161
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
162
|
+
|
|
163
|
+
if match_with_offset:
|
|
164
|
+
lag_size = int(match_with_offset.group(1))
|
|
165
|
+
lag_unit = match_with_offset.group(2)
|
|
166
|
+
offset_size = int(match_with_offset.group(3))
|
|
167
|
+
offset_unit = match_with_offset.group(4)
|
|
168
|
+
|
|
169
|
+
return cls(
|
|
170
|
+
lag_size=lag_size,
|
|
171
|
+
lag_unit=lag_unit,
|
|
172
|
+
offset_size=offset_size,
|
|
173
|
+
offset_unit=offset_unit,
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# If no offset pattern found, try basic pattern
|
|
177
|
+
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
178
|
+
match = re.match(pattern, formula)
|
|
179
|
+
|
|
180
|
+
if not match:
|
|
181
|
+
return None
|
|
182
|
+
|
|
183
|
+
lag_size = int(match.group(1))
|
|
184
|
+
lag_unit = match.group(2)
|
|
185
|
+
|
|
186
|
+
return cls(lag_size=lag_size, lag_unit=lag_unit)
|
|
187
|
+
|
|
188
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
189
|
+
res = super().get_params()
|
|
190
|
+
res.update(
|
|
191
|
+
{
|
|
192
|
+
"lag_size": self.lag_size,
|
|
193
|
+
"lag_unit": self.lag_unit,
|
|
194
|
+
}
|
|
195
|
+
)
|
|
196
|
+
return res
|
|
197
|
+
|
|
198
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
199
|
+
lag_window = self.lag_size + 1
|
|
200
|
+
return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
|
upgini/autofe/unary.py
CHANGED
|
@@ -2,7 +2,7 @@ from typing import Dict, Optional
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Abs(PandasOperand, VectorizableMixin):
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,14 +1,8 @@
|
|
|
1
|
-
import abc
|
|
2
|
-
from typing import Dict, List, Optional
|
|
1
|
+
from typing import List, Optional
|
|
3
2
|
|
|
4
3
|
import pandas as pd
|
|
5
4
|
|
|
6
|
-
try:
|
|
7
|
-
from pydantic import field_validator as validator # V2
|
|
8
|
-
except ImportError:
|
|
9
|
-
from pydantic import validator # V1
|
|
10
|
-
|
|
11
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, VectorizableMixin
|
|
12
6
|
|
|
13
7
|
|
|
14
8
|
class Mean(PandasOperand, VectorizableMixin):
|
|
@@ -28,193 +22,3 @@ class Sum(PandasOperand, VectorizableMixin):
|
|
|
28
22
|
|
|
29
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
30
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
34
|
-
is_vector: bool = True
|
|
35
|
-
date_unit: Optional[str] = None
|
|
36
|
-
offset_size: int = 0
|
|
37
|
-
offset_unit: str = "D"
|
|
38
|
-
|
|
39
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
40
|
-
res = super().get_params()
|
|
41
|
-
res.update(
|
|
42
|
-
{
|
|
43
|
-
"date_unit": self.date_unit,
|
|
44
|
-
"offset_size": self.offset_size,
|
|
45
|
-
"offset_unit": self.offset_unit,
|
|
46
|
-
}
|
|
47
|
-
)
|
|
48
|
-
return res
|
|
49
|
-
|
|
50
|
-
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
51
|
-
# assuming first is date, last is value, rest is group columns
|
|
52
|
-
date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
|
|
53
|
-
ts = pd.concat([date] + data[1:], axis=1)
|
|
54
|
-
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
55
|
-
ts.set_index(date.name, inplace=True)
|
|
56
|
-
ts = ts[ts.index.notna()].sort_index()
|
|
57
|
-
ts = (
|
|
58
|
-
ts.groupby([c.name for c in data[1:-1]])
|
|
59
|
-
.apply(self._shift)[data[-1].name]
|
|
60
|
-
.to_frame()
|
|
61
|
-
.reset_index()
|
|
62
|
-
.set_index(date.name)
|
|
63
|
-
.groupby([c.name for c in data[1:-1]])
|
|
64
|
-
if len(data) > 2
|
|
65
|
-
else self._shift(ts)
|
|
66
|
-
)
|
|
67
|
-
ts = self._aggregate(ts)
|
|
68
|
-
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
69
|
-
ts.index = date.index
|
|
70
|
-
|
|
71
|
-
return ts.iloc[:, -1]
|
|
72
|
-
|
|
73
|
-
def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
74
|
-
if self.offset_size > 0:
|
|
75
|
-
return ts.iloc[:, :-1].merge(
|
|
76
|
-
ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
|
|
77
|
-
left_index=True,
|
|
78
|
-
right_index=True,
|
|
79
|
-
)
|
|
80
|
-
return ts
|
|
81
|
-
|
|
82
|
-
@abc.abstractmethod
|
|
83
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
84
|
-
pass
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
91
|
-
aggregation: str
|
|
92
|
-
window_size: int = 1
|
|
93
|
-
window_unit: str = "D"
|
|
94
|
-
|
|
95
|
-
@validator("window_unit")
|
|
96
|
-
@classmethod
|
|
97
|
-
def validate_window_unit(cls, v: str) -> str:
|
|
98
|
-
try:
|
|
99
|
-
pd.tseries.frequencies.to_offset(v)
|
|
100
|
-
return v
|
|
101
|
-
except ValueError:
|
|
102
|
-
raise ValueError(
|
|
103
|
-
f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
def to_formula(self) -> str:
|
|
107
|
-
roll_component = f"roll_{self.window_size}{self.window_unit}"
|
|
108
|
-
if self.offset_size > 0:
|
|
109
|
-
roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
110
|
-
return f"{roll_component}_{self.aggregation}"
|
|
111
|
-
|
|
112
|
-
@classmethod
|
|
113
|
-
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
114
|
-
import re
|
|
115
|
-
|
|
116
|
-
# Try matching pattern with offset first
|
|
117
|
-
pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
|
|
118
|
-
match_with_offset = re.match(pattern_with_offset, formula)
|
|
119
|
-
|
|
120
|
-
if match_with_offset:
|
|
121
|
-
window_size = int(match_with_offset.group(1))
|
|
122
|
-
window_unit = match_with_offset.group(2)
|
|
123
|
-
offset_size = int(match_with_offset.group(3))
|
|
124
|
-
offset_unit = match_with_offset.group(4)
|
|
125
|
-
aggregation = match_with_offset.group(5)
|
|
126
|
-
|
|
127
|
-
return cls(
|
|
128
|
-
window_size=window_size,
|
|
129
|
-
window_unit=window_unit,
|
|
130
|
-
offset_size=offset_size,
|
|
131
|
-
offset_unit=offset_unit,
|
|
132
|
-
aggregation=aggregation,
|
|
133
|
-
)
|
|
134
|
-
|
|
135
|
-
# If no offset pattern found, try basic pattern
|
|
136
|
-
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
137
|
-
match = re.match(pattern, formula)
|
|
138
|
-
|
|
139
|
-
if not match:
|
|
140
|
-
return None
|
|
141
|
-
|
|
142
|
-
window_size = int(match.group(1))
|
|
143
|
-
window_unit = match.group(2)
|
|
144
|
-
aggregation = match.group(3)
|
|
145
|
-
|
|
146
|
-
return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
|
|
147
|
-
|
|
148
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
149
|
-
res = super().get_params()
|
|
150
|
-
res.update(
|
|
151
|
-
{
|
|
152
|
-
"window_size": self.window_size,
|
|
153
|
-
"window_unit": self.window_unit,
|
|
154
|
-
"aggregation": self.aggregation,
|
|
155
|
-
}
|
|
156
|
-
)
|
|
157
|
-
return res
|
|
158
|
-
|
|
159
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
160
|
-
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
|
|
161
|
-
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
162
|
-
)
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
166
|
-
lag_size: int
|
|
167
|
-
lag_unit: str = "D"
|
|
168
|
-
|
|
169
|
-
def to_formula(self) -> str:
|
|
170
|
-
lag_component = f"lag_{self.lag_size}{self.lag_unit}"
|
|
171
|
-
if self.offset_size > 0:
|
|
172
|
-
lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
173
|
-
return lag_component
|
|
174
|
-
|
|
175
|
-
@classmethod
|
|
176
|
-
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
177
|
-
import re
|
|
178
|
-
|
|
179
|
-
# Try matching pattern with offset first
|
|
180
|
-
pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
|
|
181
|
-
match_with_offset = re.match(pattern_with_offset, formula)
|
|
182
|
-
|
|
183
|
-
if match_with_offset:
|
|
184
|
-
lag_size = int(match_with_offset.group(1))
|
|
185
|
-
lag_unit = match_with_offset.group(2)
|
|
186
|
-
offset_size = int(match_with_offset.group(3))
|
|
187
|
-
offset_unit = match_with_offset.group(4)
|
|
188
|
-
|
|
189
|
-
return cls(
|
|
190
|
-
lag_size=lag_size,
|
|
191
|
-
lag_unit=lag_unit,
|
|
192
|
-
offset_size=offset_size,
|
|
193
|
-
offset_unit=offset_unit,
|
|
194
|
-
)
|
|
195
|
-
|
|
196
|
-
# If no offset pattern found, try basic pattern
|
|
197
|
-
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
198
|
-
match = re.match(pattern, formula)
|
|
199
|
-
|
|
200
|
-
if not match:
|
|
201
|
-
return None
|
|
202
|
-
|
|
203
|
-
lag_size = int(match.group(1))
|
|
204
|
-
lag_unit = match.group(2)
|
|
205
|
-
|
|
206
|
-
return cls(lag_size=lag_size, lag_unit=lag_unit)
|
|
207
|
-
|
|
208
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
209
|
-
res = super().get_params()
|
|
210
|
-
res.update(
|
|
211
|
-
{
|
|
212
|
-
"lag_size": self.lag_size,
|
|
213
|
-
"lag_unit": self.lag_unit,
|
|
214
|
-
}
|
|
215
|
-
)
|
|
216
|
-
return res
|
|
217
|
-
|
|
218
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
219
|
-
lag_window = self.lag_size + 1
|
|
220
|
-
return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
|
upgini/dataset.py
CHANGED
|
@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
|
40
40
|
from upgini.utils.target_utils import (
|
|
41
41
|
balance_undersample,
|
|
42
42
|
balance_undersample_forced,
|
|
43
|
-
balance_undersample_time_series,
|
|
43
|
+
balance_undersample_time_series_trunc,
|
|
44
44
|
)
|
|
45
45
|
|
|
46
46
|
try:
|
|
@@ -58,6 +58,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
58
58
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
59
59
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
60
60
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
61
|
+
FIT_SAMPLE_THRESHOLD_TS = 54_000
|
|
62
|
+
FIT_SAMPLE_ROWS_TS = 54_000
|
|
61
63
|
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
62
64
|
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
63
65
|
IMBALANCE_THESHOLD = 0.6
|
|
@@ -301,7 +303,10 @@ class Dataset: # (pd.DataFrame):
|
|
|
301
303
|
)
|
|
302
304
|
|
|
303
305
|
# Resample over fit threshold
|
|
304
|
-
if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
|
306
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
307
|
+
sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
|
|
308
|
+
sample_rows = self.FIT_SAMPLE_ROWS_TS
|
|
309
|
+
elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
|
305
310
|
sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
|
306
311
|
sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
|
307
312
|
else:
|
|
@@ -314,7 +319,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
314
319
|
f"and will be downsampled to {sample_rows}"
|
|
315
320
|
)
|
|
316
321
|
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
317
|
-
resampled_data = balance_undersample_time_series(
|
|
322
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
318
323
|
df=self.data,
|
|
319
324
|
id_columns=self.id_columns,
|
|
320
325
|
date_column=next(
|
|
@@ -584,10 +589,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
584
589
|
return search_customization
|
|
585
590
|
|
|
586
591
|
def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
587
|
-
if (
|
|
588
|
-
runtime_parameters is not None
|
|
589
|
-
and runtime_parameters.properties is not None
|
|
590
|
-
):
|
|
592
|
+
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
|
591
593
|
if "generate_features" in runtime_parameters.properties:
|
|
592
594
|
generate_features = runtime_parameters.properties["generate_features"].split(",")
|
|
593
595
|
renamed_generate_features = []
|
|
@@ -607,6 +609,13 @@ class Dataset: # (pd.DataFrame):
|
|
|
607
609
|
|
|
608
610
|
return runtime_parameters
|
|
609
611
|
|
|
612
|
+
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
613
|
+
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
|
614
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
615
|
+
runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
|
616
|
+
runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
|
617
|
+
return runtime_parameters
|
|
618
|
+
|
|
610
619
|
def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
611
620
|
if (
|
|
612
621
|
runtime_parameters is not None
|
|
@@ -638,6 +647,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
638
647
|
file_metrics = FileMetrics()
|
|
639
648
|
|
|
640
649
|
runtime_parameters = self._rename_generate_features(runtime_parameters)
|
|
650
|
+
runtime_parameters = self._set_sample_size(runtime_parameters)
|
|
641
651
|
|
|
642
652
|
file_metadata = self.__construct_metadata(exclude_features_sources)
|
|
643
653
|
search_customization = self.__construct_search_customization(
|
upgini/features_enricher.py
CHANGED
|
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
|
|
|
31
31
|
from sklearn.model_selection import BaseCrossValidator
|
|
32
32
|
|
|
33
33
|
from upgini.autofe.feature import Feature
|
|
34
|
-
from upgini.autofe.vector import TimeSeriesBase
|
|
34
|
+
from upgini.autofe.timeseries import TimeSeriesBase
|
|
35
35
|
from upgini.data_source.data_source_publisher import CommercialSchema
|
|
36
36
|
from upgini.dataset import Dataset
|
|
37
37
|
from upgini.errors import HttpError, ValidationError
|
upgini/utils/target_utils.py
CHANGED
|
@@ -9,6 +9,7 @@ from upgini.errors import ValidationError
|
|
|
9
9
|
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
10
10
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
11
11
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
12
|
+
from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
|
|
12
13
|
|
|
13
14
|
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
14
15
|
|
|
@@ -240,7 +241,7 @@ def balance_undersample_forced(
|
|
|
240
241
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
241
242
|
if cv_type is not None and cv_type.is_time_series():
|
|
242
243
|
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
243
|
-
resampled_data = balance_undersample_time_series(
|
|
244
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
244
245
|
df,
|
|
245
246
|
id_columns=id_columns,
|
|
246
247
|
date_column=date_column,
|
|
@@ -279,6 +280,58 @@ def balance_undersample_forced(
|
|
|
279
280
|
return resampled_data
|
|
280
281
|
|
|
281
282
|
|
|
283
|
+
DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
|
|
284
|
+
DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
|
|
285
|
+
DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def balance_undersample_time_series_trunc(
|
|
289
|
+
df: pd.DataFrame,
|
|
290
|
+
id_columns: List[str],
|
|
291
|
+
date_column: str,
|
|
292
|
+
sample_size: int,
|
|
293
|
+
random_state: int = 42,
|
|
294
|
+
logger: Optional[logging.Logger] = None,
|
|
295
|
+
highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
|
|
296
|
+
lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
|
|
297
|
+
time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
|
|
298
|
+
**kwargs,
|
|
299
|
+
):
|
|
300
|
+
# Convert date column to datetime
|
|
301
|
+
dates_df = df[id_columns + [date_column]].copy()
|
|
302
|
+
dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
|
|
303
|
+
|
|
304
|
+
time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
|
|
305
|
+
if logger is not None:
|
|
306
|
+
logger.info(f"Time unit: {time_unit}")
|
|
307
|
+
|
|
308
|
+
if time_unit is None:
|
|
309
|
+
if logger is not None:
|
|
310
|
+
logger.info("Cannot detect time unit, returning original dataset")
|
|
311
|
+
return df
|
|
312
|
+
|
|
313
|
+
if time_unit < time_unit_threshold:
|
|
314
|
+
for trunc_length in highfreq_trunc_lengths:
|
|
315
|
+
sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
|
|
316
|
+
if len(sampled_df) <= sample_size:
|
|
317
|
+
break
|
|
318
|
+
if len(sampled_df) > sample_size:
|
|
319
|
+
sampled_df = balance_undersample_time_series(
|
|
320
|
+
sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
|
|
321
|
+
)
|
|
322
|
+
else:
|
|
323
|
+
for trunc_length in lowfreq_trunc_lengths:
|
|
324
|
+
sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
|
|
325
|
+
if len(sampled_df) <= sample_size:
|
|
326
|
+
break
|
|
327
|
+
if len(sampled_df) > sample_size:
|
|
328
|
+
sampled_df = balance_undersample_time_series(
|
|
329
|
+
sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
|
|
330
|
+
)
|
|
331
|
+
|
|
332
|
+
return df.loc[sampled_df.index]
|
|
333
|
+
|
|
334
|
+
|
|
282
335
|
def balance_undersample_time_series(
|
|
283
336
|
df: pd.DataFrame,
|
|
284
337
|
id_columns: List[str],
|
upgini/utils/ts_utils.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
|
|
7
|
+
|
|
8
|
+
def closest_unit(diff):
|
|
9
|
+
return pd.tseries.frequencies.to_offset(pd.Timedelta(diff, unit="s"))
|
|
10
|
+
|
|
11
|
+
all_diffs = []
|
|
12
|
+
groups = df.groupby(id_columns) if id_columns else [(None, df)]
|
|
13
|
+
for _, group in groups:
|
|
14
|
+
group_dates = group[date_column].sort_values().unique()
|
|
15
|
+
if len(group_dates) > 1:
|
|
16
|
+
diff_series = pd.Series(group_dates[1:] - group_dates[:-1])
|
|
17
|
+
diff_ns = diff_series.dt.total_seconds()
|
|
18
|
+
all_diffs.extend(diff_ns)
|
|
19
|
+
|
|
20
|
+
all_diffs = pd.Series(all_diffs)
|
|
21
|
+
|
|
22
|
+
most_frequent_unit = all_diffs.apply(closest_unit).mode().min()
|
|
23
|
+
|
|
24
|
+
return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def trunc_datetime(
|
|
28
|
+
df: pd.DataFrame,
|
|
29
|
+
id_columns: List[str],
|
|
30
|
+
date_column: str,
|
|
31
|
+
length: pd.DateOffset,
|
|
32
|
+
logger: Optional[logging.Logger] = None,
|
|
33
|
+
) -> pd.DataFrame:
|
|
34
|
+
if logger is not None:
|
|
35
|
+
logger.info(f"Truncating time series dataset to {length}")
|
|
36
|
+
|
|
37
|
+
if id_columns:
|
|
38
|
+
min_datetime = df.groupby(id_columns)[date_column].transform(lambda group: group.max() - length)
|
|
39
|
+
else:
|
|
40
|
+
min_datetime = df[date_column].max() - length
|
|
41
|
+
return df[df[date_column] > min_datetime]
|
|
{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/RECORD
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256
|
|
1
|
+
upgini/__about__.py,sha256=-inFSOjK0otU7oAU9xIxafvjGaGWyHQqEAz5nWw5yqI,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=cB2I5rNpbztjkYEEW5aJuKj2fCMnfxp40X4Eo63oyuQ,205340
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
@@ -14,14 +14,15 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
21
|
-
upgini/autofe/groupby.py,sha256=
|
|
22
|
-
upgini/autofe/
|
|
23
|
-
upgini/autofe/
|
|
24
|
-
upgini/autofe/
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=VIT5jCq5U-qypdNz1MIQ_hlIAs0ujJgRfKRUkU24nFs,332
|
|
18
|
+
upgini/autofe/binary.py,sha256=jsXa_zwlNWRmQAT5qipzU2Or03qae-a1kkY9yDECkq8,7660
|
|
19
|
+
upgini/autofe/date.py,sha256=bmoXU5vlDa1xsfCIFEC_VMRHOnV8Sy_KUMshqh0ARvA,10722
|
|
20
|
+
upgini/autofe/feature.py,sha256=n4sNNFM9b022AGJbW14AMRuERD9bwub-RWqa6hfLID0,14750
|
|
21
|
+
upgini/autofe/groupby.py,sha256=NN0T-tYbTHQDeCi2UZ06wVkDflm8DJBV4rdGrrVyVEE,3596
|
|
22
|
+
upgini/autofe/operator.py,sha256=VCGDUQ5bOtwX-jzmgHDrKF3GbglDumyEkvtLWTmSGQo,4776
|
|
23
|
+
upgini/autofe/timeseries.py,sha256=Pci7kNpFcViNZdIHlVTyxjoxzcMVdqUPopbPrJ3hE20,6593
|
|
24
|
+
upgini/autofe/unary.py,sha256=my7AYIrWCQPFxRtcphONmwieU5HpX4fHiKllFRCsMUk,4647
|
|
25
|
+
upgini/autofe/vector.py,sha256=5Lx2q_Np9PrMtZ_8O86xywq0s4XSQbooHxK3ufo3ANU,664
|
|
25
26
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
27
|
upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
|
|
27
28
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
@@ -58,10 +59,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
|
|
|
58
59
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
59
60
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
60
61
|
upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
|
|
61
|
-
upgini/utils/target_utils.py,sha256=
|
|
62
|
+
upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
|
|
62
63
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
64
|
+
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
63
65
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
66
|
-
upgini-1.2.
|
|
67
|
-
upgini-1.2.
|
|
66
|
+
upgini-1.2.62a3818.dev1.dist-info/METADATA,sha256=9mRM2yQ18CeOTHQ83UgVmItZ-npsZSla3illeXSpyTQ,49094
|
|
67
|
+
upgini-1.2.62a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
68
|
+
upgini-1.2.62a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
69
|
+
upgini-1.2.62a3818.dev1.dist-info/RECORD,,
|
|
File without changes
|