upgini 1.2.60a3792.dev2__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +2 -2
- upgini/autofe/binary.py +1 -1
- upgini/autofe/date.py +2 -2
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +1 -1
- upgini/autofe/{operand.py → operator.py} +2 -2
- upgini/autofe/timeseries.py +200 -0
- upgini/autofe/unary.py +1 -1
- upgini/autofe/vector.py +2 -198
- upgini/data_source/data_source_publisher.py +9 -4
- upgini/features_enricher.py +108 -46
- upgini/metrics.py +4 -7
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/datetime_utils.py +2 -0
- upgini/utils/mstats.py +177 -0
- upgini/utils/sort.py +172 -0
- upgini/utils/target_utils.py +3 -3
- upgini/utils/ts_utils.py +0 -6
- {upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/METADATA +2 -1
- {upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/RECORD +23 -20
- {upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.60a3792.dev2"
|
|
1
|
+
__version__ = "1.2.62a3818.dev1"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from upgini.autofe.operand import OperandRegistry
|
|
1
|
+
from upgini.autofe.operator import OperatorRegistry
|
|
2
2
|
from upgini.autofe.unary import * # noqa
|
|
3
3
|
from upgini.autofe.binary import * # noqa
|
|
4
4
|
from upgini.autofe.groupby import * # noqa
|
|
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def find_op(name):
|
|
10
|
-
return OperandRegistry.get_operand(name)
|
|
10
|
+
return OperatorRegistry.get_operand(name)
|
upgini/autofe/binary.py
CHANGED
upgini/autofe/date.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
import json
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Dict, List, Optional, Union
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand
|
|
10
|
+
from upgini.autofe.operator import PandasOperand, ParametrizedOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def get_pydantic_version():
|
upgini/autofe/feature.py
CHANGED
upgini/autofe/groupby.py
CHANGED
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class OperandRegistry(type(BaseModel)):
|
|
9
|
+
class OperatorRegistry(type(BaseModel)):
|
|
10
10
|
_registry = {}
|
|
11
11
|
_parametrized_registry = []
|
|
12
12
|
|
|
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
|
|
|
46
46
|
return None
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
class Operand(BaseModel, metaclass=OperandRegistry):
|
|
49
|
+
class Operand(BaseModel, metaclass=OperatorRegistry):
|
|
50
50
|
name: Optional[str] = None
|
|
51
51
|
alias: Optional[str] = None
|
|
52
52
|
is_unary: bool = False
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, ParametrizedOperand
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from pydantic import field_validator as validator # V2
|
|
9
|
+
except ImportError:
|
|
10
|
+
from pydantic import validator # V1
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
14
|
+
is_vector: bool = True
|
|
15
|
+
date_unit: Optional[str] = None
|
|
16
|
+
offset_size: int = 0
|
|
17
|
+
offset_unit: str = "D"
|
|
18
|
+
|
|
19
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
20
|
+
res = super().get_params()
|
|
21
|
+
res.update(
|
|
22
|
+
{
|
|
23
|
+
"date_unit": self.date_unit,
|
|
24
|
+
"offset_size": self.offset_size,
|
|
25
|
+
"offset_unit": self.offset_unit,
|
|
26
|
+
}
|
|
27
|
+
)
|
|
28
|
+
return res
|
|
29
|
+
|
|
30
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
31
|
+
# assuming first is date, last is value, rest is group columns
|
|
32
|
+
date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
|
|
33
|
+
ts = pd.concat([date] + data[1:], axis=1)
|
|
34
|
+
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
35
|
+
ts.set_index(date.name, inplace=True)
|
|
36
|
+
ts = ts[ts.index.notna()].sort_index()
|
|
37
|
+
ts = (
|
|
38
|
+
ts.groupby([c.name for c in data[1:-1]], group_keys=True)
|
|
39
|
+
.apply(self._shift)[data[-1].name]
|
|
40
|
+
.to_frame()
|
|
41
|
+
.reset_index()
|
|
42
|
+
.set_index(date.name)
|
|
43
|
+
.groupby([c.name for c in data[1:-1]])
|
|
44
|
+
if len(data) > 2
|
|
45
|
+
else self._shift(ts)
|
|
46
|
+
)
|
|
47
|
+
ts = self._aggregate(ts)
|
|
48
|
+
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
49
|
+
ts.index = date.index
|
|
50
|
+
|
|
51
|
+
return ts.iloc[:, -1]
|
|
52
|
+
|
|
53
|
+
def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
54
|
+
if self.offset_size > 0:
|
|
55
|
+
return ts.iloc[:, :-1].merge(
|
|
56
|
+
ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
|
|
57
|
+
left_index=True,
|
|
58
|
+
right_index=True,
|
|
59
|
+
)
|
|
60
|
+
return ts
|
|
61
|
+
|
|
62
|
+
@abc.abstractmethod
|
|
63
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
71
|
+
aggregation: str
|
|
72
|
+
window_size: int = 1
|
|
73
|
+
window_unit: str = "D"
|
|
74
|
+
|
|
75
|
+
@validator("window_unit")
|
|
76
|
+
@classmethod
|
|
77
|
+
def validate_window_unit(cls, v: str) -> str:
|
|
78
|
+
try:
|
|
79
|
+
pd.tseries.frequencies.to_offset(v)
|
|
80
|
+
return v
|
|
81
|
+
except ValueError:
|
|
82
|
+
raise ValueError(
|
|
83
|
+
f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
def to_formula(self) -> str:
|
|
87
|
+
roll_component = f"roll_{self.window_size}{self.window_unit}"
|
|
88
|
+
if self.offset_size > 0:
|
|
89
|
+
roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
90
|
+
return f"{roll_component}_{self.aggregation}"
|
|
91
|
+
|
|
92
|
+
@classmethod
|
|
93
|
+
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
94
|
+
import re
|
|
95
|
+
|
|
96
|
+
# Try matching pattern with offset first
|
|
97
|
+
pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
|
|
98
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
99
|
+
|
|
100
|
+
if match_with_offset:
|
|
101
|
+
window_size = int(match_with_offset.group(1))
|
|
102
|
+
window_unit = match_with_offset.group(2)
|
|
103
|
+
offset_size = int(match_with_offset.group(3))
|
|
104
|
+
offset_unit = match_with_offset.group(4)
|
|
105
|
+
aggregation = match_with_offset.group(5)
|
|
106
|
+
|
|
107
|
+
return cls(
|
|
108
|
+
window_size=window_size,
|
|
109
|
+
window_unit=window_unit,
|
|
110
|
+
offset_size=offset_size,
|
|
111
|
+
offset_unit=offset_unit,
|
|
112
|
+
aggregation=aggregation,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
# If no offset pattern found, try basic pattern
|
|
116
|
+
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
117
|
+
match = re.match(pattern, formula)
|
|
118
|
+
|
|
119
|
+
if not match:
|
|
120
|
+
return None
|
|
121
|
+
|
|
122
|
+
window_size = int(match.group(1))
|
|
123
|
+
window_unit = match.group(2)
|
|
124
|
+
aggregation = match.group(3)
|
|
125
|
+
|
|
126
|
+
return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
|
|
127
|
+
|
|
128
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
129
|
+
res = super().get_params()
|
|
130
|
+
res.update(
|
|
131
|
+
{
|
|
132
|
+
"window_size": self.window_size,
|
|
133
|
+
"window_unit": self.window_unit,
|
|
134
|
+
"aggregation": self.aggregation,
|
|
135
|
+
}
|
|
136
|
+
)
|
|
137
|
+
return res
|
|
138
|
+
|
|
139
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
140
|
+
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
|
|
141
|
+
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
146
|
+
lag_size: int
|
|
147
|
+
lag_unit: str = "D"
|
|
148
|
+
|
|
149
|
+
def to_formula(self) -> str:
|
|
150
|
+
lag_component = f"lag_{self.lag_size}{self.lag_unit}"
|
|
151
|
+
if self.offset_size > 0:
|
|
152
|
+
lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
153
|
+
return lag_component
|
|
154
|
+
|
|
155
|
+
@classmethod
|
|
156
|
+
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
157
|
+
import re
|
|
158
|
+
|
|
159
|
+
# Try matching pattern with offset first
|
|
160
|
+
pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
|
|
161
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
162
|
+
|
|
163
|
+
if match_with_offset:
|
|
164
|
+
lag_size = int(match_with_offset.group(1))
|
|
165
|
+
lag_unit = match_with_offset.group(2)
|
|
166
|
+
offset_size = int(match_with_offset.group(3))
|
|
167
|
+
offset_unit = match_with_offset.group(4)
|
|
168
|
+
|
|
169
|
+
return cls(
|
|
170
|
+
lag_size=lag_size,
|
|
171
|
+
lag_unit=lag_unit,
|
|
172
|
+
offset_size=offset_size,
|
|
173
|
+
offset_unit=offset_unit,
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# If no offset pattern found, try basic pattern
|
|
177
|
+
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
178
|
+
match = re.match(pattern, formula)
|
|
179
|
+
|
|
180
|
+
if not match:
|
|
181
|
+
return None
|
|
182
|
+
|
|
183
|
+
lag_size = int(match.group(1))
|
|
184
|
+
lag_unit = match.group(2)
|
|
185
|
+
|
|
186
|
+
return cls(lag_size=lag_size, lag_unit=lag_unit)
|
|
187
|
+
|
|
188
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
189
|
+
res = super().get_params()
|
|
190
|
+
res.update(
|
|
191
|
+
{
|
|
192
|
+
"lag_size": self.lag_size,
|
|
193
|
+
"lag_unit": self.lag_unit,
|
|
194
|
+
}
|
|
195
|
+
)
|
|
196
|
+
return res
|
|
197
|
+
|
|
198
|
+
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
199
|
+
lag_window = self.lag_size + 1
|
|
200
|
+
return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
|
upgini/autofe/unary.py
CHANGED
|
@@ -2,7 +2,7 @@ from typing import Dict, Optional
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Abs(PandasOperand, VectorizableMixin):
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,14 +1,8 @@
|
|
|
1
|
-
import abc
|
|
2
|
-
from typing import Dict, List, Optional
|
|
1
|
+
from typing import List, Optional
|
|
3
2
|
|
|
4
3
|
import pandas as pd
|
|
5
4
|
|
|
6
|
-
try:
|
|
7
|
-
from pydantic import field_validator as validator # V2
|
|
8
|
-
except ImportError:
|
|
9
|
-
from pydantic import validator # V1
|
|
10
|
-
|
|
11
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, VectorizableMixin
|
|
12
6
|
|
|
13
7
|
|
|
14
8
|
class Mean(PandasOperand, VectorizableMixin):
|
|
@@ -28,193 +22,3 @@ class Sum(PandasOperand, VectorizableMixin):
|
|
|
28
22
|
|
|
29
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
30
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
34
|
-
is_vector: bool = True
|
|
35
|
-
date_unit: Optional[str] = None
|
|
36
|
-
offset_size: int = 0
|
|
37
|
-
offset_unit: str = "D"
|
|
38
|
-
|
|
39
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
40
|
-
res = super().get_params()
|
|
41
|
-
res.update(
|
|
42
|
-
{
|
|
43
|
-
"date_unit": self.date_unit,
|
|
44
|
-
"offset_size": self.offset_size,
|
|
45
|
-
"offset_unit": self.offset_unit,
|
|
46
|
-
}
|
|
47
|
-
)
|
|
48
|
-
return res
|
|
49
|
-
|
|
50
|
-
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
51
|
-
# assuming first is date, last is value, rest is group columns
|
|
52
|
-
date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
|
|
53
|
-
ts = pd.concat([date] + data[1:], axis=1)
|
|
54
|
-
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
55
|
-
ts.set_index(date.name, inplace=True)
|
|
56
|
-
ts = ts[ts.index.notna()].sort_index()
|
|
57
|
-
ts = (
|
|
58
|
-
ts.groupby([c.name for c in data[1:-1]])
|
|
59
|
-
.apply(self._shift)[data[-1].name]
|
|
60
|
-
.to_frame()
|
|
61
|
-
.reset_index()
|
|
62
|
-
.set_index(date.name)
|
|
63
|
-
.groupby([c.name for c in data[1:-1]])
|
|
64
|
-
if len(data) > 2
|
|
65
|
-
else self._shift(ts)
|
|
66
|
-
)
|
|
67
|
-
ts = self._aggregate(ts)
|
|
68
|
-
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
69
|
-
ts.index = date.index
|
|
70
|
-
|
|
71
|
-
return ts.iloc[:, -1]
|
|
72
|
-
|
|
73
|
-
def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
74
|
-
if self.offset_size > 0:
|
|
75
|
-
return ts.iloc[:, :-1].merge(
|
|
76
|
-
ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
|
|
77
|
-
left_index=True,
|
|
78
|
-
right_index=True,
|
|
79
|
-
)
|
|
80
|
-
return ts
|
|
81
|
-
|
|
82
|
-
@abc.abstractmethod
|
|
83
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
84
|
-
pass
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
91
|
-
aggregation: str
|
|
92
|
-
window_size: int = 1
|
|
93
|
-
window_unit: str = "D"
|
|
94
|
-
|
|
95
|
-
@validator("window_unit")
|
|
96
|
-
@classmethod
|
|
97
|
-
def validate_window_unit(cls, v: str) -> str:
|
|
98
|
-
try:
|
|
99
|
-
pd.tseries.frequencies.to_offset(v)
|
|
100
|
-
return v
|
|
101
|
-
except ValueError:
|
|
102
|
-
raise ValueError(
|
|
103
|
-
f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
def to_formula(self) -> str:
|
|
107
|
-
roll_component = f"roll_{self.window_size}{self.window_unit}"
|
|
108
|
-
if self.offset_size > 0:
|
|
109
|
-
roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
110
|
-
return f"{roll_component}_{self.aggregation}"
|
|
111
|
-
|
|
112
|
-
@classmethod
|
|
113
|
-
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
114
|
-
import re
|
|
115
|
-
|
|
116
|
-
# Try matching pattern with offset first
|
|
117
|
-
pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
|
|
118
|
-
match_with_offset = re.match(pattern_with_offset, formula)
|
|
119
|
-
|
|
120
|
-
if match_with_offset:
|
|
121
|
-
window_size = int(match_with_offset.group(1))
|
|
122
|
-
window_unit = match_with_offset.group(2)
|
|
123
|
-
offset_size = int(match_with_offset.group(3))
|
|
124
|
-
offset_unit = match_with_offset.group(4)
|
|
125
|
-
aggregation = match_with_offset.group(5)
|
|
126
|
-
|
|
127
|
-
return cls(
|
|
128
|
-
window_size=window_size,
|
|
129
|
-
window_unit=window_unit,
|
|
130
|
-
offset_size=offset_size,
|
|
131
|
-
offset_unit=offset_unit,
|
|
132
|
-
aggregation=aggregation,
|
|
133
|
-
)
|
|
134
|
-
|
|
135
|
-
# If no offset pattern found, try basic pattern
|
|
136
|
-
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
137
|
-
match = re.match(pattern, formula)
|
|
138
|
-
|
|
139
|
-
if not match:
|
|
140
|
-
return None
|
|
141
|
-
|
|
142
|
-
window_size = int(match.group(1))
|
|
143
|
-
window_unit = match.group(2)
|
|
144
|
-
aggregation = match.group(3)
|
|
145
|
-
|
|
146
|
-
return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
|
|
147
|
-
|
|
148
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
149
|
-
res = super().get_params()
|
|
150
|
-
res.update(
|
|
151
|
-
{
|
|
152
|
-
"window_size": self.window_size,
|
|
153
|
-
"window_unit": self.window_unit,
|
|
154
|
-
"aggregation": self.aggregation,
|
|
155
|
-
}
|
|
156
|
-
)
|
|
157
|
-
return res
|
|
158
|
-
|
|
159
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
160
|
-
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
|
|
161
|
-
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
162
|
-
)
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
166
|
-
lag_size: int
|
|
167
|
-
lag_unit: str = "D"
|
|
168
|
-
|
|
169
|
-
def to_formula(self) -> str:
|
|
170
|
-
lag_component = f"lag_{self.lag_size}{self.lag_unit}"
|
|
171
|
-
if self.offset_size > 0:
|
|
172
|
-
lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
173
|
-
return lag_component
|
|
174
|
-
|
|
175
|
-
@classmethod
|
|
176
|
-
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
177
|
-
import re
|
|
178
|
-
|
|
179
|
-
# Try matching pattern with offset first
|
|
180
|
-
pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
|
|
181
|
-
match_with_offset = re.match(pattern_with_offset, formula)
|
|
182
|
-
|
|
183
|
-
if match_with_offset:
|
|
184
|
-
lag_size = int(match_with_offset.group(1))
|
|
185
|
-
lag_unit = match_with_offset.group(2)
|
|
186
|
-
offset_size = int(match_with_offset.group(3))
|
|
187
|
-
offset_unit = match_with_offset.group(4)
|
|
188
|
-
|
|
189
|
-
return cls(
|
|
190
|
-
lag_size=lag_size,
|
|
191
|
-
lag_unit=lag_unit,
|
|
192
|
-
offset_size=offset_size,
|
|
193
|
-
offset_unit=offset_unit,
|
|
194
|
-
)
|
|
195
|
-
|
|
196
|
-
# If no offset pattern found, try basic pattern
|
|
197
|
-
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
198
|
-
match = re.match(pattern, formula)
|
|
199
|
-
|
|
200
|
-
if not match:
|
|
201
|
-
return None
|
|
202
|
-
|
|
203
|
-
lag_size = int(match.group(1))
|
|
204
|
-
lag_unit = match.group(2)
|
|
205
|
-
|
|
206
|
-
return cls(lag_size=lag_size, lag_unit=lag_unit)
|
|
207
|
-
|
|
208
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
209
|
-
res = super().get_params()
|
|
210
|
-
res.update(
|
|
211
|
-
{
|
|
212
|
-
"lag_size": self.lag_size,
|
|
213
|
-
"lag_unit": self.lag_unit,
|
|
214
|
-
}
|
|
215
|
-
)
|
|
216
|
-
return res
|
|
217
|
-
|
|
218
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
219
|
-
lag_window = self.lag_size + 1
|
|
220
|
-
return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
|
|
@@ -63,6 +63,7 @@ class DataSourcePublisher:
|
|
|
63
63
|
keep_features: Optional[List[str]] = None,
|
|
64
64
|
date_features: Optional[List[str]] = None,
|
|
65
65
|
date_vector_features: Optional[List[str]] = None,
|
|
66
|
+
date_features_format: Optional[str] = None,
|
|
66
67
|
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
67
68
|
exclude_raw: Optional[List[str]] = None,
|
|
68
69
|
_force_generation=False,
|
|
@@ -160,13 +161,17 @@ class DataSourcePublisher:
|
|
|
160
161
|
if keep_features is not None:
|
|
161
162
|
request["keepFeatures"] = keep_features
|
|
162
163
|
if date_features is not None:
|
|
163
|
-
if
|
|
164
|
-
raise ValidationError("
|
|
164
|
+
if date_features_format is None:
|
|
165
|
+
raise ValidationError("date_features_format should be presented if you use date features")
|
|
165
166
|
request["dateFeatures"] = date_features
|
|
167
|
+
request["dateFeaturesFormat"] = date_features_format
|
|
166
168
|
if date_vector_features is not None:
|
|
167
|
-
if
|
|
168
|
-
raise ValidationError(
|
|
169
|
+
if date_features_format is None:
|
|
170
|
+
raise ValidationError(
|
|
171
|
+
"date_features_format should be presented if you use date vector features"
|
|
172
|
+
)
|
|
169
173
|
request["dateVectorFeatures"] = date_vector_features
|
|
174
|
+
request["dateFeaturesFormat"] = date_features_format
|
|
170
175
|
if generate_runtime_embeddings is not None:
|
|
171
176
|
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
172
177
|
if exclude_raw is not None:
|