upgini 1.2.62a3818.dev2__py3-none-any.whl → 1.2.62a3818.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +1 -1
- upgini/autofe/operator.py +7 -5
- upgini/autofe/timeseries/__init__.py +23 -0
- upgini/autofe/timeseries/base.py +105 -0
- upgini/autofe/timeseries/cross.py +130 -0
- upgini/autofe/timeseries/delta.py +125 -0
- upgini/autofe/timeseries/lag.py +68 -0
- upgini/autofe/timeseries/roll.py +92 -0
- upgini/autofe/timeseries/trend.py +61 -0
- upgini/autofe/timeseries/volatility.py +259 -0
- {upgini-1.2.62a3818.dev2.dist-info → upgini-1.2.62a3818.dev4.dist-info}/METADATA +1 -1
- {upgini-1.2.62a3818.dev2.dist-info → upgini-1.2.62a3818.dev4.dist-info}/RECORD +15 -8
- upgini/autofe/timeseries.py +0 -200
- {upgini-1.2.62a3818.dev2.dist-info → upgini-1.2.62a3818.dev4.dist-info}/WHEEL +0 -0
- {upgini-1.2.62a3818.dev2.dist-info → upgini-1.2.62a3818.dev4.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.62a3818.
|
|
1
|
+
__version__ = "1.2.62a3818.dev4"
|
upgini/autofe/all_operands.py
CHANGED
upgini/autofe/operator.py
CHANGED
|
@@ -20,23 +20,25 @@ class OperatorRegistry(type(BaseModel)):
|
|
|
20
20
|
base_names.update(b.__name__ for b in base.__bases__)
|
|
21
21
|
base_classes.extend(base.__bases__)
|
|
22
22
|
|
|
23
|
-
if "
|
|
23
|
+
if "Operator" in base_names:
|
|
24
24
|
# Track parametrized operands separately
|
|
25
|
-
if "
|
|
25
|
+
if "ParametrizedOperator" in base_names:
|
|
26
26
|
cls._parametrized_registry.append(new_class)
|
|
27
27
|
else:
|
|
28
28
|
try:
|
|
29
29
|
instance = new_class()
|
|
30
30
|
cls._registry[instance.name] = new_class
|
|
31
|
+
if instance.alias:
|
|
32
|
+
cls._registry[instance.alias] = new_class
|
|
31
33
|
except Exception:
|
|
32
34
|
pass
|
|
33
35
|
return new_class
|
|
34
36
|
|
|
35
37
|
@classmethod
|
|
36
|
-
def
|
|
38
|
+
def get_operator(cls, name: str) -> Optional["Operator"]:
|
|
37
39
|
# First try to resolve as a parametrized operand formula
|
|
38
|
-
for
|
|
39
|
-
resolved =
|
|
40
|
+
for operator_cls in cls._parametrized_registry:
|
|
41
|
+
resolved = operator_cls.from_formula(name)
|
|
40
42
|
if resolved is not None:
|
|
41
43
|
return resolved
|
|
42
44
|
# Fall back to direct registry lookup
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Time series feature engineering operators."""
|
|
2
|
+
|
|
3
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
4
|
+
from upgini.autofe.timeseries.roll import Roll
|
|
5
|
+
from upgini.autofe.timeseries.lag import Lag
|
|
6
|
+
from upgini.autofe.timeseries.delta import Delta, Delta2
|
|
7
|
+
from upgini.autofe.timeseries.trend import TrendCoefficient
|
|
8
|
+
from upgini.autofe.timeseries.volatility import EWMAVolatility, RollingVolatility, RollingVolatility2, VolatilityRatio
|
|
9
|
+
from upgini.autofe.timeseries.cross import CrossSeriesInteraction
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"TimeSeriesBase",
|
|
13
|
+
"Roll",
|
|
14
|
+
"Lag",
|
|
15
|
+
"Delta",
|
|
16
|
+
"Delta2",
|
|
17
|
+
"TrendCoefficient",
|
|
18
|
+
"EWMAVolatility",
|
|
19
|
+
"RollingVolatility",
|
|
20
|
+
"RollingVolatility2",
|
|
21
|
+
"VolatilityRatio",
|
|
22
|
+
"CrossSeriesInteraction",
|
|
23
|
+
]
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from upgini.autofe.operator import PandasOperator
|
|
6
|
+
|
|
7
|
+
# Used in derived classes
|
|
8
|
+
try:
|
|
9
|
+
from pydantic import field_validator as validator # V2
|
|
10
|
+
except ImportError:
|
|
11
|
+
from pydantic import validator # V1
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TimeSeriesBase(PandasOperator, abc.ABC):
    """Base class for vector operators evaluated over a date-indexed series.

    Subclasses implement ``_aggregate``; this class builds the date-indexed
    frame, applies the optional time offset and maps the aggregated values
    back onto the index of the input date column.
    """

    is_vector: bool = True
    # Passed as ``unit`` to ``pd.to_datetime`` when converting the date column.
    date_unit: Optional[str] = None
    # Values are shifted forward by ``offset_size`` x ``offset_unit`` when > 0.
    offset_size: int = 0
    offset_unit: str = "D"

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the date/offset settings."""
        res = super().get_params()
        res.update(
            {
                "date_unit": self.date_unit,
                "offset_size": self.offset_size,
                "offset_unit": self.offset_unit,
            }
        )
        return res

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """Compute the operator for every input row.

        Args:
            data: Input columns; the first is assumed to be the date, the last
                the value and everything in between the group columns.

        Returns:
            The last column of the aggregated frame, indexed like the date column.
        """
        # assuming first is date, last is value, rest is group columns
        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
        ts = pd.concat([date] + data[1:], axis=1)
        # Keep one row per (date, group...) combination.
        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
        ts.set_index(date.name, inplace=True)
        # Rows whose date could not be parsed are dropped before sorting.
        ts = ts[ts.index.notna()].sort_index()
        ts = (
            ts.groupby([c.name for c in data[1:-1]], group_keys=True)
            .apply(self._shift)[data[-1].name]
            .to_frame()
            .reset_index()
            .set_index(date.name)
            .groupby([c.name for c in data[1:-1]], group_keys=True)
            if len(data) > 2
            else self._shift(ts)
        )
        ts = self._aggregate(ts)
        # Bring the result back to the original row order / multiplicity.
        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
        ts.index = date.index

        return ts.iloc[:, -1]

    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Shift the value (last) column by the configured offset, if any."""
        if self.offset_size > 0:
            return ts.iloc[:, :-1].merge(
                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
                left_index=True,
                right_index=True,
            )
        return ts

    @abc.abstractmethod
    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Compute the operator-specific aggregation over the prepared frame."""
        pass

    def _add_offset_to_formula(self, base_formula: str) -> str:
        """Append ``_offset_<size><unit>`` to *base_formula* when an offset is set."""
        if self.offset_size > 0:
            return f"{base_formula}_offset_{self.offset_size}{self.offset_unit}"
        return base_formula

    @classmethod
    def _parse_offset_from_formula(
        cls, formula: str, base_regex: str
    ) -> "tuple[Optional[dict], Optional[str]]":
        """
        Parse the offset component from a formula.

        Args:
            formula: The formula to parse
            base_regex: The regex pattern for the base formula (without offset)

        Returns:
            A tuple with:
            - Dictionary with offset parameters if found, None otherwise
            - Remaining part of the formula after removing offset component (for further parsing);
              None when the formula does not start with the base pattern at all
        """
        # NOTE: the return annotation is quoted so that ``tuple[...]`` is never
        # evaluated at definition time (it is not subscriptable before Python 3.9).
        import re

        offset_regex = f"{base_regex}_offset_(\\d+)([a-zA-Z])"
        # The whole formula must match: an unanchored match would silently accept
        # trailing characters (e.g. "..._offset_2Dxyz" or a multi-letter unit)
        # and truncate them.
        match = re.fullmatch(offset_regex, formula)

        if match:
            # The two trailing groups always belong to the offset part.
            offset_size = int(match.group(match.lastindex - 1))
            offset_unit = match.group(match.lastindex)

            # Everything before "_offset_" is the base formula for further parsing.
            base_formula = formula[: match.start(match.lastindex - 1) - len("_offset_")]
            return {"offset_size": offset_size, "offset_unit": offset_unit}, base_formula

        # Check if it matches the base regex (no offset)
        if re.match(f"^{base_regex}$", formula) or re.match(f"^{base_regex}_", formula):
            return None, formula

        return None, None
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from typing import Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from pydantic import field_validator as validator # V2
|
|
8
|
+
except ImportError:
|
|
9
|
+
from pydantic import validator # V1
|
|
10
|
+
|
|
11
|
+
from upgini.autofe.all_operands import find_op
|
|
12
|
+
from upgini.autofe.operator import PandasOperator, ParametrizedOperator
|
|
13
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
    """Apply a binary operator to two sub-series selected by descriptor values.

    ``descriptor_indices`` are positions in the input column list; the rows whose
    columns at those positions equal ``left_descriptor`` / ``right_descriptor``
    form the left / right series passed to ``interaction_op``.
    """

    base_name: str = "cross"
    interaction_op: PandasOperator
    descriptor_indices: List[int] = []
    left_descriptor: List[str] = []
    right_descriptor: List[str] = []

    @validator("descriptor_indices")
    @classmethod
    def validate_descriptor_indices(cls, v):
        """Reject an explicitly supplied empty ``descriptor_indices``."""
        if not v:
            raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
        return v

    def __init__(self, **data):
        """Validate that both descriptors have one value per descriptor index.

        Raises:
            ValueError: If either descriptor length differs from ``descriptor_indices``.
        """
        super().__init__(**data)
        indices = self.descriptor_indices
        left = self.left_descriptor
        right = self.right_descriptor

        if len(left) != len(indices):
            raise ValueError(
                f"left_descriptor length ({len(left)}) must match descriptor_indices length ({len(indices)})"
            )

        if len(right) != len(indices):
            raise ValueError(
                f"right_descriptor length ({len(right)}) must match descriptor_indices length ({len(indices)})"
            )

    def to_formula(self) -> str:
        """Render as ``cross_<op>`` plus the optional offset suffix."""
        base_formula = f"{self.base_name}_{self._get_interaction_op_name()}"
        return self._add_offset_to_formula(base_formula)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["CrossSeriesInteraction"]:
        """Build an instance from ``cross_<op>[_offset_<n><unit>]``.

        Returns ``None`` when the formula does not match or ``<op>`` is not a
        known binary operator. Descriptors are filled with placeholder values.
        """
        base_regex = r"cross_(.+)"

        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)

        if remaining_formula is None:
            return None

        import re

        match = re.match(f"^{base_regex}$", remaining_formula)

        if not match:
            return None

        # Extract the operator formula
        op_formula = match.group(1)

        op = find_op(op_formula)
        if op is None or not op.is_binary:
            return None

        # Include default values to pass validation
        params = {
            "interaction_op": op,
            "descriptor_indices": [0],  # Default index
            "left_descriptor": ["default"],  # Default left descriptor
            "right_descriptor": ["default"],  # Default right descriptor
        }

        if offset_params:
            params.update(offset_params)

        return cls(**params)

    # ``Optional[str]`` instead of ``str | None``: the annotation is evaluated at
    # definition time and the ``|`` form fails on Python < 3.10; it also matches
    # the signature used by the other time series operators.
    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the interaction settings."""
        res = super().get_params()
        res.update(
            {
                "interaction_op": self._get_interaction_op_name(),
                "descriptor_indices": self.descriptor_indices,
                "left_descriptor": self.left_descriptor,
                "right_descriptor": self.right_descriptor,
            }
        )
        return res

    def _get_interaction_op_name(self) -> str:
        """Prefer the operator alias, falling back to its formula."""
        return self.interaction_op.alias or self.interaction_op.to_formula()

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """Compute the interaction and write it onto the left and right rows.

        Rows matching neither descriptor stay NaN in the result.
        """
        left_mask = self._get_mask(data, self.left_descriptor)
        left = self._extract_series(data, left_mask)

        right_mask = self._get_mask(data, self.right_descriptor)
        right = self._extract_series(data, right_mask)

        interaction: pd.Series = self.interaction_op.calculate_binary(left, right)
        interaction = interaction.reindex(self._get_index(data))
        res = pd.Series(np.nan, index=data[-1].index, name=data[-1].name)
        res.loc[left_mask] = interaction[left_mask].values
        res.loc[right_mask] = interaction[right_mask].values
        return res

    def _get_mask(self, data: List[pd.Series], descriptor: List[str]) -> pd.Series:
        """Row mask where every descriptor column equals its descriptor value."""
        mask = np.logical_and.reduce([data[i] == v for i, v in zip(self.descriptor_indices, descriptor)])
        return mask

    def _extract_series(self, data: List[pd.Series], mask: pd.Series) -> pd.Series:
        """Run the base time series calculation on the masked rows only."""
        masked_data = [d[mask] for d in data]
        shifted = super().calculate_vector(masked_data)
        shifted.index = self._get_index(masked_data)
        return shifted

    def _get_index(self, data: List[pd.Series]) -> pd.Series:
        """Non-value columns that are not descriptor columns (a list when several)."""
        index = [d for i, d in enumerate(data[:-1]) if i not in self.descriptor_indices]
        return index if len(index) > 1 else index[0]

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Identity aggregation: keep only the value (last) column."""
        return ts.apply(lambda x: x).iloc[:, [-1]]
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
5
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
+
from upgini.autofe.timeseries.lag import Lag
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Delta(TimeSeriesBase, ParametrizedOperator):
    """Difference between the zero-lag value and the value lagged by ``delta_size`` units.

    Formula syntax: ``delta_<size><unit>`` with an optional offset suffix.
    """

    delta_size: int
    delta_unit: str = "D"

    def to_formula(self) -> str:
        """Render this operator as its formula string."""
        return self._add_offset_to_formula(f"delta_{self.delta_size}{self.delta_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Delta"]:
        """Build a ``Delta`` from *formula*; return ``None`` when it does not match."""
        import re

        pattern = r"delta_(\d+)([a-zA-Z])"

        # Strip the offset suffix (if present) before parsing the delta part.
        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
        if core is None:
            return None

        parsed = re.match(f"^{pattern}$", core)
        if not parsed:
            return None

        kwargs = {
            "delta_size": int(parsed.group(1)),
            "delta_unit": parsed.group(2),
        }
        if offset_kwargs:
            kwargs.update(offset_kwargs)
        return cls(**kwargs)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the delta and offset settings."""
        params = super().get_params()
        params["delta_size"] = self.delta_size
        params["delta_unit"] = self.delta_unit
        params["offset_size"] = self.offset_size
        params["offset_unit"] = self.offset_unit
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Subtract the ``delta_size`` lag from the zero lag of *ts*."""
        current = Lag(lag_size=0, lag_unit=self.delta_unit)
        previous = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
        return current._aggregate(ts) - previous._aggregate(ts)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Delta2(TimeSeriesBase, ParametrizedOperator):
    """Delta of the delta: ``Delta`` applied twice with the same size and unit.

    Formula syntax: ``delta2_<size><unit>`` with an optional offset suffix.
    """

    delta_size: int
    delta_unit: str = "D"

    def to_formula(self) -> str:
        """Render this operator as its formula string."""
        return self._add_offset_to_formula(f"delta2_{self.delta_size}{self.delta_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Delta2"]:
        """Build a ``Delta2`` from *formula*; return ``None`` when it does not match."""
        import re

        pattern = r"delta2_(\d+)([a-zA-Z])"

        # Strip the offset suffix (if present) before parsing the delta part.
        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
        if core is None:
            return None

        parsed = re.match(f"^{pattern}$", core)
        if not parsed:
            return None

        kwargs = {
            "delta_size": int(parsed.group(1)),
            "delta_unit": parsed.group(2),
        }
        if offset_kwargs:
            kwargs.update(offset_kwargs)
        return cls(**kwargs)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the delta settings."""
        params = super().get_params()
        params["delta_size"] = self.delta_size
        params["delta_unit"] = self.delta_unit
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the first-order delta aggregation to *ts* twice."""
        step = Delta(delta_size=self.delta_size, delta_unit=self.delta_unit)
        # The second pass runs on the output of the first one.
        return step._aggregate(step._aggregate(ts))
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Dict, Optional
|
|
4
|
+
|
|
5
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
6
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Lag(TimeSeriesBase, ParametrizedOperator):
    """Value observed ``lag_size`` x ``lag_unit`` before the current timestamp.

    Formula syntax: ``lag_<size><unit>`` with an optional offset suffix.
    """

    lag_size: int
    lag_unit: str = "D"

    def to_formula(self) -> str:
        """Render this operator as its formula string."""
        base_formula = f"lag_{self.lag_size}{self.lag_unit}"
        return self._add_offset_to_formula(base_formula)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Lag"]:
        """Build a ``Lag`` from *formula*; return ``None`` when it does not match."""
        # Base regex for Lag class
        base_regex = r"lag_(\d+)([a-zA-Z])"

        # Parse offset first
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)

        if remaining_formula is None:
            return None

        # Now parse the lag part
        import re

        match = re.match(f"^{base_regex}$", remaining_formula)

        if not match:
            return None

        lag_size = int(match.group(1))
        lag_unit = match.group(2)

        # Create instance with appropriate parameters
        params = {
            "lag_size": lag_size,
            "lag_unit": lag_unit,
        }

        if offset_params:
            params.update(offset_params)

        return cls(**params)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the lag settings."""
        res = super().get_params()
        res.update(
            {
                "lag_size": self.lag_size,
                "lag_unit": self.lag_unit,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Roll a ``lag_size + 1`` wide time window and pick its oldest value."""
        # The window is one unit wider than the lag so it can contain both the
        # current row and the row exactly ``lag_size`` units earlier.
        lag_window = self.lag_size + 1
        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=1).agg(self._lag)

    def _lag(self, x):
        """Return the first window value, or NaN when the window is shorter than the lag."""
        if x.index.min() > (x.index.max() - pd.Timedelta(self.lag_size, self.lag_unit)):
            return np.nan
        else:
            # ``.iloc`` makes the positional access explicit; ``x[0]`` relied on
            # the deprecated positional fallback of ``Series.__getitem__``.
            return x.iloc[0]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
5
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
+
|
|
7
|
+
# Roll aggregation functions
|
|
8
|
+
# Custom rolling aggregations, keyed by the name used in ``roll_*`` formulas.
# Each callable receives one rolling window as a Series.
roll_aggregations = {
    # Last value of the window relative to the window mean. ``.iloc[-1]`` is
    # explicitly positional: ``x[-1]`` is a label lookup on integer indexes and
    # a deprecated positional fallback on every other index type.
    "norm_mean": lambda x: x.iloc[-1] / x.mean(),
    "q25": lambda x: x.quantile(0.25),
    "q75": lambda x: x.quantile(0.75),
    # Interquartile range of the window.
    "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
}
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from pydantic import field_validator as validator # V2
|
|
17
|
+
except ImportError:
|
|
18
|
+
from pydantic import validator # V1
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Roll(TimeSeriesBase, ParametrizedOperator):
    """Rolling-window aggregation over a time window of ``window_size`` x ``window_unit``.

    Formula syntax: ``roll_<size><unit>_<aggregation>`` with an optional offset suffix.
    """

    aggregation: str
    window_size: int = 1
    window_unit: str = "D"

    @validator("window_unit")
    @classmethod
    def validate_window_unit(cls, v: str) -> str:
        """Ensure *v* is a frequency string pandas can parse.

        Raises:
            ValueError: If pandas rejects *v* as a frequency.
        """
        try:
            pd.tseries.frequencies.to_offset(v)
        except ValueError as err:
            # Chain the pandas error so the root cause stays visible in tracebacks.
            raise ValueError(
                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
            ) from err
        return v

    def to_formula(self) -> str:
        """Render this operator as its formula string."""
        # First add window size and unit, then add aggregation, then add offset
        base_formula = f"roll_{self.window_size}{self.window_unit}"
        formula_with_agg = f"{base_formula}_{self.aggregation}"
        return self._add_offset_to_formula(formula_with_agg)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Roll"]:
        """Build a ``Roll`` from *formula*; return ``None`` when it does not match."""
        import re

        # Base regex for Roll class (with aggregation)
        base_regex = r"roll_(\d+)([a-zA-Z])_(\w+)"

        # Parse offset first - this removes the offset part if present
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)

        if remaining_formula is None:
            return None

        # Parse the window part and aggregation
        match = re.match(f"^{base_regex}$", remaining_formula)

        if not match:
            return None

        window_size = int(match.group(1))
        window_unit = match.group(2)
        aggregation = match.group(3)

        # Create instance with appropriate parameters
        params = {
            "window_size": window_size,
            "window_unit": window_unit,
            "aggregation": aggregation,
        }

        if offset_params:
            params.update(offset_params)

        return cls(**params)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the window settings."""
        res = super().get_params()
        res.update(
            {
                "window_size": self.window_size,
                "window_unit": self.window_unit,
                "aggregation": self.aggregation,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the aggregation over the rolling time window.

        Names found in ``roll_aggregations`` resolve to their callable; any other
        name is handed to pandas unchanged.
        """
        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
            roll_aggregations.get(self.aggregation, self.aggregation)
        )
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from typing import Dict, Optional, Union
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TrendCoefficient(TimeSeriesBase):
    """Slope of a first-degree polynomial fitted to the resampled series.

    The series is resampled to ``step_size`` x ``step_unit``; the fitted slope is
    broadcast to every row of the series.
    """

    name: str = "trend_coef"
    step_size: int = 1
    step_unit: str = "D"

    def to_formula(self) -> str:
        """Render as ``trend_coef`` plus the optional offset suffix."""
        base_formula = "trend_coef"
        return self._add_offset_to_formula(base_formula)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["TrendCoefficient"]:
        """Build an instance from *formula*; return ``None`` when it does not match."""
        # Base regex for TrendCoefficient class
        base_regex = r"trend_coef"

        # Parse offset first
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)

        if remaining_formula is None:
            return None

        # Basic pattern (no offset)
        if remaining_formula == "trend_coef":
            params = {}
            if offset_params:
                params.update(offset_params)
            return cls(**params)

        return None

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the step and offset settings."""
        res = super().get_params()
        res.update(
            {
                "step_size": self.step_size,
                "step_unit": self.step_unit,
                "offset_size": self.offset_size,
                "offset_unit": self.offset_unit,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Compute the trend coefficient, replacing missing results with 0."""
        return ts.apply(self._trend_coef).iloc[:, [-1]].fillna(0)

    def _trend_coef(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Fill the last column of *x* with the fitted linear slope.

        Returns a Series when given a Series, otherwise a DataFrame.
        """
        return_series = isinstance(x, pd.Series)
        x = pd.DataFrame(x)
        # Forward-fill the gaps introduced by resampling, then back-fill whatever
        # is still missing at the start. ``.ffill()`` / ``.bfill()`` replace the
        # deprecated ``fillna(method=...)`` spelling with identical results.
        resampled = x.iloc[:, -1].resample(f"{self.step_size}{self.step_unit}").ffill().bfill()
        idx = np.arange(len(resampled))
        # Degree-1 fit: coeffs[0] is the slope, coeffs[1] the intercept.
        coeffs = np.polyfit(idx, resampled, 1)
        x.iloc[:, -1] = coeffs[0]
        return x.iloc[:, -1] if return_series else x
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
from typing import Dict, Optional, Union
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
6
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VolatilityBase(TimeSeriesBase):
    """Shared helper for the volatility operators."""

    @staticmethod
    def _get_returns(ts: pd.Series, freq: str) -> pd.Series:
        """Percentage change of *ts* over *freq*, with missing values set to 0."""
        pct = ts.pct_change(freq=freq)
        return pct.fillna(0)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EWMAVolatility(VolatilityBase, ParametrizedOperator):
    """Exponentially weighted standard deviation of returns with span ``window_size``.

    Formula syntax: ``ewma_vol_<window_size>`` with an optional offset suffix.
    """

    step_size: int = 1
    step_unit: str = "D"
    window_size: int

    def to_formula(self) -> str:
        """Render this operator as its formula string."""
        return self._add_offset_to_formula(f"ewma_vol_{self.window_size}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["EWMAVolatility"]:
        """Build an instance from *formula*; return ``None`` when it does not match."""
        import re

        pattern = r"ewma_vol_(\d+)"

        # Strip the offset suffix (if present) before parsing the window size.
        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
        if core is None:
            return None

        parsed = re.match(f"^{pattern}$", core)
        if not parsed:
            return None

        kwargs = {"window_size": int(parsed.group(1))}
        if offset_kwargs:
            kwargs.update(offset_kwargs)
        return cls(**kwargs)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the step and window settings."""
        params = super().get_params()
        params["step_size"] = self.step_size
        params["step_unit"] = self.step_unit
        params["window_size"] = self.window_size
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the EWMA volatility computation to *ts*."""
        return ts.apply(self._ewma_vol)

    def _ewma_vol(self, x):
        """EWM standard deviation of the returns of the last column of *x*."""
        values = pd.DataFrame(x).iloc[:, -1]
        step = f"{self.step_size}{self.step_unit}"
        returns = self._get_returns(values, step)
        return returns.ewm(span=self.window_size).std()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class RollingVolBase(VolatilityBase):
    """Common fields and rolling-volatility helper for the rolling volatility operators."""

    step_size: int = 1
    step_unit: str = "D"
    window_size: int
    window_unit: str = "D"

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent parameters extended with the step and window settings."""
        params = super().get_params()
        params["step_size"] = self.step_size
        params["step_unit"] = self.step_unit
        params["window_size"] = self.window_size
        params["window_unit"] = self.window_unit
        return params

    def _rolling_vol(
        self, x: Union[pd.DataFrame, pd.Series], window_size: int, window_unit: str, abs_returns: bool = False
    ) -> Union[pd.DataFrame, pd.Series]:
        """Replace the last column of *x* with the rolling std of its returns.

        Args:
            x: Series or frame whose last column holds the values.
            window_size: Size of the rolling window.
            window_unit: Frequency unit of the rolling window.
            abs_returns: Take absolute returns before computing the deviation.

        Returns:
            A Series when *x* was a Series, otherwise the frame.
        """
        was_series = isinstance(x, pd.Series)
        frame = pd.DataFrame(x)

        step = f"{self.step_size}{self.step_unit}"
        rets = self._get_returns(frame.iloc[:, -1], step)
        if abs_returns:
            rets = rets.abs()

        window = f"{window_size}{window_unit}"
        frame.iloc[:, -1] = rets.rolling(window, min_periods=1).std()

        if was_series:
            return frame.iloc[:, -1]
        return frame
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class RollingVolatility(RollingVolBase, ParametrizedOperator):
    """Rolling standard deviation of returns.

    Formula syntax: ``roll_vol_<size><unit>`` with an optional offset suffix.
    """

    abs_returns: bool = False

    def to_formula(self) -> str:
        """Render this operator as its formula string."""
        return self._add_offset_to_formula(f"roll_vol_{self.window_size}{self.window_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["RollingVolatility"]:
        """Build an instance from *formula*; return ``None`` when it does not match."""
        import re

        pattern = r"roll_vol_(\d+)([a-zA-Z])"

        # Strip the offset suffix (if present) before parsing the window part.
        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
        if core is None:
            return None

        parsed = re.match(f"^{pattern}$", core)
        if not parsed:
            return None

        kwargs = {
            "window_size": int(parsed.group(1)),
            "window_unit": parsed.group(2),
        }
        if offset_kwargs:
            kwargs.update(offset_kwargs)
        return cls(**kwargs)

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Compute the rolling volatility and keep only the value (last) column."""
        result = ts.apply(
            self._rolling_vol,
            window_size=self.window_size,
            window_unit=self.window_unit,
            abs_returns=self.abs_returns,
        )
        return result.iloc[:, [-1]]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class RollingVolatility2(RollingVolBase, ParametrizedOperator):
    """
    Volatility of volatility: the rolling volatility computation applied twice,
    first on absolute returns and then on the resulting volatility series.
    """

    def to_formula(self) -> str:
        """Render this operator as its formula string."""
        return self._add_offset_to_formula(f"roll_vol2_{self.window_size}{self.window_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["RollingVolatility2"]:
        """Build an instance from *formula*; return ``None`` when it does not match."""
        import re

        pattern = r"roll_vol2_(\d+)([a-zA-Z])"

        # Strip the offset suffix (if present) before parsing the window part.
        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
        if core is None:
            return None

        parsed = re.match(f"^{pattern}$", core)
        if not parsed:
            return None

        kwargs = {
            "window_size": int(parsed.group(1)),
            "window_unit": parsed.group(2),
        }
        if offset_kwargs:
            kwargs.update(offset_kwargs)
        return cls(**kwargs)

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Compute volatility-on-volatility and keep only the value (last) column."""
        result = ts.apply(self._vol_on_vol)
        return result.iloc[:, [-1]]

    def _vol_on_vol(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Run ``_rolling_vol`` on *x* (absolute returns), then again on its output."""
        inner = self._rolling_vol(x, self.window_size, self.window_unit, abs_returns=True)
        return self._rolling_vol(inner, self.window_size, self.window_unit, abs_returns=False)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class VolatilityRatio(RollingVolBase, ParametrizedOperator):
    """
    Ratio of short-window volatility to long-window volatility.

    Both volatilities come from ``_rolling_vol``: the short one uses
    ``short_window_size``/``short_window_unit``, the long one uses
    ``window_size``/``window_unit``. Infinite and missing ratios are
    replaced with 1.
    """

    # Size of the short (numerator) window.
    short_window_size: int
    # Unit of the short window, concatenated after the size in the formula.
    short_window_unit: str = "D"

    def to_formula(self) -> str:
        """Return ``vol_ratio_<short>_to_<long>`` passed through ``_add_offset_to_formula``."""
        return self._add_offset_to_formula(
            f"vol_ratio_{self.short_window_size}{self.short_window_unit}_to_{self.window_size}{self.window_unit}"
        )

    @classmethod
    def from_formula(cls, formula: str) -> Optional["VolatilityRatio"]:
        """
        Build an operator from its formula string.

        Returns ``None`` when ``_parse_offset_from_formula`` reports no remaining
        formula, or when the remainder does not match the ``vol_ratio`` pattern.
        """
        import re

        base_regex = r"vol_ratio_(\d+)([a-zA-Z])_to_(\d+)([a-zA-Z])"
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if parsed is None:
            return None

        short_size, short_unit, long_size, long_unit = parsed.groups()
        kwargs = {
            "short_window_size": int(short_size),
            "short_window_unit": short_unit,
            "window_size": int(long_size),
            "window_unit": long_unit,
        }
        # Offset settings, when present, are layered over the window settings.
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the inherited parameters with the short-window settings."""
        params = super().get_params()
        params.update(
            short_window_size=self.short_window_size,
            short_window_unit=self.short_window_unit,
        )
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the ratio column-wise and keep only the last column (as a frame)."""
        per_column = ts.apply(self._vol_ratio)
        return per_column.iloc[:, [-1]]

    def _vol_ratio(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Divide short-window volatility by long-window volatility and clean up the result."""
        numerator = self._rolling_vol(x, self.short_window_size, self.short_window_unit)
        denominator = self._rolling_vol(x, self.window_size, self.window_unit)
        return VolatilityRatio._handle_div_errors(numerator / denominator)

    @staticmethod
    def _handle_div_errors(x: pd.Series) -> pd.Series:
        """Replace +/-inf with NaN, then fill every NaN with 1."""
        without_inf = x.replace([np.inf, -np.inf], np.nan)
        return without_inf.fillna(1)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.62a3818.
|
|
3
|
+
Version: 1.2.62a3818.dev4
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=DRRGQ9hjWuzUUDq0H9hZpymmoGVeS9BXeeOQ2XoHmjc,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
|
|
@@ -14,15 +14,22 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
|
18
18
|
upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
|
|
19
19
|
upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
|
|
20
20
|
upgini/autofe/feature.py,sha256=Xto7FHH1JG-5QvkfTPNWKtV9GAzPviTNPKFZOUN7RQA,14757
|
|
21
21
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
|
22
|
-
upgini/autofe/operator.py,sha256=
|
|
23
|
-
upgini/autofe/timeseries.py,sha256=-BnDp0z_Hv6Vol1Vov6QC_82U8XPV3pfIPFspK2aTCE,6598
|
|
22
|
+
upgini/autofe/operator.py,sha256=KKLFixtEFq-qP6WVks19F0AY2iOnB8_g8uYpbrC8USM,4894
|
|
24
23
|
upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
|
|
25
24
|
upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
|
|
25
|
+
upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
|
|
26
|
+
upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
|
|
27
|
+
upgini/autofe/timeseries/cross.py,sha256=8ggDhsvwdxHkrWKRPl2fcFt7wamTYhkVzQcOWvIIyvU,4612
|
|
28
|
+
upgini/autofe/timeseries/delta.py,sha256=hXEiFWHdZndz8I7Ef5zhTHLJac9illhZOZITwpL9ppw,3618
|
|
29
|
+
upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
|
|
30
|
+
upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
|
|
31
|
+
upgini/autofe/timeseries/trend.py,sha256=eP0q1fBW4MYPrjfy7vr88tTG8qk0xypClaGHaVv1hAs,1962
|
|
32
|
+
upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
|
|
26
33
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
34
|
upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
|
|
28
35
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
@@ -63,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
|
|
|
63
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
64
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
65
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
66
|
-
upgini-1.2.62a3818.
|
|
67
|
-
upgini-1.2.62a3818.
|
|
68
|
-
upgini-1.2.62a3818.
|
|
69
|
-
upgini-1.2.62a3818.
|
|
73
|
+
upgini-1.2.62a3818.dev4.dist-info/METADATA,sha256=_sL9eQLnB5X1kyhbUiMzXIB5HUgK0KFfmuwgp3Su59c,49094
|
|
74
|
+
upgini-1.2.62a3818.dev4.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
75
|
+
upgini-1.2.62a3818.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
76
|
+
upgini-1.2.62a3818.dev4.dist-info/RECORD,,
|
upgini/autofe/timeseries.py
DELETED
|
@@ -1,200 +0,0 @@
|
|
|
1
|
-
import abc
|
|
2
|
-
from typing import Dict, List, Optional
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
from upgini.autofe.operator import PandasOperator, ParametrizedOperator
|
|
6
|
-
|
|
7
|
-
try:
|
|
8
|
-
from pydantic import field_validator as validator # V2
|
|
9
|
-
except ImportError:
|
|
10
|
-
from pydantic import validator # V1
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class TimeSeriesBase(PandasOperator, abc.ABC):
    """
    Abstract base for time-series operators.

    ``calculate_vector`` builds a date-indexed frame from the input series,
    optionally shifts the value column by the configured offset, and hands the
    result to the subclass-provided ``_aggregate``.
    """

    is_vector: bool = True
    # Passed as ``unit`` to ``pd.to_datetime`` when parsing the date column.
    date_unit: Optional[str] = None
    # The value column is shifted by ``<offset_size><offset_unit>`` when offset_size > 0.
    offset_size: int = 0
    offset_unit: str = "D"

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the inherited parameters with the date unit and offset settings."""
        params = super().get_params()
        params.update(
            date_unit=self.date_unit,
            offset_size=self.offset_size,
            offset_unit=self.offset_unit,
        )
        return params

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """
        Compute the operator over ``data`` and return the last column of the result.

        The returned series carries the index of the parsed date series.
        """
        # assuming first is date, last is value, rest is group columns
        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
        group_cols = data[1:-1]
        group_names = [c.name for c in group_cols]
        has_groups = len(data) > 2

        ts = pd.concat([date] + data[1:], axis=1)
        # Keep the first row for each combination of all columns except the last.
        ts = ts.drop_duplicates(subset=ts.columns[:-1], keep="first")
        ts = ts.set_index(date.name)
        # Rows whose date failed to parse (NaT) are dropped before sorting.
        ts = ts[ts.index.notna()].sort_index()

        if has_groups:
            shifted_values = ts.groupby(group_names, group_keys=True).apply(self._shift)[data[-1].name]
            ts = shifted_values.to_frame().reset_index().set_index(date.name).groupby(group_names)
        else:
            ts = self._shift(ts)

        ts = self._aggregate(ts)

        target_index = (group_cols + [date]) if has_groups else date
        ts = ts.reindex(target_index).reset_index()
        ts.index = date.index

        return ts.iloc[:, -1]

    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Shift the last column by the configured offset; return ``ts`` unchanged if no offset is set."""
        if not self.offset_size > 0:
            return ts
        shifted_last = ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}")
        # Index-on-index merge re-attaches the shifted values to the remaining columns.
        return ts.iloc[:, :-1].merge(shifted_last, left_index=True, right_index=True)

    @abc.abstractmethod
    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Subclass hook: aggregate the prepared (possibly grouped) time series."""
        pass
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
# Custom rolling aggregations, looked up by name in Roll._aggregate.
# Each callable receives one window as a pandas Series. Positional access goes through
# ``.iloc`` because integer ``Series[...]`` lookups on a non-integer (e.g. datetime)
# index are deprecated as positional access in pandas 2.1+.
_roll_aggregations = {
    # Last value of the window divided by the window mean.
    "norm_mean": lambda x: x.iloc[-1] / x.mean(),
    # Last value of the window.
    "last": lambda x: x.iloc[-1],
}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class Roll(TimeSeriesBase, ParametrizedOperator):
    """
    Rolling-window aggregation over a time series.

    The window is ``<window_size><window_unit>``. ``aggregation`` is either a key
    of ``_roll_aggregations`` or is handed to pandas' rolling ``agg`` as-is.
    Formula form: ``roll_<window>[_offset_<offset>]_<aggregation>``.
    """

    aggregation: str
    window_size: int = 1
    window_unit: str = "D"

    @validator("window_unit")
    @classmethod
    def validate_window_unit(cls, v: str) -> str:
        """Accept ``v`` only if pandas can turn it into an offset; raise ``ValueError`` otherwise."""
        try:
            pd.tseries.frequencies.to_offset(v)
        except ValueError:
            raise ValueError(
                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
            )
        return v

    def to_formula(self) -> str:
        """Return the formula string; the offset segment appears only when ``offset_size > 0``."""
        pieces = [f"roll_{self.window_size}{self.window_unit}"]
        if self.offset_size > 0:
            pieces.append(f"_offset_{self.offset_size}{self.offset_unit}")
        pieces.append(f"_{self.aggregation}")
        return "".join(pieces)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Roll"]:
        """
        Build an operator from its formula string, or return ``None`` if it does not match.

        The pattern with an offset segment is tried before the plain one.
        """
        import re

        with_offset = re.match(r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$", formula)
        if with_offset is not None:
            win_size, win_unit, off_size, off_unit, agg_name = with_offset.groups()
            return cls(
                window_size=int(win_size),
                window_unit=win_unit,
                offset_size=int(off_size),
                offset_unit=off_unit,
                aggregation=agg_name,
            )

        plain = re.match(r"^roll_(\d+)([a-zA-Z])_(\w+)$", formula)
        if plain is None:
            return None

        win_size, win_unit, agg_name = plain.groups()
        return cls(window_size=int(win_size), window_unit=win_unit, aggregation=agg_name)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the inherited parameters with the window and aggregation settings."""
        params = super().get_params()
        params.update(
            window_size=self.window_size,
            window_unit=self.window_unit,
            aggregation=self.aggregation,
        )
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Aggregate over the rolling window; a single observation is enough (``min_periods=1``)."""
        roller = ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1)
        # Names missing from _roll_aggregations fall through to pandas unchanged.
        agg_func = _roll_aggregations.get(self.aggregation, self.aggregation)
        return roller.agg(agg_func)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
class Lag(TimeSeriesBase, ParametrizedOperator):
    """
    Lagged value of a time series.

    For each row, takes the first value of a rolling window of
    ``lag_size + 1`` ``lag_unit``s. Formula form:
    ``lag_<lag_size><lag_unit>[_offset_<offset_size><offset_unit>]``.
    """

    lag_size: int
    lag_unit: str = "D"

    def to_formula(self) -> str:
        """Return the formula string; the offset segment appears only when ``offset_size > 0``."""
        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
        if self.offset_size > 0:
            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
        return lag_component

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Lag"]:
        """
        Build an operator from its formula string, or return ``None`` if it does not match.

        The pattern with an offset segment is tried before the plain one.
        """
        import re

        # Try matching pattern with offset first
        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
        match_with_offset = re.match(pattern_with_offset, formula)

        if match_with_offset:
            lag_size = int(match_with_offset.group(1))
            lag_unit = match_with_offset.group(2)
            offset_size = int(match_with_offset.group(3))
            offset_unit = match_with_offset.group(4)

            return cls(
                lag_size=lag_size,
                lag_unit=lag_unit,
                offset_size=offset_size,
                offset_unit=offset_unit,
            )

        # If no offset pattern found, try basic pattern
        pattern = r"^lag_(\d+)([a-zA-Z])$"
        match = re.match(pattern, formula)

        if not match:
            return None

        lag_size = int(match.group(1))
        lag_unit = match.group(2)

        return cls(lag_size=lag_size, lag_unit=lag_unit)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the inherited parameters with the lag settings."""
        res = super().get_params()
        res.update(
            {
                "lag_size": self.lag_size,
                "lag_unit": self.lag_unit,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Return the first value of each ``lag_size + 1`` window; windows with fewer observations give NaN."""
        lag_window = self.lag_size + 1
        # Each window arrives as a Series indexed by the rolling axis; use .iloc for the
        # positional lookup, since integer Series[...] access on a non-integer index is
        # deprecated as positional access in pandas 2.1+.
        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x.iloc[0])
|
|
File without changes
|
|
File without changes
|