upgini 1.2.63__py3-none-any.whl → 1.2.65a3818.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/{all_operands.py → all_operators.py} +2 -2
- upgini/autofe/binary.py +11 -11
- upgini/autofe/date.py +6 -6
- upgini/autofe/feature.py +8 -8
- upgini/autofe/groupby.py +6 -6
- upgini/autofe/{operand.py → operator.py} +16 -11
- upgini/autofe/timeseries/__init__.py +23 -0
- upgini/autofe/timeseries/base.py +105 -0
- upgini/autofe/timeseries/cross.py +139 -0
- upgini/autofe/timeseries/delta.py +119 -0
- upgini/autofe/timeseries/lag.py +68 -0
- upgini/autofe/timeseries/roll.py +92 -0
- upgini/autofe/timeseries/trend.py +61 -0
- upgini/autofe/timeseries/volatility.py +259 -0
- upgini/autofe/unary.py +11 -11
- upgini/autofe/vector.py +4 -200
- upgini/features_enricher.py +2 -2
- upgini/utils/sort.py +4 -2
- {upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/METADATA +1 -1
- {upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/RECORD +23 -15
- {upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/WHEEL +1 -1
- {upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Dict, Optional, Union
|
|
3
|
+
|
|
4
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
5
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
+
from upgini.autofe.timeseries.lag import Lag
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DeltaBase(TimeSeriesBase):
    """Shared fields and difference computation for the delta operators.

    The delta of a series is its value minus the value produced by a ``Lag``
    operator configured with ``delta_size`` / ``delta_unit``.
    """

    # Size of the lag the difference is taken against, in ``delta_unit`` units.
    delta_size: int
    # Time unit forwarded to ``Lag`` as ``lag_unit``.
    delta_unit: str = "D"

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the delta settings."""
        params = super().get_params()
        params["delta_size"] = self.delta_size
        params["delta_unit"] = self.delta_unit
        return params

    def _calculate_delta(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Replace the last column of ``x`` with its difference from the lagged value.

        Args:
            x: A Series, or a DataFrame whose last column holds the values.

        Returns:
            A Series when a Series was passed in, otherwise the DataFrame with
            its last column overwritten by the delta.
        """
        was_series = isinstance(x, pd.Series)
        frame = pd.DataFrame(x)
        lagger = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
        current = frame.iloc[:, -1]
        # The right-hand side is fully evaluated before the column is overwritten.
        frame.iloc[:, -1] = current - lagger._aggregate(current)
        if was_series:
            return frame.iloc[:, -1]
        return frame
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Delta(DeltaBase, ParametrizedOperator):
    """First-order delta operator, written as ``delta_<size><unit>`` in formulas."""

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        return self._add_offset_to_formula(f"delta_{self.delta_size}{self.delta_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Delta"]:
        """Build a ``Delta`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``Delta``.
        """
        import re

        base_regex = r"delta_(\d+)([a-zA-Z])"

        # The offset part is split off first; the helper is defined outside this class.
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if not parsed:
            return None

        size, unit = parsed.groups()
        kwargs = {"delta_size": int(size), "delta_unit": unit}
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the delta to every column and keep only the last one as a DataFrame."""
        deltas = ts.apply(self._calculate_delta)
        return deltas.iloc[:, [-1]]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class Delta2(DeltaBase, ParametrizedOperator):
    """Second-order delta (delta of the delta), written as ``delta2_<size><unit>``."""

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        return self._add_offset_to_formula(f"delta2_{self.delta_size}{self.delta_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Delta2"]:
        """Build a ``Delta2`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``Delta2``.
        """
        import re

        base_regex = r"delta2_(\d+)([a-zA-Z])"

        # The offset part is split off first; the helper is defined outside this class.
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if not parsed:
            return None

        size, unit = parsed.groups()
        kwargs = {"delta_size": int(size), "delta_unit": unit}
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the second-order delta to every column and keep only the last one."""
        second_order = ts.apply(self._calculate_delta2)
        return second_order.iloc[:, [-1]]

    def _calculate_delta2(self, x):
        """Return the delta of the delta of ``x`` (a discrete second difference)."""
        return self._calculate_delta(self._calculate_delta(x))
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Dict, Optional
|
|
4
|
+
|
|
5
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
6
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Lag(TimeSeriesBase, ParametrizedOperator):
    """Time-based lag operator, written as ``lag_<size><unit>`` in formulas."""

    # Number of ``lag_unit`` periods to look back.
    lag_size: int
    # Time unit used both for the rolling window and for the Timedelta check in ``_lag``.
    lag_unit: str = "D"

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        base_formula = f"lag_{self.lag_size}{self.lag_unit}"
        return self._add_offset_to_formula(base_formula)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Lag"]:
        """Build a ``Lag`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``Lag``.
        """
        import re

        # Base regex for Lag class
        base_regex = r"lag_(\d+)([a-zA-Z])"

        # Parse offset first (helper is defined outside this class)
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)

        if remaining_formula is None:
            return None

        # Now parse the lag part
        match = re.match(f"^{base_regex}$", remaining_formula)

        if not match:
            return None

        lag_size = int(match.group(1))
        lag_unit = match.group(2)

        # Create instance with appropriate parameters
        params = {
            "lag_size": lag_size,
            "lag_unit": lag_unit,
        }

        if offset_params:
            params.update(offset_params)

        return cls(**params)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the lag settings."""
        res = super().get_params()
        res.update(
            {
                "lag_size": self.lag_size,
                "lag_unit": self.lag_unit,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Compute the lagged value for every row with a time-based rolling window."""
        # The window is one unit wider than the lag; ``_lag`` then picks the
        # oldest observation inside it.
        lag_window = self.lag_size + 1
        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=1).agg(self._lag)

    def _lag(self, x):
        """Return the oldest value in window ``x``, or NaN if the window is too short.

        ``x`` is the window handed over by ``rolling(...).agg``; its index is
        compared against a ``pd.Timedelta`` of ``lag_size`` ``lag_unit``.
        """
        if x.index.min() > (x.index.max() - pd.Timedelta(self.lag_size, self.lag_unit)):
            # The window does not reach back a full lag: no lagged value exists.
            return np.nan
        # Explicit positional access: ``x[0]`` on a non-integer index relies on
        # the deprecated positional fallback of ``Series.__getitem__``.
        return x.iloc[0]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
5
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
+
|
|
7
|
+
# Roll aggregation functions
|
|
8
|
+
# Custom rolling aggregations, keyed by the name used in formulas. Each
# callable receives one window as a Series and returns a scalar.
roll_aggregations = {
    # Last value of the window divided by the window mean. ``.iloc[-1]`` is
    # explicitly positional; ``x[-1]`` depends on the deprecated integer-key
    # fallback of ``Series.__getitem__`` and fails on integer-labelled data.
    "norm_mean": lambda x: x.iloc[-1] / x.mean(),
    "q25": lambda x: x.quantile(0.25),
    "q75": lambda x: x.quantile(0.75),
    # Interquartile range of the window.
    "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
}
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from pydantic import field_validator as validator # V2
|
|
17
|
+
except ImportError:
|
|
18
|
+
from pydantic import validator # V1
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Roll(TimeSeriesBase, ParametrizedOperator):
    """Rolling-window aggregation, written as ``roll_<size><unit>_<aggregation>``."""

    # Aggregation name: a key of ``roll_aggregations`` or a name pandas understands.
    aggregation: str
    # Window length in ``window_unit`` units.
    window_size: int = 1
    # pandas frequency string for the window unit.
    window_unit: str = "D"

    @validator("window_unit")
    @classmethod
    def validate_window_unit(cls, v: str) -> str:
        """Reject ``window_unit`` values pandas cannot turn into an offset."""
        try:
            pd.tseries.frequencies.to_offset(v)
        except ValueError:
            raise ValueError(
                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
            )
        return v

    def to_formula(self) -> str:
        """Render the operator as ``roll_<size><unit>_<aggregation>`` plus any offset suffix."""
        windowed = f"roll_{self.window_size}{self.window_unit}"
        return self._add_offset_to_formula(f"{windowed}_{self.aggregation}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Roll"]:
        """Build a ``Roll`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``Roll``.
        """
        import re

        base_regex = r"roll_(\d+)([a-zA-Z])_(\w+)"

        # The offset part is split off first; the helper is defined outside this class.
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if not parsed:
            return None

        size, unit, agg_name = parsed.groups()
        kwargs = {
            "window_size": int(size),
            "window_unit": unit,
            "aggregation": agg_name,
        }
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the window and aggregation settings."""
        params = super().get_params()
        params["window_size"] = self.window_size
        params["window_unit"] = self.window_unit
        params["aggregation"] = self.aggregation
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Aggregate ``ts`` over a time-based rolling window."""
        window = f"{self.window_size}{self.window_unit}"
        # Custom aggregations take precedence; unknown names are handed to pandas as-is.
        func = roll_aggregations.get(self.aggregation, self.aggregation)
        return ts.rolling(window, min_periods=1).agg(func)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from typing import Dict, Optional, Union
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TrendCoefficient(TimeSeriesBase):
    """Linear trend slope of a time series, written as ``trend_coef`` in formulas."""

    name: str = "trend_coef"
    # The series is resampled onto a grid of ``step_size`` ``step_unit`` before fitting.
    step_size: int = 1
    step_unit: str = "D"

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        base_formula = "trend_coef"
        return self._add_offset_to_formula(base_formula)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["TrendCoefficient"]:
        """Build a ``TrendCoefficient`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``TrendCoefficient``.
        """
        # Base regex for TrendCoefficient class
        base_regex = r"trend_coef"

        # Parse offset first (helper is defined outside this class)
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)

        if remaining_formula is None:
            return None

        # Basic pattern (no offset)
        if remaining_formula == "trend_coef":
            params = {}
            if offset_params:
                params.update(offset_params)
            return cls(**params)

        return None

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the step and offset settings."""
        res = super().get_params()
        res.update(
            {
                "step_size": self.step_size,
                "step_unit": self.step_unit,
                "offset_size": self.offset_size,
                "offset_unit": self.offset_unit,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Compute the trend slope per column, keep the last column, and fill NaN with 0."""
        return ts.apply(self._trend_coef).iloc[:, [-1]].fillna(0)

    def _trend_coef(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Overwrite the last column of ``x`` with the slope of its linear fit.

        The column is resampled to the ``step_size``/``step_unit`` grid, gaps
        are forward- then backward-filled, and a degree-1 polynomial is fitted
        against the positions ``0..n-1``.

        Returns:
            A Series when a Series was passed in, otherwise the DataFrame with
            its last column set to the slope.
        """
        return_series = isinstance(x, pd.Series)
        x = pd.DataFrame(x)
        # ``Resampler.fillna(method=...)`` and ``Series.fillna(method=...)`` are
        # deprecated; ``ffill()`` / ``bfill()`` are the equivalent supported calls.
        resampled = x.iloc[:, -1].resample(f"{self.step_size}{self.step_unit}").ffill().bfill()
        idx = np.arange(len(resampled))
        coeffs = np.polyfit(idx, resampled, 1)
        # coeffs[0] is the highest-degree coefficient, i.e. the slope.
        x.iloc[:, -1] = coeffs[0]
        return x.iloc[:, -1] if return_series else x
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
from typing import Dict, Optional, Union
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
6
|
+
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VolatilityBase(TimeSeriesBase):
    """Common helper for the volatility operators."""

    @staticmethod
    def _get_returns(ts: pd.Series, freq: str) -> pd.Series:
        """Return the percentage change of ``ts`` over ``freq``, with NaN replaced by 0."""
        pct = ts.pct_change(freq=freq)
        return pct.fillna(0)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EWMAVolatility(VolatilityBase, ParametrizedOperator):
    """Exponentially weighted volatility of returns, written as ``ewma_vol_<window>``."""

    # Returns are percentage changes over ``step_size`` ``step_unit``.
    step_size: int = 1
    step_unit: str = "D"
    # Span passed to ``ewm``.
    window_size: int

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        return self._add_offset_to_formula(f"ewma_vol_{self.window_size}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["EWMAVolatility"]:
        """Build an ``EWMAVolatility`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            an ``EWMAVolatility``.
        """
        import re

        base_regex = r"ewma_vol_(\d+)"

        # The offset part is split off first; the helper is defined outside this class.
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if not parsed:
            return None

        kwargs = {"window_size": int(parsed.group(1))}
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the step and window settings."""
        params = super().get_params()
        params["step_size"] = self.step_size
        params["step_unit"] = self.step_unit
        params["window_size"] = self.window_size
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the EWMA volatility to every column of ``ts``."""
        # NOTE(review): unlike the sibling operators this returns every column,
        # not only the last one - confirm that this is intended.
        return ts.apply(self._ewma_vol)

    def _ewma_vol(self, x):
        """Return the EWM standard deviation of the returns of the last column of ``x``."""
        series = pd.DataFrame(x).iloc[:, -1]
        step = f"{self.step_size}{self.step_unit}"
        returns = self._get_returns(series, step)
        return returns.ewm(span=self.window_size).std()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class RollingVolBase(VolatilityBase):
    """Shared fields and rolling-volatility computation for the rolling operators."""

    # Returns are percentage changes over ``step_size`` ``step_unit``.
    step_size: int = 1
    step_unit: str = "D"
    # Length and unit of the rolling window over the returns.
    window_size: int
    window_unit: str = "D"

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the step and window settings."""
        params = super().get_params()
        params["step_size"] = self.step_size
        params["step_unit"] = self.step_unit
        params["window_size"] = self.window_size
        params["window_unit"] = self.window_unit
        return params

    def _rolling_vol(
        self, x: Union[pd.DataFrame, pd.Series], window_size: int, window_unit: str, abs_returns: bool = False
    ) -> Union[pd.DataFrame, pd.Series]:
        """Replace the last column of ``x`` with the rolling std of its returns.

        Args:
            x: A Series, or a DataFrame whose last column holds the values.
            window_size: Rolling window length, in ``window_unit`` units.
            window_unit: Unit of the rolling window.
            abs_returns: When true, absolute returns are used.

        Returns:
            A Series when a Series was passed in, otherwise the DataFrame with
            its last column overwritten by the volatility.
        """
        was_series = isinstance(x, pd.Series)
        frame = pd.DataFrame(x)
        step = f"{self.step_size}{self.step_unit}"
        returns = self._get_returns(frame.iloc[:, -1], step)
        if abs_returns:
            returns = returns.abs()
        window = f"{window_size}{window_unit}"
        frame.iloc[:, -1] = returns.rolling(window, min_periods=1).std()
        if was_series:
            return frame.iloc[:, -1]
        return frame
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class RollingVolatility(RollingVolBase, ParametrizedOperator):
    """Rolling standard deviation of returns, written as ``roll_vol_<size><unit>``."""

    # When true, volatility is computed on absolute returns.
    # NOTE(review): this flag is not encoded by ``to_formula`` / ``from_formula``.
    abs_returns: bool = False

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        return self._add_offset_to_formula(f"roll_vol_{self.window_size}{self.window_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["RollingVolatility"]:
        """Build a ``RollingVolatility`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``RollingVolatility``.
        """
        import re

        base_regex = r"roll_vol_(\d+)([a-zA-Z])"

        # The offset part is split off first; the helper is defined outside this class.
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if not parsed:
            return None

        size, unit = parsed.groups()
        kwargs = {"window_size": int(size), "window_unit": unit}
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the rolling volatility to every column and keep only the last one."""
        volatility = ts.apply(
            self._rolling_vol,
            window_size=self.window_size,
            window_unit=self.window_unit,
            abs_returns=self.abs_returns,
        )
        return volatility.iloc[:, [-1]]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class RollingVolatility2(RollingVolBase, ParametrizedOperator):
    """
    Volatility of volatility: the rolling volatility is computed once on absolute
    returns and then again on the result. Written as ``roll_vol2_<size><unit>``.
    """

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        return self._add_offset_to_formula(f"roll_vol2_{self.window_size}{self.window_unit}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["RollingVolatility2"]:
        """Build a ``RollingVolatility2`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``RollingVolatility2``.
        """
        import re

        base_regex = r"roll_vol2_(\d+)([a-zA-Z])"

        # The offset part is split off first; the helper is defined outside this class.
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if not parsed:
            return None

        size, unit = parsed.groups()
        kwargs = {"window_size": int(size), "window_unit": unit}
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the volatility-of-volatility to every column and keep only the last one."""
        result = ts.apply(self._vol_on_vol)
        return result.iloc[:, [-1]]

    def _vol_on_vol(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Return the rolling volatility of the rolling volatility of ``x``."""
        # First pass uses absolute returns; the second pass uses signed returns
        # of the first-pass volatility.
        inner = self._rolling_vol(x, self.window_size, self.window_unit, abs_returns=True)
        return self._rolling_vol(inner, self.window_size, self.window_unit, abs_returns=False)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class VolatilityRatio(RollingVolBase, ParametrizedOperator):
    """
    Ratio of short-window rolling volatility to long-window rolling volatility.
    Written as ``vol_ratio_<short><unit>_to_<long><unit>`` in formulas.
    """

    # Short window; the long window is ``window_size`` / ``window_unit`` from the base class.
    short_window_size: int
    short_window_unit: str = "D"

    def to_formula(self) -> str:
        """Render the operator as a formula string, including any offset suffix."""
        short_part = f"{self.short_window_size}{self.short_window_unit}"
        long_part = f"{self.window_size}{self.window_unit}"
        return self._add_offset_to_formula(f"vol_ratio_{short_part}_to_{long_part}")

    @classmethod
    def from_formula(cls, formula: str) -> Optional["VolatilityRatio"]:
        """Build a ``VolatilityRatio`` from its formula string.

        Returns:
            The parsed operator, or ``None`` when ``formula`` does not describe
            a ``VolatilityRatio``.
        """
        import re

        base_regex = r"vol_ratio_(\d+)([a-zA-Z])_to_(\d+)([a-zA-Z])"

        # The offset part is split off first; the helper is defined outside this class.
        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
        if remaining_formula is None:
            return None

        parsed = re.match(f"^{base_regex}$", remaining_formula)
        if not parsed:
            return None

        short_size, short_unit, long_size, long_unit = parsed.groups()
        kwargs = {
            "short_window_size": int(short_size),
            "short_window_unit": short_unit,
            "window_size": int(long_size),
            "window_unit": long_unit,
        }
        if offset_params:
            kwargs.update(offset_params)
        return cls(**kwargs)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the short-window settings."""
        params = super().get_params()
        params["short_window_size"] = self.short_window_size
        params["short_window_unit"] = self.short_window_unit
        return params

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the volatility ratio to every column and keep only the last one."""
        ratios = ts.apply(self._vol_ratio)
        return ratios.iloc[:, [-1]]

    def _vol_ratio(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Return short-window volatility divided by long-window volatility."""
        numerator = self._rolling_vol(x, self.short_window_size, self.short_window_unit)
        denominator = self._rolling_vol(x, self.window_size, self.window_unit)
        return VolatilityRatio._handle_div_errors(numerator / denominator)

    @staticmethod
    def _handle_div_errors(x: pd.Series) -> pd.Series:
        """Map infinite and missing ratios to 1."""
        finite = x.replace([np.inf, -np.inf], np.nan)
        return finite.fillna(1)
|
upgini/autofe/unary.py
CHANGED
|
@@ -2,10 +2,10 @@ from typing import Dict, Optional
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.
|
|
5
|
+
from upgini.autofe.operator import PandasOperator, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class Abs(
|
|
8
|
+
class Abs(PandasOperator, VectorizableMixin):
|
|
9
9
|
name: str = "abs"
|
|
10
10
|
is_unary: bool = True
|
|
11
11
|
is_vectorizable: bool = True
|
|
@@ -20,7 +20,7 @@ class Abs(PandasOperand, VectorizableMixin):
|
|
|
20
20
|
# return data.abs()
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
class Log(
|
|
23
|
+
class Log(PandasOperator, VectorizableMixin):
|
|
24
24
|
name: str = "log"
|
|
25
25
|
is_unary: bool = True
|
|
26
26
|
is_vectorizable: bool = True
|
|
@@ -34,7 +34,7 @@ class Log(PandasOperand, VectorizableMixin):
|
|
|
34
34
|
return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
|
|
35
35
|
|
|
36
36
|
|
|
37
|
-
class Sqrt(
|
|
37
|
+
class Sqrt(PandasOperator, VectorizableMixin):
|
|
38
38
|
name: str = "sqrt"
|
|
39
39
|
is_unary: bool = True
|
|
40
40
|
is_vectorizable: bool = True
|
|
@@ -48,7 +48,7 @@ class Sqrt(PandasOperand, VectorizableMixin):
|
|
|
48
48
|
return self._round_value(np.sqrt(data.abs()))
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
class Square(
|
|
51
|
+
class Square(PandasOperator, VectorizableMixin):
|
|
52
52
|
name: str = "square"
|
|
53
53
|
is_unary: bool = True
|
|
54
54
|
is_vectorizable: bool = True
|
|
@@ -61,7 +61,7 @@ class Square(PandasOperand, VectorizableMixin):
|
|
|
61
61
|
return np.square(data)
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
class Sigmoid(
|
|
64
|
+
class Sigmoid(PandasOperator, VectorizableMixin):
|
|
65
65
|
name: str = "sigmoid"
|
|
66
66
|
is_unary: bool = True
|
|
67
67
|
is_vectorizable: bool = True
|
|
@@ -75,7 +75,7 @@ class Sigmoid(PandasOperand, VectorizableMixin):
|
|
|
75
75
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
76
76
|
|
|
77
77
|
|
|
78
|
-
class Floor(
|
|
78
|
+
class Floor(PandasOperator, VectorizableMixin):
|
|
79
79
|
name: str = "floor"
|
|
80
80
|
is_unary: bool = True
|
|
81
81
|
is_vectorizable: bool = True
|
|
@@ -90,7 +90,7 @@ class Floor(PandasOperand, VectorizableMixin):
|
|
|
90
90
|
return np.floor(data)
|
|
91
91
|
|
|
92
92
|
|
|
93
|
-
class Residual(
|
|
93
|
+
class Residual(PandasOperator, VectorizableMixin):
|
|
94
94
|
name: str = "residual"
|
|
95
95
|
is_unary: bool = True
|
|
96
96
|
is_vectorizable: bool = True
|
|
@@ -104,7 +104,7 @@ class Residual(PandasOperand, VectorizableMixin):
|
|
|
104
104
|
return data - np.floor(data)
|
|
105
105
|
|
|
106
106
|
|
|
107
|
-
class Freq(
|
|
107
|
+
class Freq(PandasOperator):
|
|
108
108
|
name: str = "freq"
|
|
109
109
|
is_unary: bool = True
|
|
110
110
|
output_type: Optional[str] = "float"
|
|
@@ -116,7 +116,7 @@ class Freq(PandasOperand):
|
|
|
116
116
|
return self._loc(data, value_counts)
|
|
117
117
|
|
|
118
118
|
|
|
119
|
-
class Norm(
|
|
119
|
+
class Norm(PandasOperator):
|
|
120
120
|
name: str = "norm"
|
|
121
121
|
is_unary: bool = True
|
|
122
122
|
output_type: Optional[str] = "float"
|
|
@@ -148,7 +148,7 @@ class Norm(PandasOperand):
|
|
|
148
148
|
return res
|
|
149
149
|
|
|
150
150
|
|
|
151
|
-
class Embeddings(
|
|
151
|
+
class Embeddings(PandasOperator):
|
|
152
152
|
name: str = "emb"
|
|
153
153
|
is_unary: bool = True
|
|
154
154
|
input_type: Optional[str] = "string"
|