openenergyid 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openenergyid/__init__.py +8 -0
- openenergyid/abstractsim/__init__.py +5 -0
- openenergyid/abstractsim/abstract.py +102 -0
- openenergyid/baseload/__init__.py +15 -0
- openenergyid/baseload/analysis.py +190 -0
- openenergyid/baseload/exceptions.py +9 -0
- openenergyid/baseload/models.py +32 -0
- openenergyid/capacity/__init__.py +6 -0
- openenergyid/capacity/main.py +103 -0
- openenergyid/capacity/models.py +32 -0
- openenergyid/const.py +29 -0
- openenergyid/dyntar/__init__.py +20 -0
- openenergyid/dyntar/const.py +31 -0
- openenergyid/dyntar/main.py +313 -0
- openenergyid/dyntar/models.py +101 -0
- openenergyid/elia/__init__.py +4 -0
- openenergyid/elia/api.py +91 -0
- openenergyid/elia/const.py +18 -0
- openenergyid/energysharing/__init__.py +12 -0
- openenergyid/energysharing/const.py +8 -0
- openenergyid/energysharing/data_formatting.py +77 -0
- openenergyid/energysharing/main.py +122 -0
- openenergyid/energysharing/models.py +80 -0
- openenergyid/enums.py +16 -0
- openenergyid/models.py +174 -0
- openenergyid/mvlr/__init__.py +19 -0
- openenergyid/mvlr/helpers.py +30 -0
- openenergyid/mvlr/main.py +34 -0
- openenergyid/mvlr/models.py +227 -0
- openenergyid/mvlr/mvlr.py +450 -0
- openenergyid/pvsim/__init__.py +8 -0
- openenergyid/pvsim/abstract.py +60 -0
- openenergyid/pvsim/elia/__init__.py +3 -0
- openenergyid/pvsim/elia/main.py +89 -0
- openenergyid/pvsim/main.py +49 -0
- openenergyid/pvsim/pvlib/__init__.py +11 -0
- openenergyid/pvsim/pvlib/main.py +115 -0
- openenergyid/pvsim/pvlib/models.py +235 -0
- openenergyid/pvsim/pvlib/quickscan.py +99 -0
- openenergyid/pvsim/pvlib/weather.py +91 -0
- openenergyid/sim/__init__.py +5 -0
- openenergyid/sim/main.py +67 -0
- openenergyid/simeval/__init__.py +6 -0
- openenergyid/simeval/main.py +148 -0
- openenergyid/simeval/models.py +162 -0
- openenergyid-0.1.31.dist-info/METADATA +32 -0
- openenergyid-0.1.31.dist-info/RECORD +50 -0
- openenergyid-0.1.31.dist-info/WHEEL +5 -0
- openenergyid-0.1.31.dist-info/licenses/LICENSE +21 -0
- openenergyid-0.1.31.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""Models for multivariable linear regression."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import statsmodels.formula.api as fm
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
8
|
+
|
|
9
|
+
from openenergyid.enums import Granularity
|
|
10
|
+
from openenergyid.models import TimeDataFrame
|
|
11
|
+
|
|
12
|
+
from .mvlr import MultiVariableLinearRegression
|
|
13
|
+
|
|
14
|
+
COLUMN_TEMPERATUREEQUIVALENT = "temperatureEquivalent"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
######################
|
|
18
|
+
# MVLR Input Models #
|
|
19
|
+
######################
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ValidationParameters(BaseModel):
    """Acceptance thresholds for a multivariable linear regression model.

    Every threshold is constrained to the closed interval [0, 1].
    """

    # Lower bound the adjusted R-squared has to reach.
    rsquared: float = Field(
        default=0.75,
        ge=0,
        le=1,
        description="Minimum acceptable value for the adjusted R-squared",
    )
    # Upper bound related to the F-statistic of the fit.
    # NOTE(review): the field name suggests this is compared to the p-value
    # of the F-statistic rather than to the statistic itself - confirm.
    f_pvalue: float = Field(
        default=0.05,
        ge=0,
        le=1,
        description="Maximum acceptable value for the F-statistic",
    )
    # Upper bound for the p-values of the t-statistic.
    pvalues: float = Field(
        default=0.05,
        ge=0,
        le=1,
        description="Maximum acceptable value for the p-values of the t-statistic",
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class IndependentVariableInput(BaseModel):
    """
    Description of one independent variable.

    The name has to correspond to a column in the data frame.
    """

    # Column name; `temperatureEquivalent` is expanded into its variants.
    name: str = Field(
        description=(
            "Name of the independent variable. "
            "If the name is `temperatureEquivalent`, "
            "it will be unpacked into columns according to the variants."
        )
    )
    # Degree-day variants, written as `<PREFIX>_<base temperature>`.
    variants: list[str] | None = Field(
        description=(
            "Variants of the `temperatureEquivalent` independent variable. "
            "Eg. `HDD_16.5` will be Heating Degree Days with a base temperature of 16.5°C, "
            "`CDD_0` will be Cooling Degree Days with a base temperature of 0°C."
        ),
        default=None,
    )
    # When False, the variable is reported as not allowed to get a negative coefficient.
    allow_negative_coefficient: bool = Field(
        description="Whether the coefficient can be negative.",
        alias="allowNegativeCoefficient",
        default=True,
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class MultiVariableRegressionInput(BaseModel):
    """Multi-variable regression input.

    Bundles the time series frame, the names of the dependent and independent
    variables and the options of the analysis.
    """

    timezone: str = Field(alias="timeZone")
    independent_variables: list[IndependentVariableInput] = Field(
        alias="independentVariables", min_length=1
    )
    dependent_variable: str = Field(alias="dependentVariable")
    frame: TimeDataFrame
    granularities: list[Granularity]
    allow_negative_predictions: bool = Field(alias="allowNegativePredictions", default=False)
    validation_parameters: ValidationParameters = Field(
        alias="validationParameters", default=ValidationParameters()
    )
    single_use_exog_prefixes: list[str] | None = Field(
        # NOTE: a default of ["HDD", "CDD", "FDD"] is present but disabled in the original.
        default=None,
        alias="singleUseExogPrefixes",
        description="List of prefixes to be used as single-use exogenous variables.",
    )

    def model_post_init(self, __context: Any) -> None:
        """Post init hook: check the independent variables against the frame.

        Raises
        ------
        ValueError
            If an independent variable is not a column of the frame.
        """
        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
            if iv.name not in self.frame.columns:
                raise ValueError(f"Independent variable {iv.name} not found in the data frame.")

        return super().model_post_init(__context)

    def _data_frame(self) -> pd.DataFrame:
        """Convert the frame to a pandas DataFrame in the configured time zone."""
        return self.frame.to_pandas(timezone=self.timezone)

    def data_frame(self) -> pd.DataFrame:
        """
        Return the data frame ready for analysis.

        Unpacks degree days and removes unnecessary columns.

        If an independent variable named `temperatureEquivalent` has variants,
        one column per variant is computed from the `temperatureEquivalent` column.
        Eg. variant "HDD_16.5" will be Heating Degree Days
        with a base temperature of 16.5°C,
        "CDD_0" will be Cooling Degree Days with a base temperature of 0°C.
        Any prefix other than "CDD" is computed like heating degree days
        (base temperature minus temperature). Degree days are never negative.

        Returns
        -------
        pd.DataFrame
            Copy holding the dependent variable followed by the retained
            independent variables and the computed degree-day columns.

        Raises
        ------
        ValueError
            If a variant is not of the form `<PREFIX>_<base temperature>`.
        """
        frame = self._data_frame()
        columns_to_retain = [self.dependent_variable]

        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
            if iv.name != COLUMN_TEMPERATUREEQUIVALENT or iv.variants is None:
                columns_to_retain.append(iv.name)
                continue

            temperature = frame[COLUMN_TEMPERATUREEQUIVALENT]
            for variant in iv.variants:
                prefix, separator, base = variant.partition("_")
                if not separator or "_" in base:
                    raise ValueError(
                        f"Variant {variant!r} must have the form '<PREFIX>_<base temperature>'."
                    )
                base_temperature = float(base)
                if prefix == "CDD":
                    degree_days = temperature - base_temperature
                else:
                    degree_days = base_temperature - temperature
                frame[variant] = degree_days.clip(lower=0)
                columns_to_retain.append(variant)

        # The source column is not dropped explicitly: the selection below
        # already leaves it out unless another variable asks for it.
        return frame[columns_to_retain].copy()

    def get_disallowed_negative_coefficients(self) -> list[str]:
        """Get independent variables that are not allowed to have a negative coefficient.

        A `temperatureEquivalent` variable with variants contributes its
        variant names instead of its own name.
        """
        result = []
        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
            if iv.allow_negative_coefficient:
                continue
            if iv.name == COLUMN_TEMPERATUREEQUIVALENT and iv.variants is not None:
                result.extend(iv.variants)
            else:
                result.append(iv.name)
        return result
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
######################
|
|
144
|
+
# MVLR Result Models #
|
|
145
|
+
######################
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class ConfidenceInterval(BaseModel):
    """Lower and upper bound of a coefficient at a given confidence level."""

    # Confidence level, constrained to [0, 1].
    confidence: float = Field(le=1, ge=0)
    # Bounds of the interval.
    lower: float
    upper: float
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class IndependentVariableResult(BaseModel):
    """Independent variable for a multivariable linear regression model.

    Holds the estimated coefficient and, optionally, its t-statistic,
    p-value, standard error and confidence interval.
    """

    name: str
    coef: float
    t_stat: float | None = Field(default=None, alias="tStat")
    p_value: float | None = Field(ge=0, le=1, default=None, alias="pValue")
    std_err: float | None = Field(default=None, alias="stdErr")
    confidence_interval: ConfidenceInterval | None = Field(default=None, alias="confidenceInterval")

    model_config = ConfigDict(populate_by_name=True)

    @classmethod
    def from_fit(cls, fit: fm.ols, name: str) -> "IndependentVariableResult":
        """Create an IndependentVariableResult from a fit.

        Parameters
        ----------
        fit : fitted statsmodels model
            Has to expose `params`, `tvalues`, `pvalues`, `bse` and `conf_int()`.
        name : str
            Name of the parameter to extract, e.g. a column name or "Intercept".
        """
        # Compute the interval table once; its row for `name` holds the
        # lower bound under label 0 and the upper bound under label 1.
        bounds = fit.conf_int().loc[name]
        return cls(
            name=name,
            coef=fit.params[name],
            t_stat=fit.tvalues[name],
            p_value=fit.pvalues[name],
            std_err=fit.bse[name],
            confidence_interval=ConfidenceInterval(
                # conf_int() is called with its default settings; the level is
                # reported as 0.95 here.
                confidence=0.95,
                lower=bounds[0],
                upper=bounds[1],
            ),
        )
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class MultiVariableRegressionResult(BaseModel):
    """Outcome of a multivariable regression: statistics, coefficients and data."""

    dependent_variable: str = Field(alias="dependentVariable")
    independent_variables: list[IndependentVariableResult] = Field(alias="independentVariables")
    r2: float = Field(alias="rSquared", ge=0, le=1)
    r2_adj: float = Field(alias="rSquaredAdjusted", ge=0, le=1)
    f_stat: float = Field(alias="fStat", ge=0)
    prob_f_stat: float = Field(alias="probFStat", ge=0, le=1)
    intercept: IndependentVariableResult
    granularity: Granularity
    frame: TimeDataFrame

    model_config = ConfigDict(populate_by_name=True)

    @classmethod
    def from_mvlr(cls, mvlr: MultiVariableLinearRegression) -> "MultiVariableRegressionResult":
        """Build the result model from an analysed MultiVariableLinearRegression."""
        selected_fit = mvlr.fit

        # Every parameter except the intercept is an independent variable.
        exog_names = selected_fit.params.keys().tolist()
        exog_names.remove("Intercept")
        coefficient_results = [
            IndependentVariableResult.from_fit(selected_fit, exog) for exog in exog_names
        ]

        # Keep only the selected variables and the dependent variable in the frame.
        retained = [*exog_names, mvlr.y]
        surplus = [column for column in mvlr.data.columns.values if column not in retained]
        trimmed = mvlr.data.drop(surplus, axis=1)

        return cls(
            dependent_variable=mvlr.y,
            independent_variables=coefficient_results,
            r2=selected_fit.rsquared,
            r2_adj=selected_fit.rsquared_adj,
            f_stat=selected_fit.fvalue,
            prob_f_stat=selected_fit.f_pvalue,
            intercept=IndependentVariableResult.from_fit(selected_fit, "Intercept"),
            granularity=mvlr.granularity,
            frame=TimeDataFrame.from_pandas(trimmed),
        )
|
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
"""Multi-variable linear regression based on statsmodels
|
|
2
|
+
and Ordinary Least Squares (ols)."""
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import statsmodels.formula.api as fm
|
|
7
|
+
from patsy import LookupFactor, ModelDesc, Term # pylint: disable=no-name-in-module
|
|
8
|
+
from statsmodels.sandbox.regression.predstd import wls_prediction_std
|
|
9
|
+
|
|
10
|
+
from openenergyid.enums import Granularity
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MultiVariableLinearRegression:
    """Multi-variable linear regression.

    Based on statsmodels and Ordinary Least Squares (ols).

    Pass a dataframe with the variable to be modelled y (dependent variable)
    and the possible independent variables x.
    Specify as string the name of the dependent variable, and optionally pass a list with names of
    independent variables to try
    (by default all other columns will be tried as independent variables).

    The analysis is based on a forward-selection approach: starting from a simple model,
    the model is iteratively refined and verified until no statistical relevant improvements
    can be obtained.
    Each model in the iteration loop is stored in the attribute self.list_of_fits.
    The selected model is self.fit (=pointer to the last element of self.list_of_fits).

    The dataframe can contain daily, weekly, monthly, yearly ... values. Each row is an instance.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        y: str,
        p_max: float = 0.05,
        list_of_x: list[str] | None = None,
        confint: float = 0.95,
        cross_validation: bool = False,
        allow_negative_predictions: bool = False,
        granularity: Granularity | None = None,
        single_use_exog_prefixes: list[str] | None = None,
        exogs__disallow_negative_coefficient: list[str] | None = None,
    ):
        """Parameters
        ----------
        data : pd.DataFrame
            Both independent variables (x) and dependent variable (y) as columns.
            A copy is stored, the passed frame is not modified.
        y : str
            Name of the dependent (endogeneous) variable to model
        p_max : float (default=0.05)
            Acceptable p-value of the t-statistic for estimated parameters
        list_of_x : list of str (default=None)
            If None or empty, try to build a model with all columns in the dataframe
            If a list with column names is given, only try these columns as independent variables
        confint : float, default=0.95
            Two-sided confidence interval for predictions.
        cross_validation : bool, default=False
            If True, compute the model based on cross-validation (leave one out)
            Only possible if the df has less than 15 entries.
            Note: this will take much longer computation times!
        allow_negative_predictions : bool, default=False
            If True, allow predictions to be negative.
            For gas consumption or PV production, this is not physical
            so allow_negative_predictions should be False
        granularity : Granularity, default=None
            Granularity of the data. Is only used for the output of the model.
            If None, the granularity is not set.
        single_use_exog_prefixes : list of str, default=None
            List of variable prefixes that indicate a variable type that should only be used once.
            For example, if the list contains "HDD", only one of the columns "HDD1", "HDD2", "HDD3" etc.
            will be used as an independent variable.
            Once the best fit using a variable with a given prefix is found, the other variables with the same
            prefix will not be used as independent variables.
        exogs__disallow_negative_coefficient : list of str, default=None
            List of variable names for which the coefficient is not allowed to be negative.

        Raises
        ------
        AssertionError
            If `y` is not a column of `data`.
        """
        self.data = data.copy()
        if y not in self.data.columns:
            raise AssertionError(
                f"The dependent variable {y} is not a column in the dataframe",
            )
        self.y = y

        self.p_max = p_max
        self.list_of_x = list_of_x or [x for x in self.data.columns if x != self.y]
        self.confint = confint
        self.cross_validation = cross_validation
        self.allow_negative_predictions = allow_negative_predictions
        self.granularity = granularity
        self.single_use_exog_prefixes = single_use_exog_prefixes
        self.exogs__disallow_negative_coefficient = exogs__disallow_negative_coefficient
        self._fit = None
        self._list_of_fits = []
        self.list_of_cverrors = []

    @property
    def fit(self) -> fm.ols:
        """Return the selected fit.

        Returns
        -------
        The fitted model.

        Raises
        ------
        UnboundLocalError: If `do_analysis()` has not been run before accessing `fit`.
        """
        if self._fit is None:
            raise UnboundLocalError(
                'Run "do_analysis()" first to fit a model to the data.',
            )
        return self._fit

    @property
    def list_of_fits(self) -> list[fm.ols]:
        """Returns the list of fits generated by the model.

        Raises
        ------
        UnboundLocalError: If the model has not been fitted yet.

        Returns
        -------
        list: The list of fits generated by the model.
        """
        if not self._list_of_fits:
            raise UnboundLocalError(
                'Run "do_analysis()" first to fit a model to the data.',
            )
        return self._list_of_fits

    def do_analysis(self):
        """Find the best model (fit) and create self.list_of_fits and self.fit"""
        if self.cross_validation:
            return self._do_analysis_cross_validation()
        return self._do_analysis_no_cross_validation()

    def _drop_single_use_candidates(self, candidates: dict, chosen: str) -> dict:
        """Return the candidates without those sharing a single-use prefix with `chosen`."""
        if not self.single_use_exog_prefixes:
            return candidates
        matching = tuple(
            prefix for prefix in self.single_use_exog_prefixes if chosen.startswith(prefix)
        )
        if not matching:
            return candidates
        return {name: term for name, term in candidates.items() if not name.startswith(matching)}

    def _leave_one_out_error(self, model_desc: ModelDesc) -> float:
        """Return the mean absolute leave-one-out prediction error of `model_desc`."""
        errors = []
        for i in self.data.index:
            # fit on everything but row i, then predict row i
            training = self.data.drop(i, axis=0)
            fit = fm.ols(model_desc, data=training).fit()
            held_out = self._predict(fit=fit, data=self.data.loc[[i], :])
            errors.append(held_out["predicted"] - held_out[self.y])
        return np.mean(np.abs(np.array(errors)))

    def _do_analysis_no_cross_validation(self):
        """Find the best model (fit) and create self.list_of_fits and self.fit

        Forward selection on the Bayesian information criterion (BIC): in every
        round the candidate that lowers the BIC the most is added, after which
        statistically insignificant parameters are pruned.
        """
        response_term = [Term([LookupFactor(self.y)])]
        candidates = {x: Term([LookupFactor(x)]) for x in self.list_of_x}

        # first model is just the mean (the empty term is the intercept)
        model_desc = ModelDesc(response_term, [Term([])])
        # Start from a fresh list so that repeated calls do not pile up stale fits.
        self._list_of_fits = [fm.ols(model_desc, data=self.data).fit()]

        # try to improve the model until no improvements can be found
        while candidates:
            # the first best_fit is the one from the previous round
            ref_fit = self._list_of_fits[-1]
            best_fit = ref_fit
            best_bic = ref_fit.bic
            best_x = None
            for x, term in candidates.items():
                # make new_fit, compare with best found so far
                model_desc = ModelDesc(
                    response_term,
                    ref_fit.model.formula.rhs_termlist + [term],
                )
                fit = fm.ols(model_desc, data=self.data).fit()

                # Skip candidates whose coefficient is negative while that is not allowed
                if (
                    self.exogs__disallow_negative_coefficient is not None
                    and x in self.exogs__disallow_negative_coefficient
                    and fit.params[x] < 0
                ):
                    continue

                if fit.bic < best_bic:
                    best_bic = fit.bic
                    best_fit = fit
                    best_x = x

            if best_x is None:
                # no candidate improved on the reference fit
                break

            # Sometimes, the obtained fit may be better, but contains unsignificant parameters.
            # Correct the fit by removing the unsignificant parameters and estimate again
            best_fit = self._prune(best_fit, p_max=self.p_max)

            # if best_fit does not contain more variables than ref fit, exit
            if len(best_fit.model.formula.rhs_termlist) == len(
                ref_fit.model.formula.rhs_termlist,
            ):
                break

            self._list_of_fits.append(best_fit)
            candidates.pop(best_x)

            # If `best_x` starts with a prefix that should only be used once,
            # remove all other variables with the same prefix from the candidates
            candidates = self._drop_single_use_candidates(candidates, best_x)

        self._fit = self._list_of_fits[-1]

    def _do_analysis_cross_validation(self):
        """Find the best model (fit) based on cross-valiation (leave one out)

        NOTE: `exogs__disallow_negative_coefficient` and pruning on `p_max`
        are not applied in this mode.

        Raises
        ------
        AssertionError
            If the data has 15 rows or more.
        """
        # Raised explicitly so the check also holds when assertions are disabled (-O).
        if len(self.data) >= 15:
            raise AssertionError(
                "Cross-validation is not implemented if your sample contains more than 15 datapoints"
            )

        # initialization: first model is the mean, but compute cv correctly.
        response_term = [Term([LookupFactor(self.y)])]
        model_desc = ModelDesc(response_term, [Term([])])  # empty term is the intercept
        self._list_of_fits = [fm.ols(model_desc, data=self.data).fit()]
        self.list_of_cverrors = [self._leave_one_out_error(model_desc)]

        # try to improve the model until no improvements can be found
        candidates = {x: Term([LookupFactor(x)]) for x in self.list_of_x}
        while candidates:
            # try each candidate and remember the best one;
            # at the end of the round (and not earlier), save the best of the round
            ref_fit = self._list_of_fits[-1]
            best_fit = None
            best_cverror = self.list_of_cverrors[-1]
            best_x = None
            for x, term in candidates.items():
                model_desc = ModelDesc(
                    response_term,
                    ref_fit.model.formula.rhs_termlist + [term],
                )
                cverror = self._leave_one_out_error(model_desc)
                if cverror < best_cverror:
                    # better model, keep it; reidentify using all the datapoints
                    best_fit = fm.ols(model_desc, data=self.data).fit()
                    best_cverror = cverror
                    best_x = x

            if best_x is None:
                # if we did not find a better model, exit
                break

            self._list_of_fits.append(best_fit)
            self.list_of_cverrors.append(best_cverror)

            # next iteration with the found exog removed
            candidates.pop(best_x)

            # If `best_x` starts with a prefix that should only be used once,
            # remove all other variables with the same prefix from the candidates
            candidates = self._drop_single_use_candidates(candidates, best_x)

        self._fit = self._list_of_fits[-1]

    def _prune(self, fit: fm.ols, p_max: float) -> fm.ols:
        """If the fit contains statistically insignificant parameters, remove them.
        Returns a pruned fit where all parameters have p-values of the t-statistic below p_max

        Parameters are removed one at a time and the model is estimated again
        after every removal. The intercept is never removed.

        Parameters
        ----------
        fit: fm.ols fit object
            Can contain insignificant parameters
        p_max : float
            Maximum allowed probability of the t-statistic

        Returns
        -------
        fit: fm.ols fit object
            Won't contain any insignificant parameters

        """

        def remove_from_model_desc(x: str, model_desc: ModelDesc) -> ModelDesc:
            """Return a model_desc without x"""
            rhs_termlist = [
                t
                for t in model_desc.rhs_termlist
                # a term without factors is the intercept, which is always kept
                if not t.factors or x != t.factors[0]._varname  # pylint: disable=protected-access
            ]
            return ModelDesc(model_desc.lhs_termlist, rhs_termlist)

        def insignificant_parameters(candidate: fm.ols) -> list:
            """Return the non-intercept parameters with a p-value above p_max"""
            pvalues = candidate.pvalues
            return [name for name in pvalues[pvalues > p_max].index.tolist() if name != "Intercept"]

        corrected_model_desc = ModelDesc(
            fit.model.formula.lhs_termlist[:],
            fit.model.formula.rhs_termlist[:],
        )
        pars_to_prune = insignificant_parameters(fit)
        while pars_to_prune:
            reduced_model_desc = remove_from_model_desc(pars_to_prune[0], corrected_model_desc)
            if len(reduced_model_desc.rhs_termlist) == len(corrected_model_desc.rhs_termlist):
                # No term matched the parameter name: stop instead of looping forever.
                break
            corrected_model_desc = reduced_model_desc
            fit = fm.ols(corrected_model_desc, data=self.data).fit()
            pars_to_prune = insignificant_parameters(fit)
        return fit

    @staticmethod
    def find_best_rsquared(list_of_fits: list[fm.ols]) -> fm.ols:
        """Return the best fit, based on rsquared (highest value)"""
        res = sorted(list_of_fits, key=lambda x: x.rsquared)
        return res[-1]

    @staticmethod
    def find_best_akaike(list_of_fits: list[fm.ols]) -> fm.ols:
        """Return the best fit, based on Akaike information criterion (lowest value)"""
        res = sorted(list_of_fits, key=lambda x: x.aic)
        return res[0]

    @staticmethod
    def find_best_bic(list_of_fits: list[fm.ols]) -> fm.ols:
        """Return the best fit, based on Bayesian information criterion (lowest value)"""
        res = sorted(list_of_fits, key=lambda x: x.bic)
        return res[0]

    def _predict(self, fit: fm.ols, data: pd.DataFrame) -> pd.DataFrame:
        """Return a df with predictions and confidence interval

        Notes
        -----
        The df will contain the following columns:
        - 'predicted': the model output
        - 'interval_u', 'interval_l': upper and lower confidence bounds.
        The result will depend on the following attributes of self:
        confint : float (default=0.95)
            Confidence level for two-sided hypothesis
        allow_negative_predictions : bool (default=False)
            If False, correct negative predictions to zero
            (typically for energy consumption predictions).
            The confidence bounds are not corrected.

        Parameters
        ----------
        fit : Statsmodels fit
        data : pandas DataFrame
            Rows to predict. Has to contain the columns used by the fit.

        Returns
        -------
        result : pandas DataFrame
            Copy of data with additional columns 'predicted', 'interval_u' and 'interval_l'
        """
        # Add model results to data as column 'predicted'
        result = data.copy()
        if "Intercept" in fit.model.exog_names:
            # temporary column, needed to select the exogenous values below
            result["Intercept"] = 1.0
        result["predicted"] = fit.predict(result)
        if not self.allow_negative_predictions:
            result.loc[result["predicted"] < 0, "predicted"] = 0

        _prstd, interval_l, interval_u = wls_prediction_std(
            fit,
            result[fit.model.exog_names],
            alpha=1 - self.confint,
        )
        result["interval_l"] = interval_l
        result["interval_u"] = interval_u

        if "Intercept" in result:
            result.drop(labels=["Intercept"], axis=1, inplace=True)

        return result

    def add_prediction(self):
        """Add predictions and confidence interval to self.data
        self.data will contain the following columns:
        - 'predicted': the model output
        - 'interval_u', 'interval_l': upper and lower confidence bounds.

        Parameters
        ----------
        None, but the result depends on the following attributes of self:
        confint : float (default=0.95)
            Confidence level for two-sided hypothesis
        allow_negative_predictions : bool (default=False)
            If False, correct negative predictions to zero
            (typically for energy consumption predictions)

        Returns
        -------
        Nothing, adds columns to self.data

        Raises
        ------
        UnboundLocalError: If `do_analysis()` has not been run yet.
        """
        self.data = self._predict(fit=self.fit, data=self.data)

    def validate(
        self, min_rsquared: float = 0.75, max_f_pvalue: float = 0.05, max_pvalues: float = 0.05
    ) -> bool:
        """Checks if the model is valid.

        The model is valid when the adjusted R-squared is at least `min_rsquared`,
        the p-value of the F-statistic is at most `max_f_pvalue` and no parameter
        other than the intercept has a p-value above `max_pvalues`.

        Returns
        -------
        bool: True if the model is valid, False otherwise.
        """
        fit = self.fit
        if fit.rsquared_adj < min_rsquared:
            return False

        if fit.f_pvalue > max_f_pvalue:
            return False

        param_keys = fit.pvalues.keys().tolist()
        param_keys.remove("Intercept")
        # `not any(... > ...)` keeps the original outcome for NaN p-values.
        return not any(fit.pvalues[k] > max_pvalues for k in param_keys)
|