openenergyid 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. openenergyid/__init__.py +8 -0
  2. openenergyid/abstractsim/__init__.py +5 -0
  3. openenergyid/abstractsim/abstract.py +102 -0
  4. openenergyid/baseload/__init__.py +15 -0
  5. openenergyid/baseload/analysis.py +190 -0
  6. openenergyid/baseload/exceptions.py +9 -0
  7. openenergyid/baseload/models.py +32 -0
  8. openenergyid/capacity/__init__.py +6 -0
  9. openenergyid/capacity/main.py +103 -0
  10. openenergyid/capacity/models.py +32 -0
  11. openenergyid/const.py +29 -0
  12. openenergyid/dyntar/__init__.py +20 -0
  13. openenergyid/dyntar/const.py +31 -0
  14. openenergyid/dyntar/main.py +313 -0
  15. openenergyid/dyntar/models.py +101 -0
  16. openenergyid/elia/__init__.py +4 -0
  17. openenergyid/elia/api.py +91 -0
  18. openenergyid/elia/const.py +18 -0
  19. openenergyid/energysharing/__init__.py +12 -0
  20. openenergyid/energysharing/const.py +8 -0
  21. openenergyid/energysharing/data_formatting.py +77 -0
  22. openenergyid/energysharing/main.py +122 -0
  23. openenergyid/energysharing/models.py +80 -0
  24. openenergyid/enums.py +16 -0
  25. openenergyid/models.py +174 -0
  26. openenergyid/mvlr/__init__.py +19 -0
  27. openenergyid/mvlr/helpers.py +30 -0
  28. openenergyid/mvlr/main.py +34 -0
  29. openenergyid/mvlr/models.py +227 -0
  30. openenergyid/mvlr/mvlr.py +450 -0
  31. openenergyid/pvsim/__init__.py +8 -0
  32. openenergyid/pvsim/abstract.py +60 -0
  33. openenergyid/pvsim/elia/__init__.py +3 -0
  34. openenergyid/pvsim/elia/main.py +89 -0
  35. openenergyid/pvsim/main.py +49 -0
  36. openenergyid/pvsim/pvlib/__init__.py +11 -0
  37. openenergyid/pvsim/pvlib/main.py +115 -0
  38. openenergyid/pvsim/pvlib/models.py +235 -0
  39. openenergyid/pvsim/pvlib/quickscan.py +99 -0
  40. openenergyid/pvsim/pvlib/weather.py +91 -0
  41. openenergyid/sim/__init__.py +5 -0
  42. openenergyid/sim/main.py +67 -0
  43. openenergyid/simeval/__init__.py +6 -0
  44. openenergyid/simeval/main.py +148 -0
  45. openenergyid/simeval/models.py +162 -0
  46. openenergyid-0.1.31.dist-info/METADATA +32 -0
  47. openenergyid-0.1.31.dist-info/RECORD +50 -0
  48. openenergyid-0.1.31.dist-info/WHEEL +5 -0
  49. openenergyid-0.1.31.dist-info/licenses/LICENSE +21 -0
  50. openenergyid-0.1.31.dist-info/top_level.txt +1 -0
@@ -0,0 +1,227 @@
1
+ """Models for multivariable linear regression."""
2
+
3
+ from typing import Any
4
+
5
+ import pandas as pd
6
+ import statsmodels.formula.api as fm
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
+ from openenergyid.enums import Granularity
10
+ from openenergyid.models import TimeDataFrame
11
+
12
+ from .mvlr import MultiVariableLinearRegression
13
+
14
+ COLUMN_TEMPERATUREEQUIVALENT = "temperatureEquivalent"
15
+
16
+
17
+ ######################
18
+ # MVLR Input Models #
19
+ ######################
20
+
21
+
22
class ValidationParameters(BaseModel):
    """Thresholds used to accept or reject a multivariable linear regression model.

    Every threshold is constrained to the closed interval [0, 1].
    """

    # Lower limit, per the field description applied to the adjusted R-squared.
    rsquared: float = Field(
        default=0.75,
        ge=0,
        le=1,
        description="Minimum acceptable value for the adjusted R-squared",
    )
    # Upper limit related to the F-statistic of the whole model.
    f_pvalue: float = Field(
        default=0.05,
        ge=0,
        le=1,
        description="Maximum acceptable value for the F-statistic",
    )
    # Upper limit for the t-statistic p-values of the individual parameters.
    pvalues: float = Field(
        default=0.05,
        ge=0,
        le=1,
        description="Maximum acceptable value for the p-values of the t-statistic",
    )
34
+
35
+
36
class IndependentVariableInput(BaseModel):
    """
    Description of one independent variable of the regression.

    The name has to correspond to a column in the data frame.
    """

    # Column name; `temperatureEquivalent` gets special treatment (see description).
    name: str = Field(
        description=(
            "Name of the independent variable. "
            "If the name is `temperatureEquivalent`, "
            "it will be unpacked into columns according to the variants."
        ),
    )
    # Degree-day variants written as "<PREFIX>_<base temperature>".
    variants: list[str] | None = Field(
        default=None,
        description=(
            "Variants of the `temperatureEquivalent` independent variable. "
            "Eg. `HDD_16.5` will be Heating Degree Days with a base temperature of 16.5°C, "
            "`CDD_0` will be Cooling Degree Days with a base temperature of 0°C."
        ),
    )
    # Exposed to API clients under the camelCase alias.
    allow_negative_coefficient: bool = Field(
        default=True,
        alias="allowNegativeCoefficient",
        description="Whether the coefficient can be negative.",
    )
59
+
60
+
61
class MultiVariableRegressionInput(BaseModel):
    """Input payload for a multi-variable regression analysis."""

    timezone: str = Field(alias="timeZone")
    independent_variables: list[IndependentVariableInput] = Field(
        alias="independentVariables",
        min_length=1,
    )
    dependent_variable: str = Field(alias="dependentVariable")
    frame: TimeDataFrame
    granularities: list[Granularity]
    allow_negative_predictions: bool = Field(
        default=False,
        alias="allowNegativePredictions",
    )
    validation_parameters: ValidationParameters = Field(
        default=ValidationParameters(),
        alias="validationParameters",
    )
    single_use_exog_prefixes: list[str] | None = Field(
        default=None,
        alias="singleUseExogPrefixes",
        description="List of prefixes to be used as single-use exogenous variables.",
    )

    def model_post_init(self, __context: Any) -> None:
        """Verify, after model construction, that every independent variable is a frame column.

        Raises
        ------
        ValueError
            If the name of an independent variable is not found in ``self.frame.columns``.
        """
        for variable in self.independent_variables:  # pylint: disable=not-an-iterable
            if variable.name not in self.frame.columns:
                raise ValueError(
                    f"Independent variable {variable.name} not found in the data frame."
                )

        return super().model_post_init(__context)

    def _data_frame(self) -> pd.DataFrame:
        """Return the raw frame as a pandas DataFrame, using the configured timezone."""
        return self.frame.to_pandas(timezone=self.timezone)

    def data_frame(self) -> pd.DataFrame:
        """
        Build the data frame that is handed to the analysis.

        The result holds the dependent variable followed by one column per
        independent variable, in declaration order; all other columns are dropped.

        An independent variable named `temperatureEquivalent` that declares variants
        is replaced by one degree-day column per variant. A variant is written as
        "<PREFIX>_<base temperature>": prefix "CDD" yields `temperature - base`,
        any other prefix yields `base - temperature`. Negative values are clipped to 0.
        Eg. "HDD_16.5" gives Heating Degree Days with a base temperature of 16.5°C,
        "CDD_0" gives Cooling Degree Days with a base temperature of 0°C.
        """
        df = self._data_frame()
        retained = [self.dependent_variable]

        for variable in self.independent_variables:  # pylint: disable=not-an-iterable
            # Plain variables are passed through untouched.
            if variable.name != COLUMN_TEMPERATUREEQUIVALENT or variable.variants is None:
                retained.append(variable.name)
                continue

            temperature = df[COLUMN_TEMPERATUREEQUIVALENT]
            for variant in variable.variants:
                kind, base_temperature = variant.split("_")
                if kind == "CDD":
                    degree_days = temperature - float(base_temperature)
                else:
                    degree_days = float(base_temperature) - temperature
                df[variant] = degree_days
                df[variant] = df[variant].clip(lower=0)
                retained.append(variant)
            # The raw temperature column has been replaced by its variants.
            df.drop(columns=[COLUMN_TEMPERATUREEQUIVALENT], inplace=True)

        return df[retained].copy()

    def get_disallowed_negative_coefficients(self) -> list[str]:
        """List the column names whose coefficient must not be negative.

        For an unpacked `temperatureEquivalent` variable the variant names are
        returned instead of the variable name itself.
        """
        names = []
        for variable in self.independent_variables:  # pylint: disable=not-an-iterable
            if variable.allow_negative_coefficient:
                continue
            if variable.name == COLUMN_TEMPERATUREEQUIVALENT and variable.variants is not None:
                names.extend(variable.variants)
            else:
                names.append(variable.name)
        return names
141
+
142
+
143
+ ######################
144
+ # MVLR Result Models #
145
+ ######################
146
+
147
+
148
class ConfidenceInterval(BaseModel):
    """Lower and upper bound of a coefficient estimate at a given confidence level."""

    # Confidence level, restricted to the closed interval [0, 1].
    confidence: float = Field(ge=0, le=1)
    # Bounds of the interval.
    lower: float
    upper: float
154
+
155
+
156
class IndependentVariableResult(BaseModel):
    """Estimated parameter of a multivariable linear regression model.

    Only ``name`` and ``coef`` are required; the statistics are optional.
    Fields can be populated by attribute name or by their camelCase alias.
    """

    name: str
    coef: float
    t_stat: float | None = Field(default=None, alias="tStat")
    p_value: float | None = Field(ge=0, le=1, default=None, alias="pValue")
    std_err: float | None = Field(default=None, alias="stdErr")
    confidence_interval: ConfidenceInterval | None = Field(default=None, alias="confidenceInterval")

    model_config = ConfigDict(populate_by_name=True)

    @classmethod
    def from_fit(cls, fit: fm.ols, name: str) -> "IndependentVariableResult":
        """Create an IndependentVariableResult for parameter ``name`` of a fit.

        Parameters
        ----------
        fit
            Fitted model exposing ``params``, ``tvalues``, ``pvalues``, ``bse``
            and ``conf_int()``.
        name
            Label of the parameter in the fit, e.g. ``"Intercept"`` or a column name.

        Returns
        -------
        The parameter estimate with its statistics and the interval reported by
        ``fit.conf_int()`` (called with its defaults), labelled as 0.95 confidence.
        """
        # Compute the interval table once and pick the row of this parameter;
        # label 0 holds the lower bound and label 1 the upper bound.
        bounds = fit.conf_int().loc[name]
        return cls(
            name=name,
            coef=fit.params[name],
            t_stat=fit.tvalues[name],
            p_value=fit.pvalues[name],
            std_err=fit.bse[name],
            confidence_interval=ConfidenceInterval(
                confidence=0.95,
                lower=bounds[0],
                upper=bounds[1],
            ),
        )
183
+
184
+
185
class MultiVariableRegressionResult(BaseModel):
    """Outcome of a multivariable regression: model statistics, parameters and data.

    Fields can be populated by attribute name or by their camelCase alias.
    """

    dependent_variable: str = Field(alias="dependentVariable")
    independent_variables: list[IndependentVariableResult] = Field(alias="independentVariables")
    r2: float = Field(ge=0, le=1, alias="rSquared")
    # NOTE(review): an adjusted R-squared can be negative for poor fits, which
    # the ge=0 bound would reject - confirm this is intended.
    r2_adj: float = Field(ge=0, le=1, alias="rSquaredAdjusted")
    f_stat: float = Field(ge=0, alias="fStat")
    prob_f_stat: float = Field(ge=0, le=1, alias="probFStat")
    intercept: IndependentVariableResult
    granularity: Granularity
    frame: TimeDataFrame

    model_config = ConfigDict(populate_by_name=True)

    @classmethod
    def from_mvlr(cls, mvlr: MultiVariableLinearRegression) -> "MultiVariableRegressionResult":
        """Build the result model from an analysed MultiVariableLinearRegression.

        The returned frame holds the columns of ``mvlr.data`` that are either a
        parameter of the selected fit or the dependent variable.
        """
        fit = mvlr.fit

        # Every fitted parameter except the intercept is an independent variable.
        variable_names = fit.params.keys().tolist()
        variable_names.remove("Intercept")
        independent_variables = [
            IndependentVariableResult.from_fit(fit, variable_name)
            for variable_name in variable_names
        ]

        # Drop every data column that is not part of the selected model.
        retained = [*variable_names, mvlr.y]
        discarded = [column for column in mvlr.data.columns.values if column not in retained]
        frame = mvlr.data.drop(discarded, axis=1)

        return cls(
            dependent_variable=mvlr.y,
            independent_variables=independent_variables,
            r2=fit.rsquared,
            r2_adj=fit.rsquared_adj,
            f_stat=fit.fvalue,
            prob_f_stat=fit.f_pvalue,
            intercept=IndependentVariableResult.from_fit(fit, "Intercept"),
            granularity=mvlr.granularity,
            frame=TimeDataFrame.from_pandas(frame),
        )
@@ -0,0 +1,450 @@
1
+ """Multi-variable linear regression based on statsmodels
2
+ and Ordinary Least Squares (ols)."""
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import statsmodels.formula.api as fm
7
+ from patsy import LookupFactor, ModelDesc, Term # pylint: disable=no-name-in-module
8
+ from statsmodels.sandbox.regression.predstd import wls_prediction_std
9
+
10
+ from openenergyid.enums import Granularity
11
+
12
+
13
class MultiVariableLinearRegression:
    """Multi-variable linear regression.

    Based on statsmodels and Ordinary Least Squares (ols).

    Pass a dataframe with the variable to be modelled y (dependent variable)
    and the possible independent variables x.
    Specify as string the name of the dependent variable, and optionally pass a list with names of
    independent variables to try
    (by default all other columns will be tried as independent variables).

    The analysis is based on a forward-selection approach: starting from a simple model,
    the model is iteratively refined and verified until no statistically relevant improvements
    can be obtained.
    Each model in the iteration loop is stored in the attribute self.list_of_fits.
    The selected model is self.fit (=pointer to the last element of self.list_of_fits).

    The dataframe can contain daily, weekly, monthly, yearly ... values. Each row is an instance.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        y: str,
        p_max: float = 0.05,
        list_of_x: list[str] | None = None,
        confint: float = 0.95,
        cross_validation: bool = False,
        allow_negative_predictions: bool = False,
        granularity: Granularity | None = None,
        single_use_exog_prefixes: list[str] | None = None,
        exogs__disallow_negative_coefficient: list[str] | None = None,
    ):
        """Parameters
        ----------
        data : pd.DataFrame
            Both independent variables (x) and dependent variable (y) as columns.
            A copy is stored, the passed frame is not modified.
        y : str
            Name of the dependent (endogenous) variable to model
        p_max : float (default=0.05)
            Acceptable p-value of the t-statistic for estimated parameters
        list_of_x : list of str (default=None)
            If None or empty, try to build a model with all columns in the dataframe
            If a list with column names is given, only try these columns as independent variables
        confint : float, default=0.95
            Two-sided confidence interval for predictions.
        cross_validation : bool, default=False
            If True, compute the model based on cross-validation (leave one out)
            Only possible if the df has less than 15 entries.
            Note: this will take much longer computation times!
        allow_negative_predictions : bool, default=False
            If True, allow predictions to be negative.
            For gas consumption or PV production, this is not physical
            so allow_negative_predictions should be False
        granularity : Granularity, default=None
            Granularity of the data. Is only used for the output of the model.
            If None, the granularity is not set.
        single_use_exog_prefixes : list of str, default=None
            List of variable prefixes that indicate a variable type that should only be used once.
            For example, if the list contains "HDD", only one of the columns "HDD1", "HDD2", "HDD3" etc.
            will be used as an independent variable.
            Once the best fit using a variable with a given prefix is found, the other variables with the same
            prefix will not be used as independent variables.
        exogs__disallow_negative_coefficient : list of str, default=None
            List of variable names for which the coefficient is not allowed to be negative.
            Only taken into account when cross_validation is False.

        Raises
        ------
        AssertionError
            If `y` is not a column of `data`.
        """
        self.data = data.copy()
        if y not in self.data.columns:
            raise AssertionError(
                f"The dependent variable {y} is not a column in the dataframe",
            )
        self.y = y

        self.p_max = p_max
        self.list_of_x = list_of_x or [x for x in self.data.columns if x != self.y]
        self.confint = confint
        self.cross_validation = cross_validation
        self.allow_negative_predictions = allow_negative_predictions
        self.granularity = granularity
        self.single_use_exog_prefixes = single_use_exog_prefixes
        self.exogs__disallow_negative_coefficient = exogs__disallow_negative_coefficient
        self._fit = None
        self._list_of_fits = []
        self.list_of_cverrors = []

    @property
    def fit(self) -> fm.ols:
        """The selected fitted model.

        Returns
        -------
        The fitted model.

        Raises
        ------
        UnboundLocalError: If `do_analysis()` has not been run before accessing `fit`.
        """
        if self._fit is None:
            raise UnboundLocalError(
                'Run "do_analysis()" first to fit a model to the data.',
            )
        return self._fit

    @property
    def list_of_fits(self) -> list[fm.ols]:
        """The list of fits generated by the analysis, in the order they were accepted.

        Raises
        ------
        UnboundLocalError: If the model has not been fitted yet.

        Returns
        -------
        list: The list of fits generated by the model.
        """
        if not self._list_of_fits:
            raise UnboundLocalError(
                'Run "do_analysis()" first to fit a model to the data.',
            )
        return self._list_of_fits

    def do_analysis(self):
        """Find the best model (fit) and create self.list_of_fits and self.fit"""
        if self.cross_validation:
            return self._do_analysis_cross_validation()
        return self._do_analysis_no_cross_validation()

    def _exclude_single_use_candidates(
        self, candidates: dict[str, Term], selected: str
    ) -> dict[str, Term]:
        """Return the candidates without those sharing a single-use prefix with `selected`."""
        for prefix in self.single_use_exog_prefixes or []:
            if selected.startswith(prefix):
                candidates = {
                    name: term for name, term in candidates.items() if not name.startswith(prefix)
                }
        return candidates

    def _leave_one_out_error(self, model_desc: ModelDesc) -> float:
        """Return the mean absolute leave-one-out prediction error for `model_desc`."""
        errors = []
        for i in self.data.index:
            # fit without row i, then predict row i with that fit
            df_ = self.data.drop(i, axis=0)
            fit = fm.ols(model_desc, data=df_).fit()
            cross_prediction = self._predict(fit=fit, data=self.data.loc[[i], :])
            errors.append(cross_prediction["predicted"] - cross_prediction[self.y])
        return np.mean(np.abs(np.array(errors)))

    def _do_analysis_no_cross_validation(self):
        """Find the best model (fit) by forward selection on the BIC.

        Creates self.list_of_fits and self.fit. In every round each remaining
        candidate is added to the previous model; the candidate giving the lowest
        BIC is kept, after which insignificant parameters are pruned.
        """
        response_term = [Term([LookupFactor(self.y)])]
        candidates = {x: Term([LookupFactor(x)]) for x in self.list_of_x}

        # first model is just the mean: the empty term is the intercept
        model_desc = ModelDesc(response_term, [Term([])])
        self._list_of_fits.append(fm.ols(model_desc, data=self.data).fit())

        # try to improve the model until no improvements can be found
        while candidates:
            # the first best_fit is the one from the previous round
            ref_fit = self._list_of_fits[-1]
            best_fit = ref_fit
            best_bic = best_fit.bic
            for x, term in candidates.items():
                model_desc = ModelDesc(
                    response_term,
                    ref_fit.model.formula.rhs_termlist + [term],
                )
                fit = fm.ols(model_desc, data=self.data).fit()

                # Skip candidates whose coefficient is negative while that is not allowed
                if (
                    self.exogs__disallow_negative_coefficient is not None
                    and x in self.exogs__disallow_negative_coefficient
                    and fit.params[x] < 0
                ):
                    continue

                if fit.bic < best_bic:
                    best_bic = fit.bic
                    best_fit = fit
                    best_x = x

            # Sometimes, the obtained fit may be better, but contains insignificant parameters.
            # Correct the fit by removing the insignificant parameters and estimate again
            best_fit = self._prune(best_fit, p_max=self.p_max)

            # if best_fit has the same number of terms as ref_fit, exit
            if len(best_fit.model.formula.rhs_termlist) == len(
                ref_fit.model.formula.rhs_termlist,
            ):
                break

            self._list_of_fits.append(best_fit)
            candidates.pop(best_x)
            candidates = self._exclude_single_use_candidates(candidates, best_x)

        self._fit = self._list_of_fits[-1]

    def _do_analysis_cross_validation(self):
        """Find the best model (fit) based on cross-validation (leave one out).

        Creates self.list_of_fits, self.list_of_cverrors and self.fit.
        No pruning and no negative-coefficient check is done in this mode.

        Raises
        ------
        AssertionError
            If the data contains 15 rows or more.
        """
        # Explicit raise instead of `assert`, so the guard survives `python -O`.
        if not len(self.data) < 15:
            raise AssertionError(
                "Cross-validation is not implemented if your sample contains more than 15 datapoints"
            )

        # initialization: first model is the mean, but compute cv correctly.
        response_term = [Term([LookupFactor(self.y)])]
        model_desc = ModelDesc(response_term, [Term([])])  # empty term is the intercept
        initial_error = self._leave_one_out_error(model_desc)

        self._list_of_fits = [fm.ols(model_desc, data=self.data).fit()]
        self.list_of_cverrors = [initial_error]

        # try to improve the model until no improvements can be found
        candidates = {x: Term([LookupFactor(x)]) for x in self.list_of_x}
        while candidates:
            # try each candidate and overwrite if we find a better one
            # at the end of iteration (and not earlier), save the best of the iteration
            better_model_found = False
            best = dict(fit=self._list_of_fits[-1], cverror=self.list_of_cverrors[-1])
            for x, term in candidates.items():
                model_desc = ModelDesc(
                    response_term,
                    self._list_of_fits[-1].model.formula.rhs_termlist + [term],
                )
                cverror = self._leave_one_out_error(model_desc)
                if cverror < best["cverror"]:
                    # better model, keep it: reidentify using all the datapoints
                    best["fit"] = fm.ols(model_desc, data=self.data).fit()
                    best["cverror"] = cverror
                    better_model_found = True
                    best_x = x

            if not better_model_found:
                break

            self._list_of_fits.append(best["fit"])
            self.list_of_cverrors.append(best["cverror"])

            # next iteration with the found exog removed
            candidates.pop(best_x)
            candidates = self._exclude_single_use_candidates(candidates, best_x)

        self._fit = self._list_of_fits[-1]

    @staticmethod
    def _insignificant_parameters(fit: fm.ols, p_max: float) -> list[str]:
        """Return the parameters of `fit`, intercept excluded, with a p-value above `p_max`."""
        names = fit.pvalues.where(fit.pvalues > p_max).dropna().index.tolist()
        if "Intercept" in names:
            names.remove("Intercept")
        return names

    @staticmethod
    def _without_term(x: str, model_desc: ModelDesc) -> ModelDesc:
        """Return a copy of `model_desc` without the term for variable `x`."""
        rhs_termlist = [
            t
            for t in model_desc.rhs_termlist
            # a term without factors is the intercept, which is always kept
            if not t.factors or x != t.factors[0]._varname  # pylint: disable=protected-access
        ]
        return ModelDesc(model_desc.lhs_termlist, rhs_termlist)

    def _prune(self, fit: fm.ols, p_max: float) -> fm.ols:
        """If the fit contains statistically insignificant parameters, remove them.
        Returns a pruned fit where all parameters have p-values of the t-statistic below p_max

        Parameters are removed one at a time and the model is estimated again on
        self.data after each removal. The intercept is never removed.

        Parameters
        ----------
        fit: fm.ols fit object
            Can contain insignificant parameters
        p_max : float
            Maximum allowed probability of the t-statistic

        Returns
        -------
        fit: fm.ols fit object
            Won't contain any insignificant parameters

        """
        corrected_model_desc = ModelDesc(
            fit.model.formula.lhs_termlist[:],
            fit.model.formula.rhs_termlist[:],
        )
        pars_to_prune = self._insignificant_parameters(fit, p_max)
        while pars_to_prune:
            corrected_model_desc = self._without_term(pars_to_prune[0], corrected_model_desc)
            fit = fm.ols(corrected_model_desc, data=self.data).fit()
            pars_to_prune = self._insignificant_parameters(fit, p_max)
        return fit

    @staticmethod
    def find_best_rsquared(list_of_fits: list[fm.ols]) -> fm.ols:
        """Return the best fit, based on rsquared (highest value)"""
        res = sorted(list_of_fits, key=lambda x: x.rsquared)
        return res[-1]

    @staticmethod
    def find_best_akaike(list_of_fits: list[fm.ols]) -> fm.ols:
        """Return the best fit, based on Akaike information criterion (lowest value)"""
        res = sorted(list_of_fits, key=lambda x: x.aic)
        return res[0]

    @staticmethod
    def find_best_bic(list_of_fits: list[fm.ols]) -> fm.ols:
        """Return the best fit, based on Bayesian information criterion (lowest value)"""
        res = sorted(list_of_fits, key=lambda x: x.bic)
        return res[0]

    def _predict(self, fit: fm.ols, data: pd.DataFrame) -> pd.DataFrame:
        """Return a df with predictions and confidence interval

        Notes
        -----
        The df will contain the following columns:
        - 'predicted': the model output
        - 'interval_u', 'interval_l': upper and lower confidence bounds.
        The result will depend on the following attributes of self:
        confint : float (default=0.95)
            Confidence level for two-sided hypothesis
        allow_negative_predictions : bool (default=False)
            If False, correct negative predictions to zero
            (typically for energy consumption predictions).
            The confidence bounds are not corrected.

        Parameters
        ----------
        fit : Statsmodels fit
        data : pandas DataFrame
            Data to predict on; it is not modified.

        Returns
        -------
        result : pandas DataFrame
            Copy of data with additional columns 'predicted', 'interval_u' and 'interval_l'
        """
        result = data.copy()
        # temporary constant column, needed to select the exogenous variables by name
        if "Intercept" in fit.model.exog_names:
            result["Intercept"] = 1.0
        result["predicted"] = fit.predict(result)
        if not self.allow_negative_predictions:
            result.loc[result["predicted"] < 0, "predicted"] = 0

        _prstd, interval_l, interval_u = wls_prediction_std(
            fit,
            result[fit.model.exog_names],
            alpha=1 - self.confint,
        )
        result["interval_l"] = interval_l
        result["interval_u"] = interval_u

        if "Intercept" in result:
            result.drop(labels=["Intercept"], axis=1, inplace=True)

        return result

    def add_prediction(self):
        """Add predictions and confidence interval to self.data
        self.data will contain the following columns:
        - 'predicted': the model output
        - 'interval_u', 'interval_l': upper and lower confidence bounds.

        Parameters
        ----------
        None, but the result depends on the following attributes of self:
        confint : float (default=0.95)
            Confidence level for two-sided hypothesis
        allow_negative_predictions : bool (default=False)
            If False, correct negative predictions to zero
            (typically for energy consumption predictions)

        Returns
        -------
        Nothing, adds columns to self.data
        """
        self.data = self._predict(fit=self.fit, data=self.data)

    def validate(
        self, min_rsquared: float = 0.75, max_f_pvalue: float = 0.05, max_pvalues: float = 0.05
    ) -> bool:
        """Checks if the model is valid.

        The selected fit is valid when its adjusted R-squared is at least
        `min_rsquared`, the p-value of its F-statistic is at most `max_f_pvalue`
        and the p-value of every parameter except the intercept is at most `max_pvalues`.

        Returns
        -------
        bool: True if the model is valid, False otherwise.
        """
        fit = self.fit
        if fit.rsquared_adj < min_rsquared:
            return False

        if fit.f_pvalue > max_f_pvalue:
            return False

        param_keys = fit.pvalues.keys().tolist()
        param_keys.remove("Intercept")
        return not any(fit.pvalues[k] > max_pvalues for k in param_keys)
@@ -0,0 +1,8 @@
1
+ from .main import (
2
+ PVSimulationInput,
3
+ PVSimulationSummary,
4
+ apply_simulation,
5
+ get_simulator,
6
+ )
7
+
8
+ __all__ = ["PVSimulationInput", "get_simulator", "apply_simulation", "PVSimulationSummary"]