openenergyid-0.1.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,79 @@
+"""Data models for energy sharing."""
+
+from enum import Enum
+from typing import Annotated, Any
+
+from pydantic import BaseModel, Field, confloat
+import pandas as pd
+
+from openenergyid import TimeDataFrame
+from .data_formatting import create_multi_index_input_frame
+from .const import NET_INJECTION, NET_OFFTAKE, SHARED_ENERGY
+
+
+class CalculationMethod(Enum):
+    """Calculation method for energy sharing."""
+
+    FIXED = "Fixed"
+    RELATIVE = "Relative"
+    OPTIMAL = "Optimal"
+
+
+class KeyInput(TimeDataFrame):
+    """Energy Sharing Keys."""
+
+    data: Annotated[
+        list[list[confloat(ge=0.0, le=1.0)]],  # type: ignore
+        Field(
+            description="Key data, column per participant. "
+            "Must be between 0 and 1. "
+            "Each row must sum to 1."
+        ),
+    ]
+
+    def model_post_init(self, __context: Any) -> None:
+        """Post-initialization validation."""
+        for row in self.data:
+            if round(sum(row), 3) != 1.0:
+                raise ValueError("Each row must sum to 1.")
+        return super().model_post_init(__context)
+
+
+class EnergySharingInput(BaseModel):
+    """Input data for energy sharing."""
+
+    gross_injection: Annotated[
+        TimeDataFrame,
+        Field(alias="grossInjection", description="Gross injection data, column per participant"),
+    ]
+    gross_offtake: Annotated[
+        TimeDataFrame,
+        Field(alias="grossOfftake", description="Gross offtake data, column per participant"),
+    ]
+    key: KeyInput
+
+    def to_pandas(self) -> pd.DataFrame:
+        """Return the data as a combined DataFrame."""
+        df = create_multi_index_input_frame(
+            gross_injection=self.gross_injection.to_pandas(),
+            gross_offtake=self.gross_offtake.to_pandas(),
+            key=self.key.to_pandas(),
+        )
+        return df
+
+
+class EnergySharingOutput(BaseModel):
+    """Output data for energy sharing."""
+
+    net_injection: TimeDataFrame = Field(alias="netInjection")
+    net_offtake: TimeDataFrame = Field(alias="netOfftake")
+    shared_energy: TimeDataFrame = Field(alias="sharedEnergy")
+
+    @classmethod
+    def from_calculation_result(cls, result: pd.DataFrame) -> "EnergySharingOutput":
+        """Create an output model from a calculation result."""
+        return cls.model_construct(
+            net_injection=TimeDataFrame.from_pandas(result[NET_INJECTION]),
+            net_offtake=TimeDataFrame.from_pandas(result[NET_OFFTAKE]),
+            shared_energy=TimeDataFrame.from_pandas(result[SHARED_ENERGY]),
+        )
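For orientation, here is a minimal usage sketch of the input models above. It assumes the TimeDataFrame fields (index/columns/data) defined later in openenergyid/models.py; the energy-sharing module path is not shown in this diff, so the import below is an assumption.

```python
# Illustration only: the energysharing module path is assumed, not taken from this diff.
import datetime as dt

from openenergyid import TimeDataFrame
from openenergyid.energysharing.models import EnergySharingInput, KeyInput  # assumed path

index = [dt.datetime(2024, 1, 1, 0, 0), dt.datetime(2024, 1, 1, 0, 15)]
participants = ["A", "B"]

payload = EnergySharingInput(
    grossInjection=TimeDataFrame(index=index, columns=participants, data=[[1.0, 0.0], [0.5, 0.2]]),
    grossOfftake=TimeDataFrame(index=index, columns=participants, data=[[0.0, 0.8], [0.1, 0.6]]),
    key=KeyInput(index=index, columns=participants, data=[[0.6, 0.4], [0.5, 0.5]]),  # rows sum to 1
)
combined = payload.to_pandas()  # MultiIndex frame built by create_multi_index_input_frame
```

A KeyInput row that does not sum to 1 (within three decimals) raises a ValueError in model_post_init.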
openenergyid/enums.py ADDED
@@ -0,0 +1,16 @@
+"""Static enums for Open Energy ID."""
+
+from enum import Enum
+
+
+class Granularity(Enum):
+    """Granularity of a time series."""
+
+    P1Y = "P1Y"  # 1 year
+    P1M = "P1M"  # 1 month
+    P7D = "P7D"  # 7 days
+    P1D = "P1D"  # 1 day
+    PT1H = "PT1H"  # 1 hour
+    PT15M = "PT15M"  # 15 minutes
+    PT5M = "PT5M"  # 5 minutes
+    PT1M = "PT1M"  # 1 minute
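The enum values are ISO 8601 duration strings, so a granularity arriving as a plain string (for example from a JSON payload) can be resolved by value; a small illustration:

```python
from openenergyid.enums import Granularity

granularity = Granularity("PT15M")  # look up a member by its ISO 8601 duration value
assert granularity is Granularity.PT15M
```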
openenergyid/models.py ADDED
@@ -0,0 +1,164 @@
+"""Data models for the Open Energy ID."""
+
+import datetime as dt
+from typing import overload
+
+from typing import Self
+
+import pandas as pd
+from pydantic import BaseModel
+import polars as pl
+
+
+class TimeSeriesBase(BaseModel):
+    """Pydantic base model for time series data."""
+
+    index: list[dt.datetime]
+
+    @classmethod
+    def from_pandas(cls, data: pd.Series | pd.DataFrame) -> Self:
+        """Create from a Pandas Object."""
+        raise NotImplementedError
+
+    def to_pandas(self, timezone: str = "UTC") -> pd.Series | pd.DataFrame:
+        """Convert to a Pandas Object."""
+        raise NotImplementedError
+
+    @overload
+    def to_json(self, path: None = None, **kwargs) -> str:
+        """Dump to a JSON string."""
+
+    @overload
+    def to_json(self, path: str, **kwargs) -> None:
+        """Dump to a JSON file."""
+
+    def to_json(self, path: str | None = None, **kwargs) -> str | None:
+        """Dump to a JSON string or file."""
+        if path is None:
+            return self.model_dump_json(**kwargs)
+        encoding = kwargs.pop("encoding", "UTF-8")
+        with open(path, "w", encoding=encoding) as file:
+            file.write(self.model_dump_json(**kwargs))
+        return None
+
+    @overload
+    @classmethod
+    def from_json(cls, string: str, **kwargs) -> Self:
+        """Load from a JSON string."""
+
+    @overload
+    @classmethod
+    def from_json(cls, *, path: str, **kwargs) -> Self:
+        """Load from a JSON file."""
+
+    @classmethod
+    def from_json(cls, string: str | None = None, path: str | None = None, **kwargs) -> Self:
+        """Load from a JSON file or string."""
+        if string:
+            return cls.model_validate_json(string, **kwargs)
+        if path:
+            encoding = kwargs.pop("encoding", "UTF-8")
+            with open(path, encoding=encoding) as file:
+                return cls.model_validate_json(file.read(), **kwargs)
+        raise ValueError("Either string or path must be provided.")
+
+
+class TimeSeries(TimeSeriesBase):
+    """
+    Represents time series data.
+
+    Attributes:
+        name (str | None): The name of the time series.
+        data (list[float | None]): The data points of the time series.
+
+    Methods:
+        from_pandas(cls, data: pd.Series) -> Self:
+            Create a TimeSeries object from a Pandas Series.
+        to_pandas(self, timezone: str = "UTC") -> pd.Series:
+            Convert the TimeSeries object to a Pandas Series.
+        from_polars(cls, data: pl.DataFrame | pl.LazyFrame) -> Self:
+            Create a TimeSeries object from Polars data.
+        to_polars(self, timezone: str = "UTC") -> pl.LazyFrame:
+            Convert the TimeSeries object to a Polars LazyFrame.
+    """
+
+    name: str | None = None
+    data: list[float | None]
+
+    @classmethod
+    def from_pandas(cls, data: pd.Series) -> Self:
+        """Create from a Pandas Series."""
+        return cls(name=str(data.name), data=data.tolist(), index=data.index.tolist())
+
+    def to_pandas(self, timezone: str = "UTC") -> pd.Series:
+        """Convert to a Pandas Series."""
+        series = pd.Series(self.data, name=self.name, index=self.index)
+        series.index = pd.to_datetime(series.index, utc=True)
+        return series.tz_convert(timezone)
+
+    @classmethod
+    def from_polars(cls, data: pl.DataFrame | pl.LazyFrame) -> Self:
+        """Create from Polars data."""
+        # Always work with a DataFrame
+        df = data.collect() if isinstance(data, pl.LazyFrame) else data
+
+        if len(df.columns) != 2:
+            raise ValueError("Must contain exactly two columns: timestamp and value")
+
+        value_col = [col for col in df.columns if col != "timestamp"][0]
+        return cls(
+            name=value_col,
+            data=df[value_col].cast(pl.Float64).to_list(),  # Ensure float type
+            index=df["timestamp"].cast(pl.Datetime).dt.convert_time_zone("UTC").to_list(),
+        )
+
+    def to_polars(self, timezone: str = "UTC") -> pl.LazyFrame:
+        """Convert to a Polars LazyFrame."""
+        # Always return a LazyFrame, as specified in the return type
+        df = pl.DataFrame(
+            {
+                "timestamp": pl.Series(self.index, dtype=pl.Datetime).dt.convert_time_zone(
+                    timezone
+                ),
+                "total" if self.name is None else self.name: pl.Series(self.data, dtype=pl.Float64),
+            }
+        )
+        return df.lazy()
+
+
+class TimeDataFrame(TimeSeriesBase):
+    """Time series data with multiple columns."""
+
+    columns: list[str]
+    data: list[list[float | None]]
+
+    @classmethod
+    def from_pandas(cls, data: pd.DataFrame) -> Self:
+        """Create from a Pandas DataFrame."""
+        # Cast values to float | None
+        values = [
+            [float(x) if pd.notnull(x) else None for x in row] for row in data.values.tolist()
+        ]
+        return cls(columns=data.columns.tolist(), data=values, index=data.index.tolist())
+
+    def to_pandas(self, timezone: str = "UTC") -> pd.DataFrame:
+        """Convert to a Pandas DataFrame."""
+        frame = pd.DataFrame(self.data, columns=self.columns, index=self.index)
+        frame.index = pd.to_datetime(frame.index, utc=True)
+        return frame.tz_convert(timezone)
+
+    @classmethod
+    def from_timeseries(cls, data: list[TimeSeries]) -> Self:
+        """Create from a list of TimeSeries objects."""
+        return cls(
+            columns=[series.name or "" for series in data],  # Handle None names
+            # Transpose: each TimeSeries holds one column, while `data` is stored row-wise
+            data=[list(row) for row in zip(*(series.data for series in data))],
+            index=data[0].index,
+        )
+
+    def to_timeseries(self) -> list[TimeSeries]:
+        """Convert to a list of TimeSeries objects."""
+        return [
+            TimeSeries(name=col, data=[row[i] for row in self.data], index=self.index)
+            for i, col in enumerate(self.columns)
+        ]
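A quick round-trip sketch of the pandas and JSON helpers defined above (a usage illustration, not part of the package):

```python
import pandas as pd

from openenergyid.models import TimeSeries

idx = pd.date_range("2024-01-01", periods=3, freq="h", tz="UTC")
ts = TimeSeries.from_pandas(pd.Series([1.0, 2.0, 3.0], index=idx, name="consumption"))

payload = ts.to_json()                    # JSON string via model_dump_json
restored = TimeSeries.from_json(payload)  # parse it back into a model
series = restored.to_pandas(timezone="Europe/Brussels")  # tz-aware pandas Series
print(series)
```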
@@ -0,0 +1,19 @@
+"""Multi-variable linear regression (MVLR) module."""
+
+from .main import find_best_mvlr
+from .models import (
+    IndependentVariableInput,
+    MultiVariableRegressionInput,
+    MultiVariableRegressionResult,
+    ValidationParameters,
+    IndependentVariableResult,
+)
+
+__all__ = [
+    "find_best_mvlr",
+    "IndependentVariableInput",
+    "MultiVariableRegressionInput",
+    "MultiVariableRegressionResult",
+    "ValidationParameters",
+    "IndependentVariableResult",
+]
@@ -0,0 +1,30 @@
+"""Miscellaneous helper functions for the MVLR app."""
+
+import pandas as pd
+
+from openenergyid.enums import Granularity
+
+pandas_granularity_map = {Granularity.P7D: "W-MON", Granularity.P1M: "MS", Granularity.P1D: "D"}
+
+
+def resample_input_data(
+    data: pd.DataFrame,
+    granularity: Granularity,
+    aggregation_methods: dict | None = None,
+) -> pd.DataFrame:
+    """Resample input data to the given granularity.
+
+    By default, the data is summed up for each column.
+    Provide a dictionary of aggregation methods to override this behaviour.
+    """
+    if granularity not in pandas_granularity_map:
+        raise NotImplementedError("Granularity not implemented.")
+    aggregation_methods = aggregation_methods.copy() if aggregation_methods else {}
+
+    for column in data.columns:
+        if column not in aggregation_methods:
+            aggregation_methods[column] = "sum"
+
+    return data.resample(rule=pandas_granularity_map[granularity]).agg(
+        aggregation_methods,
+    )
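As a usage sketch (the module path openenergyid.mvlr.helpers is assumed, since the file name is not shown in this diff): resampling quarter-hourly data to daily values while averaging the temperature column instead of summing it.

```python
import numpy as np
import pandas as pd

from openenergyid.enums import Granularity
from openenergyid.mvlr.helpers import resample_input_data  # assumed module path

idx = pd.date_range("2024-01-01", periods=96, freq="15min", tz="UTC")
frame = pd.DataFrame(
    {"consumption": np.random.rand(96), "temperatureEquivalent": 10 * np.random.rand(96)},
    index=idx,
)

daily = resample_input_data(
    frame,
    granularity=Granularity.P1D,
    aggregation_methods={"temperatureEquivalent": "mean"},  # all other columns default to "sum"
)
print(daily)
```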
@@ -0,0 +1,34 @@
+"""Main module for the MultiVariableLinearRegression class."""
+
+from .models import MultiVariableRegressionInput, MultiVariableRegressionResult
+from .helpers import resample_input_data
+from .mvlr import MultiVariableLinearRegression
+
+
+def find_best_mvlr(
+    data: MultiVariableRegressionInput,
+) -> MultiVariableRegressionResult:
+    """Cycle through the given granularities and return the first validated model."""
+    best_rsquared = 0
+    for granularity in data.granularities:
+        frame = data.data_frame()
+        frame = resample_input_data(data=frame, granularity=granularity)
+        mvlr = MultiVariableLinearRegression(
+            data=frame,
+            y=data.dependent_variable,
+            granularity=granularity,
+            allow_negative_predictions=data.allow_negative_predictions,
+            single_use_exog_prefixes=data.single_use_exog_prefixes or [],
+            exogs__disallow_negative_coefficient=data.get_disallowed_negative_coefficients(),
+        )
+        mvlr.do_analysis()
+        if mvlr.validate(
+            min_rsquared=data.validation_parameters.rsquared,
+            max_f_pvalue=data.validation_parameters.f_pvalue,
+            max_pvalues=data.validation_parameters.pvalues,
+        ):
+            return MultiVariableRegressionResult.from_mvlr(mvlr)
+        best_rsquared = max(best_rsquared, mvlr.fit.rsquared_adj)
+    raise ValueError(
+        f"No valid model found. Best R²: {best_rsquared:.3f} (need ≥{data.validation_parameters.rsquared})"
+    )
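A hedged end-to-end sketch of driving find_best_mvlr. The package path openenergyid.mvlr and the synthetic data are assumptions, and whether a given dataset passes the default validation thresholds depends entirely on the fit.

```python
import numpy as np
import pandas as pd

from openenergyid.enums import Granularity
from openenergyid.models import TimeDataFrame
from openenergyid.mvlr import MultiVariableRegressionInput, find_best_mvlr  # assumed package path

# Synthetic daily data with a heating-degree-day signal (illustration only).
rng = np.random.default_rng(0)
idx = pd.date_range("2023-01-01", periods=365, freq="D", tz="Europe/Brussels")
temperature = rng.uniform(-5, 25, size=365)
consumption = 40 + 3.0 * np.clip(16.5 - temperature, 0, None) + rng.normal(0, 2, size=365)
frame = pd.DataFrame({"consumption": consumption, "temperatureEquivalent": temperature}, index=idx)

mvlr_input = MultiVariableRegressionInput(
    timeZone="Europe/Brussels",
    dependentVariable="consumption",
    independentVariables=[{"name": "temperatureEquivalent", "variants": ["HDD_16.5"]}],
    frame=TimeDataFrame.from_pandas(frame),
    granularities=[Granularity.P7D, Granularity.P1M],
)

result = find_best_mvlr(mvlr_input)  # raises ValueError if no granularity yields a valid model
print(result.model_dump_json(by_alias=True, indent=2))
```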
@@ -0,0 +1,228 @@
+"""Models for multivariable linear regression."""
+
+from typing import Any
+
+import pandas as pd
+from pydantic import BaseModel, Field, ConfigDict
+import statsmodels.formula.api as fm
+
+from openenergyid.enums import Granularity
+from openenergyid.models import TimeDataFrame
+
+from .mvlr import MultiVariableLinearRegression
+
+
+COLUMN_TEMPERATUREEQUIVALENT = "temperatureEquivalent"
+
+
+######################
+# MVLR Input Models #
+######################
+
+
+class ValidationParameters(BaseModel):
+    """Parameters for validation of a multivariable linear regression model."""
+
+    rsquared: float = Field(
+        0.75, ge=0, le=1, description="Minimum acceptable value for the adjusted R-squared"
+    )
+    f_pvalue: float = Field(
+        0.05, ge=0, le=1, description="Maximum acceptable p-value for the F-statistic"
+    )
+    pvalues: float = Field(
+        0.05, ge=0, le=1, description="Maximum acceptable value for the p-values of the t-statistic"
+    )
+
+
+class IndependentVariableInput(BaseModel):
+    """
+    Independent variable.
+
+    Has to correspond to a column in the data frame.
+    """
+
+    name: str = Field(
+        description="Name of the independent variable. "
+        "If the name is `temperatureEquivalent`, "
+        "it will be unpacked into columns according to the variants."
+    )
+    variants: list[str] | None = Field(
+        default=None,
+        description="Variants of the `temperatureEquivalent` independent variable. "
+        "Eg. `HDD_16.5` will be Heating Degree Days with a base temperature of 16.5°C, "
+        "`CDD_0` will be Cooling Degree Days with a base temperature of 0°C.",
+    )
+    allow_negative_coefficient: bool = Field(
+        default=True,
+        alias="allowNegativeCoefficient",
+        description="Whether the coefficient can be negative.",
+    )
+
+
+class MultiVariableRegressionInput(BaseModel):
+    """Multi-variable regression input."""
+
+    timezone: str = Field(alias="timeZone")
+    independent_variables: list[IndependentVariableInput] = Field(
+        alias="independentVariables", min_length=1
+    )
+    dependent_variable: str = Field(alias="dependentVariable")
+    frame: TimeDataFrame
+    granularities: list[Granularity]
+    allow_negative_predictions: bool = Field(alias="allowNegativePredictions", default=False)
+    validation_parameters: ValidationParameters = Field(
+        alias="validationParameters", default=ValidationParameters()
+    )
+    single_use_exog_prefixes: list[str] | None = Field(
+        # default=["HDD", "CDD", "FDD"],
+        default=None,
+        alias="singleUseExogPrefixes",
+        description="List of prefixes to be used as single-use exogenous variables.",
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        """Post init hook."""
+        # Check if all independent variables are present in the data frame
+        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
+            if iv.name not in self.frame.columns:
+                raise ValueError(f"Independent variable {iv.name} not found in the data frame.")
+
+        return super().model_post_init(__context)
+
+    def _data_frame(self) -> pd.DataFrame:
+        """Convert the data to a pandas DataFrame."""
+        return self.frame.to_pandas(timezone=self.timezone)
+
+    def data_frame(self) -> pd.DataFrame:
+        """
+        Return the data frame ready for analysis.
+
+        Unpacks degree days and removes unnecessary columns.
+
+        If an independent variable named `temperatureEquivalent` is present,
+        it will be unpacked into columns according to the variants.
+        Eg. Variant "HDD_16.5" will be Heating Degree Days
+        with a base temperature of 16.5°C,
+        "CDD_0" will be Cooling Degree Days with a base temperature of 0°C.
+        """
+        frame = self._data_frame()
+        columns_to_retain = [self.dependent_variable]
+        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
+            if iv.name == COLUMN_TEMPERATUREEQUIVALENT and iv.variants is not None:
+                for variant in iv.variants:
+                    prefix, base_temperature = variant.split("_")
+                    if prefix == "CDD":
+                        frame[variant] = frame[COLUMN_TEMPERATUREEQUIVALENT] - float(
+                            base_temperature
+                        )
+                    else:
+                        frame[variant] = (
+                            float(base_temperature) - frame[COLUMN_TEMPERATUREEQUIVALENT]
+                        )
+                    frame[variant] = frame[variant].clip(lower=0)
+                    columns_to_retain.append(variant)
+                frame.drop(columns=[COLUMN_TEMPERATUREEQUIVALENT], inplace=True)
+            else:
+                columns_to_retain.append(iv.name)
+
+        frame = frame[columns_to_retain].copy()
+
+        return frame
+
+    def get_disallowed_negative_coefficients(self) -> list[str]:
+        """Get independent variables that are not allowed to have a negative coefficient."""
+        result = []
+        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
+            if iv.name == COLUMN_TEMPERATUREEQUIVALENT and iv.variants is not None:
+                if not iv.allow_negative_coefficient:
+                    result.extend(iv.variants)
+            elif not iv.allow_negative_coefficient:
+                result.append(iv.name)
+        return result
+
+
+######################
+# MVLR Result Models #
+######################
+
+
+class ConfidenceInterval(BaseModel):
+    """Confidence interval for a coefficient."""
+
+    confidence: float = Field(ge=0, le=1)
+    lower: float
+    upper: float
+
+
+class IndependentVariableResult(BaseModel):
+    """Independent variable for a multivariable linear regression model."""
+
+    name: str
+    coef: float
+    t_stat: float | None = Field(default=None, alias="tStat")
+    p_value: float | None = Field(ge=0, le=1, default=None, alias="pValue")
+    std_err: float | None = Field(default=None, alias="stdErr")
+    confidence_interval: ConfidenceInterval | None = Field(default=None, alias="confidenceInterval")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+    @classmethod
+    def from_fit(cls, fit: fm.ols, name: str) -> "IndependentVariableResult":
+        """Create an IndependentVariableResult from a fit."""
+        return cls(
+            name=name,
+            coef=fit.params[name],
+            t_stat=fit.tvalues[name],
+            p_value=fit.pvalues[name],
+            std_err=fit.bse[name],
+            confidence_interval=ConfidenceInterval(
+                confidence=0.95,
+                lower=fit.conf_int().transpose()[name][0],
+                upper=fit.conf_int().transpose()[name][1],
+            ),
+        )
+
+
+class MultiVariableRegressionResult(BaseModel):
+    """Result of a multivariable regression model."""
+
+    dependent_variable: str = Field(alias="dependentVariable")
+    independent_variables: list[IndependentVariableResult] = Field(alias="independentVariables")
+    r2: float = Field(ge=0, le=1, alias="rSquared")
+    r2_adj: float = Field(ge=0, le=1, alias="rSquaredAdjusted")
+    f_stat: float = Field(ge=0, alias="fStat")
+    prob_f_stat: float = Field(ge=0, le=1, alias="probFStat")
+    intercept: IndependentVariableResult
+    granularity: Granularity
+    frame: TimeDataFrame
+
+    model_config = ConfigDict(populate_by_name=True)
+
+    @classmethod
+    def from_mvlr(cls, mvlr: MultiVariableLinearRegression) -> "MultiVariableRegressionResult":
+        """Create a MultiVariableRegressionResult from a MultiVariableLinearRegression."""
+
+        # Get independent variables
+        param_keys = mvlr.fit.params.keys().tolist()
+        param_keys.remove("Intercept")
+        independent_variables = []
+        for k in param_keys:
+            independent_variables.append(IndependentVariableResult.from_fit(mvlr.fit, k))
+
+        # Create resulting TimeSeries
+        cols_to_keep = list(param_keys)
+        cols_to_keep.append(mvlr.y)
+        cols_to_remove = list(filter(lambda v: v not in cols_to_keep, mvlr.data.columns.values))
+        frame = mvlr.data.drop(cols_to_remove, axis=1)
+
+        return cls(
+            dependent_variable=mvlr.y,
+            independent_variables=independent_variables,
+            r2=mvlr.fit.rsquared,
+            r2_adj=mvlr.fit.rsquared_adj,
+            f_stat=mvlr.fit.fvalue,
+            prob_f_stat=mvlr.fit.f_pvalue,
+            intercept=IndependentVariableResult.from_fit(mvlr.fit, "Intercept"),
+            granularity=mvlr.granularity,
+            frame=TimeDataFrame.from_pandas(frame),
+        )
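To make the `temperatureEquivalent` unpacking in data_frame() concrete, this is the degree-day arithmetic it applies per variant, written out independently as a small worked example:

```python
import pandas as pd

temperature_equivalent = pd.Series([2.0, 10.0, 18.0, 25.0])

# "HDD_16.5": heating degree days with a base temperature of 16.5 °C
hdd_16_5 = (16.5 - temperature_equivalent).clip(lower=0)  # -> 14.5, 6.5, 0.0, 0.0

# "CDD_0": cooling degree days with a base temperature of 0 °C
cdd_0 = (temperature_equivalent - 0.0).clip(lower=0)      # -> 2.0, 10.0, 18.0, 25.0
```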