spotforecast2 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. spotforecast2/.DS_Store +0 -0
  2. spotforecast2/__init__.py +2 -0
  3. spotforecast2/data/__init__.py +0 -0
  4. spotforecast2/data/data.py +130 -0
  5. spotforecast2/data/fetch_data.py +209 -0
  6. spotforecast2/exceptions.py +681 -0
  7. spotforecast2/forecaster/.DS_Store +0 -0
  8. spotforecast2/forecaster/__init__.py +7 -0
  9. spotforecast2/forecaster/base.py +448 -0
  10. spotforecast2/forecaster/metrics.py +527 -0
  11. spotforecast2/forecaster/recursive/__init__.py +4 -0
  12. spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
  13. spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
  14. spotforecast2/forecaster/recursive/_warnings.py +15 -0
  15. spotforecast2/forecaster/utils.py +954 -0
  16. spotforecast2/model_selection/__init__.py +5 -0
  17. spotforecast2/model_selection/bayesian_search.py +453 -0
  18. spotforecast2/model_selection/grid_search.py +314 -0
  19. spotforecast2/model_selection/random_search.py +151 -0
  20. spotforecast2/model_selection/split_base.py +357 -0
  21. spotforecast2/model_selection/split_one_step.py +245 -0
  22. spotforecast2/model_selection/split_ts_cv.py +634 -0
  23. spotforecast2/model_selection/utils_common.py +718 -0
  24. spotforecast2/model_selection/utils_metrics.py +103 -0
  25. spotforecast2/model_selection/validation.py +685 -0
  26. spotforecast2/preprocessing/__init__.py +30 -0
  27. spotforecast2/preprocessing/_binner.py +378 -0
  28. spotforecast2/preprocessing/_common.py +123 -0
  29. spotforecast2/preprocessing/_differentiator.py +123 -0
  30. spotforecast2/preprocessing/_rolling.py +136 -0
  31. spotforecast2/preprocessing/curate_data.py +254 -0
  32. spotforecast2/preprocessing/imputation.py +92 -0
  33. spotforecast2/preprocessing/outlier.py +114 -0
  34. spotforecast2/preprocessing/split.py +139 -0
  35. spotforecast2/py.typed +0 -0
  36. spotforecast2/utils/__init__.py +43 -0
  37. spotforecast2/utils/convert_to_utc.py +44 -0
  38. spotforecast2/utils/data_transform.py +208 -0
  39. spotforecast2/utils/forecaster_config.py +344 -0
  40. spotforecast2/utils/generate_holiday.py +70 -0
  41. spotforecast2/utils/validation.py +569 -0
  42. spotforecast2/weather/__init__.py +0 -0
  43. spotforecast2/weather/weather_client.py +288 -0
  44. spotforecast2-0.0.1.dist-info/METADATA +47 -0
  45. spotforecast2-0.0.1.dist-info/RECORD +46 -0
  46. spotforecast2-0.0.1.dist-info/WHEEL +4 -0
Binary file
@@ -0,0 +1,2 @@
1
def hello() -> str:
    """Return the fixed greeting string of the spotforecast2 package."""
    greeting = "Hello from spotforecast2!"
    return greeting
File without changes
@@ -0,0 +1,130 @@
1
+ """Data structures for input and processed data."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import List, Optional
6
+
7
+ import pandas as pd
8
+
9
+ from spotforecast2.utils.convert_to_utc import convert_to_utc
10
+
11
+
12
@dataclass
class Data:
    """Container for input time series data.

    Attributes:
        data: pandas DataFrame containing the input time series data.
    """

    data: pd.DataFrame

    @staticmethod
    def _ensure_freq(df: pd.DataFrame) -> pd.DataFrame:
        """Attach an inferred frequency to ``df.index`` when none is set.

        This is a deliberate best-effort step: if the frequency cannot be
        inferred or assigned, the index is left untouched.

        Args:
            df (pd.DataFrame): DataFrame whose index should carry a frequency.

        Returns:
            pd.DataFrame: The same DataFrame object that was passed in.
        """
        if df.index.freq is None:
            try:
                df.index.freq = pd.infer_freq(df.index)
            except (ValueError, TypeError):
                # Too few points, or a non-conforming/unsupported index:
                # keep the index without a frequency instead of failing.
                pass
        return df

    @classmethod
    def from_csv(
        cls,
        csv_path: Path,
        timezone: Optional[str],
        columns: Optional[List[str]] = None,
        parse_dates=True,
        index_col=0,
        **kwargs,
    ) -> "Data":
        """Load data from a CSV file.

        The CSV must contain a datetime column that becomes the DataFrame
        index. The loaded frame is passed through ``convert_to_utc`` together
        with ``timezone``, and a frequency is inferred for the index when
        possible.

        Args:
            csv_path (Path): Path to the CSV file.
            timezone (Optional[str]): Timezone forwarded to ``convert_to_utc``
                (intended for indexes that carry no timezone).
            columns (Optional[List[str]]): Column names to include. If
                provided, only these columns (plus the index column) are read
                from the CSV. If None, all columns are loaded.
            parse_dates (bool or list, optional): Passed to ``pd.read_csv``.
                Defaults to True.
            index_col (int or str, optional): Column to use as index, given as
                a position in the file or as a column name. Defaults to 0.
            **kwargs (Any): Additional keyword arguments forwarded to
                ``pd.read_csv``.

        Returns:
            Data: Instance containing the loaded DataFrame.

        Raises:
            ValueError: Presumably raised by ``convert_to_utc`` when the index
                is not a DatetimeIndex, or is timezone-naive and no timezone
                is given (that behaviour is defined in ``convert_to_utc``).

        Examples:
            >>> data = Data.from_csv(
            ...     Path("data.csv"),
            ...     timezone="UTC",
            ...     columns=["target_col"]
            ... )
        """
        usecols = None
        if columns is not None:
            if isinstance(index_col, int):
                # Resolve the positional index column to its name by reading
                # only the header. Dialect-related options must be forwarded,
                # otherwise e.g. a ';'-separated file yields a wrong header.
                dialect_keys = (
                    "sep",
                    "delimiter",
                    "header",
                    "names",
                    "skiprows",
                    "encoding",
                    "encoding_errors",
                    "compression",
                    "quotechar",
                    "comment",
                    "skipinitialspace",
                    "dialect",
                    "engine",
                    "storage_options",
                )
                header_kwargs = {k: kwargs[k] for k in dialect_keys if k in kwargs}
                header_df = pd.read_csv(csv_path, nrows=0, **header_kwargs)
                index_col_name = header_df.columns[index_col]
            else:
                index_col_name = index_col
            # Avoid listing the index column twice if the caller included it.
            usecols = [index_col_name] + [
                col for col in columns if col != index_col_name
            ]
            # With ``usecols`` a positional ``index_col`` would be applied to
            # the filtered columns rather than to the file's columns, so the
            # resolved name is used to select the intended index column.
            index_col = index_col_name

        df = pd.read_csv(
            csv_path,
            parse_dates=parse_dates,
            index_col=index_col,
            usecols=usecols,
            **kwargs,
        )
        df = convert_to_utc(df, timezone)
        return cls(data=cls._ensure_freq(df))

    @classmethod
    def from_dataframe(
        cls,
        df: pd.DataFrame,
        timezone: Optional[str],
        columns: Optional[List[str]] = None,
    ) -> "Data":
        """Create a new Data instance from an existing DataFrame.

        The DataFrame is passed through ``convert_to_utc`` together with
        ``timezone``; afterwards the optional column selection is applied and
        a frequency is inferred for the index when possible.

        Args:
            df (pd.DataFrame): Input DataFrame containing data.
            timezone (Optional[str]): Timezone forwarded to ``convert_to_utc``
                (intended for indexes that carry no timezone).
            columns (Optional[List[str]]): Column names to include. If
                provided, only these columns are selected (as a copy). If
                None, all columns are used.

        Returns:
            Data: Instance containing the resulting DataFrame.

        Raises:
            ValueError: Presumably raised by ``convert_to_utc`` when the index
                is not a DatetimeIndex, or is timezone-naive and no timezone
                is given (that behaviour is defined in ``convert_to_utc``).
        """
        df = convert_to_utc(df, timezone)

        if columns is not None:
            df = df[columns].copy()

        # NOTE(review): when ``columns`` is None the frequency is set on the
        # index of whatever ``convert_to_utc`` returned; confirm that helper
        # returns a new object if the caller's frame must stay untouched.
        return cls(data=cls._ensure_freq(df))
@@ -0,0 +1,209 @@
1
+ import pandas as pd
2
+ from spotforecast2.data.data import Data
3
+ from pathlib import Path
4
+ from os import environ
5
+ from typing import Optional, Union
6
+ from spotforecast2.utils.generate_holiday import create_holiday_df
7
+ from pandas import Timestamp
8
+ from spotforecast2.weather.weather_client import WeatherService
9
+
10
+
11
def get_data_home(data_home: Optional[Union[str, Path]] = None) -> Path:
    """Return the directory in which datasets are stored, creating it if needed.

    When no explicit location is given, the value of the
    'SPOTFORECAST2_DATA' environment variable is used; if that variable is
    not set, the folder 'spotforecast2_data' inside the user home directory is
    used. A leading '~' is expanded to the user home folder and the result is
    made absolute. Missing directories (including parents) are created.

    Args:
        data_home (str or pathlib.Path, optional):
            Explicit path to the spotforecast data directory. If `None`, the
            environment variable or the default `~/spotforecast2_data` applies.

    Returns:
        pathlib.Path: Absolute path to the spotforecast data directory.

    Examples:
        >>> from pathlib import Path
        >>> get_data_home()
        PosixPath('/home/user/spotforecast2_data')
        >>> get_data_home(Path('/tmp/spotforecast2_data'))
        PosixPath('/tmp/spotforecast2_data')
    """
    if data_home is None:
        fallback = Path.home() / "spotforecast2_data"
        data_home = environ.get("SPOTFORECAST2_DATA", fallback)

    # Normalise to an absolute Path with '~' expanded.
    location = Path(data_home).expanduser().absolute()

    # Create the directory tree; an already existing directory is fine.
    location.mkdir(parents=True, exist_ok=True)
    return location
44
+
45
+
46
def fetch_data(
    filename: str = "integrated_raw_data.csv",
    columns: Optional[list] = None,
    index_col: int = 0,
    parse_dates: bool = True,
    dayfirst: bool = False,
    timezone: str = "UTC",
) -> pd.DataFrame:
    """Load selected columns of a CSV dataset from the data home directory.

    The file is looked up inside the directory returned by
    `get_data_home()` and read through `Data.from_csv`.

    Args:
        filename (str):
            Name of the CSV file inside the data home directory.
        columns (list):
            Columns to include in the dataset. Must be specified and non-empty.
        index_col (int):
            Column index to be used as the index.
        parse_dates (bool):
            Whether to parse dates in the index column.
        dayfirst (bool):
            Whether the day comes first in date parsing.
        timezone (str):
            Timezone passed on to `Data.from_csv`.

    Returns:
        pd.DataFrame: The DataFrame held by the loaded `Data` instance.

    Raises:
        ValueError: If columns is None or empty.
        FileNotFoundError: If the file does not exist in the data home directory.

    Examples:
        >>> from spotforecast2.data.fetch_data import fetch_data
        >>> data = fetch_data(columns=["col1", "col2"])
        >>> data.head()
    """
    nothing_requested = columns is None or len(columns) == 0
    if nothing_requested:
        raise ValueError("columns must be specified and cannot be empty.")

    source = get_data_home() / filename
    if not Path(source).is_file():
        raise FileNotFoundError(f"The file {source} does not exist.")

    loaded = Data.from_csv(
        csv_path=source,
        timezone=timezone,
        columns=columns,
        parse_dates=parse_dates,
        index_col=index_col,
        dayfirst=dayfirst,
    )
    return loaded.data
99
+
100
+
101
def fetch_holiday_data(
    start: str | Timestamp,
    end: str | Timestamp,
    tz: str = "UTC",
    freq: str = "h",
    country_code: str = "DE",
    state: str = "NW",
) -> pd.DataFrame:
    """Build the holiday DataFrame for a period via `create_holiday_df`.

    All arguments are forwarded unchanged as keyword arguments.

    Args:
        start (str or pd.Timestamp):
            Start date of the dataset period.
        end (str or pd.Timestamp):
            End date of the dataset period.
        tz (str):
            Timezone for the holiday data.
        freq (str):
            Frequency of the holiday data.
        country_code (str):
            Country code for the holidays.
        state (str):
            State code for the holidays.

    Returns:
        pd.DataFrame: The result of `create_holiday_df`.

    Examples:
        >>> from spotforecast2.data.fetch_data import fetch_holiday_data
        >>> holiday_df = fetch_holiday_data(
        ...     start='2023-01-01T00:00',
        ...     end='2023-01-10T00:00',
        ...     tz='UTC',
        ...     freq='h',
        ...     country_code='DE',
        ...     state='NW'
        ... )
        >>> holiday_df.head()
    """
    return create_holiday_df(
        start=start,
        end=end,
        tz=tz,
        freq=freq,
        country_code=country_code,
        state=state,
    )
146
+
147
+
148
def fetch_weather_data(
    cov_start: str,
    cov_end: str,
    latitude: float = 51.5136,
    longitude: float = 7.4653,
    timezone: str = "UTC",
    freq: str = "h",
    fallback_on_failure: bool = True,
    cached=True,
) -> pd.DataFrame:
    """Obtain a weather DataFrame from `WeatherService` for a date range.

    A `WeatherService` is created for the given coordinates and its
    `get_dataframe` method is called with the remaining arguments.

    Args:
        cov_start (str):
            Start date for covariate data.
        cov_end (str):
            End date for covariate data.
        latitude (float):
            Latitude of the location for weather data. Default is 51.5136 (Dortmund).
        longitude (float):
            Longitude of the location for weather data. Default is 7.4653 (Dortmund).
        timezone (str):
            Timezone for the weather data.
        freq (str):
            Frequency of the weather data.
        fallback_on_failure (bool):
            Forwarded to `WeatherService.get_dataframe`.
        cached (bool):
            If true, the service is given the cache path
            `weather_cache.parquet` inside the directory returned by
            `get_data_home()`; otherwise `None` is passed as cache path.

    Returns:
        pd.DataFrame: The DataFrame returned by `WeatherService.get_dataframe`.

    Examples:
        >>> from spotforecast2.data.fetch_data import fetch_weather_data
        >>> weather_df = fetch_weather_data(
        ...     cov_start='2023-01-01T00:00',
        ...     cov_end='2023-01-11T00:00',
        ...     latitude=51.5136,
        ...     longitude=7.4653,
        ...     timezone='UTC',
        ...     freq='h',
        ...     fallback_on_failure=True,
        ...     cached=True
        ... )
        >>> weather_df.head()
    """
    # Only resolve (and thereby create) the data home when caching is requested.
    cache_path = get_data_home() / "weather_cache.parquet" if cached else None

    weather_service = WeatherService(
        latitude=latitude,
        longitude=longitude,
        cache_path=cache_path,
    )
    return weather_service.get_dataframe(
        start=cov_start,
        end=cov_end,
        timezone=timezone,
        freq=freq,
        fallback_on_failure=fallback_on_failure,
    )