imsciences 1.0.2__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +2 -0
- imsciences/oecd_pull.py +423 -0
- imsciences/pull-IMS-24Ltp-3.py +3132 -0
- imsciences/pull.py +137 -218
- imsciences-1.1.6.dist-info/METADATA +365 -0
- imsciences-1.0.2.dist-info/METADATA → imsciences-1.1.6.dist-info/PKG-INFO-IMS-24Ltp-3 +1 -1
- imsciences-1.1.6.dist-info/RECORD +14 -0
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info}/WHEEL +1 -1
- imsciences/unittesting.py +0 -1314
- imsciences-1.0.2.dist-info/RECORD +0 -12
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info}/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info/licenses}/LICENSE.txt +0 -0
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info}/top_level.txt +0 -0
imsciences/pull-IMS-24Ltp-3.py
@@ -0,0 +1,3132 @@
+import importlib
+import re
+import time
+import urllib.request
+import xml.etree.ElementTree as ET
+from datetime import datetime, timedelta
+from io import StringIO
+
+import numpy as np
+import pandas as pd
+import requests
+import yfinance as yf
+from bs4 import BeautifulSoup
+from dateutil.easter import easter
+from fredapi import Fred
+from geopy.geocoders import Nominatim
+
+from imsciences.mmm import dataprocessing
+
+ims_proc = dataprocessing()
+
+
+class datapull:
+    def help(self):
+        print("This is the help section. The functions in the package are as follows:")
+
+        print("\n1. pull_fred_data")
+        print(" - Description: Get data from FRED by using series id tokens.")
+        print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
+        print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
+
+        print("\n2. pull_boe_data")
+        print(" - Description: Fetch and process Bank of England interest rate data.")
+        print(" - Usage: pull_boe_data(week_commencing)")
+        print(" - Example: pull_boe_data('mon')")
+
+        print("\n3. pull_oecd")
+        print(
+            " - Description: Fetch macroeconomic data from OECD for a specified country.",
+        )
+        print(
+            " - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')",
+        )
+        print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
+
+        print("\n4. get_google_mobility_data")
+        print(" - Description: Fetch Google Mobility data for the specified country.")
+        print(" - Usage: get_google_mobility_data(country, wc)")
+        print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
+
+        print("\n5. pull_seasonality")
+        print(
+            " - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.",
+        )
+        print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
+        print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
+
+        print("\n6. pull_weather")
+        print(
+            " - Description: Fetch and process historical weather data for the specified country.",
+        )
+        print(" - Usage: pull_weather(week_commencing, start_date, country)")
+        print(" - Example: pull_weather('mon', '2020-01-01', ['GBR'])")
+
+        print("\n7. pull_macro_ons_uk")
+        print(
+            " - Description: Fetch and process time series data from the Beta ONS API.",
+        )
+        print(" - Usage: pull_macro_ons_uk(aditional_list, week_commencing, sector)")
+        print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
+
+        print("\n8. pull_yfinance")
+        print(
+            " - Description: Fetch and process time series data from the Beta ONS API.",
+        )
+        print(" - Usage: pull_yfinance(tickers, week_start_day)")
+        print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
+
+        print("\n9. pull_sports_events")
+        print(
+            " - Description: Pull a veriety of sports events primaraly football and rugby.",
+        )
+        print(" - Usage: pull_sports_events(start_date, week_commencing)")
+        print(" - Example: pull_sports_events('2020-01-01', 'mon')")
+
+    ############################################################### MACRO ##########################################################################
+
+    def pull_fred_data(
+        self,
+        week_commencing: str = "mon",
+        series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"],
+    ) -> pd.DataFrame:
+        """
+        Parameters
+        ----------
+        week_commencing : str
+            specify the day for the week commencing, the default is 'sun' (e.g., 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+
+        series_id_list : list[str]
+            provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
+            ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
+
+        Returns
+        -------
+        pd.DataFrame
+            Return a data frame with FRED data according to the series IDs provided
+
+        """
+        # Fred API
+        fred = Fred(api_key="76f5f8156145fdb8fbaf66f1eb944f8a")
+
+        # Fetch the metadata for each series to get the full names
+        series_names = {
+            series_id: fred.get_series_info(series_id).title
+            for series_id in series_id_list
+        }
+
+        # Download data from series id list
+        fred_series = {
+            series_id: fred.get_series(series_id) for series_id in series_id_list
+        }
+
+        # Data processing
+        date_range = {
+            "OBS": pd.date_range(
+                "1950-01-01",
+                datetime.today().strftime("%Y-%m-%d"),
+                freq="d",
+            ),
+        }
+        fred_series_df = pd.DataFrame(date_range)
+
+        for series_id, series_data in fred_series.items():
+            series_data = series_data.reset_index()
+            series_data.columns = [
+                "OBS",
+                series_names[series_id],
+            ]  # Use the series name as the column header
+            fred_series_df = pd.merge_asof(
+                fred_series_df,
+                series_data,
+                on="OBS",
+                direction="backward",
+            )
+
+        # Handle duplicate columns
+        for col in fred_series_df.columns:
+            if "_x" in col:
+                base_col = col.replace("_x", "")
+                fred_series_df[base_col] = fred_series_df[col].combine_first(
+                    fred_series_df[base_col + "_y"],
+                )
+                fred_series_df.drop([col, base_col + "_y"], axis=1, inplace=True)
+
+        # Ensure sum_columns are present in the DataFrame
+        sum_columns = [
+            series_names[series_id]
+            for series_id in series_id_list
+            if series_names[series_id] in fred_series_df.columns
+        ]
+
+        # Aggregate results by week
+        fred_df_final = ims_proc.aggregate_daily_to_wc_wide(
+            df=fred_series_df,
+            date_column="OBS",
+            group_columns=[],
+            sum_columns=sum_columns,
+            wc=week_commencing,
+            aggregation="average",
+        )
+
+        # Remove anything after the instance of any ':' in the column names and rename, except for 'OBS'
+        fred_df_final.columns = [
+            "OBS"
+            if col == "OBS"
+            else "macro_" + col.lower().split(":")[0].replace(" ", "_")
+            for col in fred_df_final.columns
+        ]
+
+        return fred_df_final
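For orientation, a minimal usage sketch of the method above (illustrative only, not part of the diff; the import path imsciences.pull is assumed from the file list at the top of this page):

    from imsciences.pull import datapull  # assumed import path

    dp = datapull()
    # Weekly-averaged FRED series, week commencing Monday; columns come back prefixed "macro_"
    fred_weekly = dp.pull_fred_data(week_commencing="mon", series_id_list=["GPDIC1"])
    print(fred_weekly.head())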
+
+    def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
+        """
+        Fetch and process Bank of England interest rate data.
+
+        Args:
+            week_commencing (str): The starting day of the week for aggregation.
+                Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
+                Default is "mon".
+            max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
+            delay (int): Delay in seconds between retry attempts. Default is 5.
+
+        Returns:
+            pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+                and 'macro_boe_intr_rate' contains the average interest rate for the week.
+
+        """
+        # Week commencing dictionary
+        day_dict = {
+            "mon": 0,
+            "tue": 1,
+            "wed": 2,
+            "thu": 3,
+            "fri": 4,
+            "sat": 5,
+            "sun": 6,
+        }
+
+        # URL of the Bank of England data page
+        url = "https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp"
+
+        # Retry logic for HTTP request
+        for attempt in range(max_retries):
+            try:
+                # Set up headers to mimic a browser request
+                headers = {
+                    "User-Agent": (
+                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/91.0.4472.124 Safari/537.36"
+                    ),
+                }
+                response = requests.get(url, headers=headers)
+                response.raise_for_status()  # Raise an exception for HTTP errors
+                break
+            except requests.exceptions.RequestException as e:
+                print(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(delay)
+                else:
+                    raise
+
+        # Parse the HTML page
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Find the table on the page
+        table = soup.find("table")  # Locate the first table
+        table_html = str(table)  # Convert table to string
+        df = pd.read_html(StringIO(table_html))[
+            0
+        ]  # Use StringIO to wrap the table HTML
+
+        # Rename and clean up columns
+        df.rename(
+            columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"},
+            inplace=True,
+        )
+        df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
+        df.sort_values("OBS", inplace=True)
+
+        # Create a daily date range
+        date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
+        df_daily = pd.DataFrame(date_range, columns=["OBS"])
+
+        # Adjust each date to the specified week commencing day
+        df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+            lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7),
+        )
+
+        # Merge and forward-fill missing rates
+        df_daily = df_daily.merge(df, on="OBS", how="left")
+        df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
+
+        # Group by week commencing and calculate the average rate
+        df_final = (
+            df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"]
+            .mean()
+            .reset_index()
+        )
+        df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime(
+            "%d/%m/%Y",
+        )
+        df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
+
+        return df_final
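The week-commencing adjustment used above (and reused in pull_seasonality) snaps any calendar date back to the chosen start day. A standalone sketch of that arithmetic, independent of the class:

    from datetime import datetime, timedelta

    day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
    d = datetime(2024, 5, 15)  # a Wednesday
    week_start = d - timedelta(days=(d.weekday() - day_dict["mon"]) % 7)
    # week_start -> 2024-05-13, the Monday commencing that week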
+
+    def pull_oecd(
+        self,
+        country: str = "GBR",
+        week_commencing: str = "mon",
+        start_date: str = "2020-01-01",
+    ) -> pd.DataFrame:
+        """
+        Fetch and process time series data from the OECD API.
+
+        Args:
+            country (list): A string containing a 3-letter code the of country of interest (E.g: "GBR", "FRA", "USA", "DEU")
+            week_commencing (str): The starting day of the week for aggregation.
+                Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
+            start_date (str): Dataset start date in the format "YYYY-MM-DD"
+
+        Returns:
+            pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
+                commencing dates, and other columns contain the aggregated time series values.
+
+        """
+
+        def parse_quarter(date_str):
+            """Parses a string in 'YYYY-Q#' format into a datetime object."""
+            year, quarter = date_str.split("-")
+            quarter_number = int(quarter[1])
+            month = (quarter_number - 1) * 3 + 1
+            return pd.Timestamp(f"{year}-{month:02d}-01")
+
+        # Generate a date range from 1950-01-01 to today
+        date_range = pd.date_range(start=start_date, end=datetime.today(), freq="D")
+
+        url_details = [
+            [
+                "BCICP",
+                "SDD.STES,DSD_STES@DF_CLI,",
+                ".....",
+                "macro_business_confidence_index",
+            ],
+            [
+                "CCICP",
+                "SDD.STES,DSD_STES@DF_CLI,",
+                ".....",
+                "macro_consumer_confidence_index",
+            ],
+            [
+                "N.CPI",
+                "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+                "PA._T.N.GY",
+                "macro_cpi_total",
+            ],
+            [
+                "N.CPI",
+                "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+                "PA.CP041T043.N.GY",
+                "macro_cpi_housing",
+            ],
+            [
+                "N.CPI",
+                "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+                "PA.CP01.N.GY",
+                "macro_cpi_food",
+            ],
+            [
+                "N.CPI",
+                "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+                "PA.CP045_0722.N.GY",
+                "macro_cpi_energy",
+            ],
+            [
+                "UNE_LF_M",
+                "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,",
+                "._Z.Y._T.Y_GE15.",
+                "macro_unemployment_rate",
+            ],
+            [
+                "EAR",
+                "SDD.TPS,DSD_EAR@DF_HOU_EAR,",
+                ".Y..S1D",
+                "macro_private_hourly_earnings",
+            ],
+            [
+                "RHP",
+                "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0",
+                "",
+                "macro_real_house_prices",
+            ],
+            [
+                "PRVM",
+                "SDD.STES,DSD_KEI@DF_KEI,4.0",
+                "IX.C..",
+                "macro_manufacturing_production_volume",
+            ],
+            [
+                "TOVM",
+                "SDD.STES,DSD_KEI@DF_KEI,4.0",
+                "IX...",
+                "macro_retail_trade_volume",
+            ],
+            ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
+            [
+                "IRLT",
+                "SDD.STES,DSD_KEI@DF_KEI,4.0",
+                "PA...",
+                "macro_long_term_interest_rate",
+            ],
+            [
+                "B1GQ",
+                "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1",
+                "._Z....GY.T0102",
+                "macro_gdp_growth_yoy",
+            ],
+        ]
+
+        # Create empty final dataframe
+        oecd_df_final = pd.DataFrame()
+
+        daily_df = pd.DataFrame({"OBS": date_range})
+        value_columns = []
+
+        # Iterate for each variable of interest
+        for series_details in url_details:
+            series = series_details[0]
+            dataset_id = series_details[1]
+            filter = series_details[2]
+            col_name = series_details[3]
+
+            # check if request was successful and determine the most granular data available
+            for freq in ["M", "Q", "A"]:
+                if series in ["UNE_LF_M", "EAR"]:
+                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
+                elif series in ["B1GQ"]:
+                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
+                else:
+                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
+
+                # Make the request to the OECD API for data
+                data_response = requests.get(data_url)
+
+                # Check if the request was successful
+                if data_response.status_code != 200:
+                    print(
+                        f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}",
+                    )
+                    url_test = False
+                    continue
+                url_test = True
+                break
+
+            # get data for the next variable if url doesn't exist
+            if url_test is False:
+                continue
+
+            root = ET.fromstring(data_response.content)
+
+            # Define namespaces if necessary (the namespace is included in the tags)
+            namespaces = {
+                "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
+            }
+
+            # Lists to store the data
+            dates = []
+            values = []
+
+            # Iterate over all <Obs> elements and extract date and value
+            for obs in root.findall(".//generic:Obs", namespaces):
+                # Extracting the time period (date)
+                time_period = obs.find(".//generic:ObsDimension", namespaces).get(
+                    "value",
+                )
+
+                # Extracting the observation value
+                value = obs.find(".//generic:ObsValue", namespaces).get("value")
+
+                # Storing the data
+                if time_period and value:
+                    dates.append(time_period)
+                    values.append(float(value))  # Convert value to float
+
+            # Add variable names that were found to a list
+            value_columns.append(col_name)
+
+            # Creating a DataFrame
+            data = pd.DataFrame({"OBS": dates, col_name: values})
+
+            # Convert date strings into datetime format
+            if freq == "Q":
+                data["OBS"] = data["OBS"].apply(parse_quarter)
+            else:
+                # Display the DataFrame
+                data["OBS"] = data["OBS"].apply(lambda x: datetime.strptime(x, "%Y-%m"))
+
+            # Sort data by chronological order
+            data.sort_values(by="OBS", inplace=True)
+
+            # Merge the data based on the observation date
+            daily_df = pd.merge_asof(
+                daily_df,
+                data[["OBS", col_name]],
+                on="OBS",
+                direction="backward",
+            )
+
+        # Ensure columns are numeric
+        for col in value_columns:
+            if col in daily_df.columns:
+                daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
+            else:
+                print(f"Column {col} not found in daily_df")
+
+        # Aggregate results by week
+        country_df = ims_proc.aggregate_daily_to_wc_wide(
+            df=daily_df,
+            date_column="OBS",
+            group_columns=[],
+            sum_columns=value_columns,
+            wc=week_commencing,
+            aggregation="average",
+        )
+
+        oecd_df_final = pd.concat(
+            [oecd_df_final, country_df],
+            axis=0,
+            ignore_index=True,
+        )
+
+        return oecd_df_final
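For reference, the default branch of the URL builder above produces SDMX requests of the following shape; the concrete values below are taken from the first url_details entry, and the snippet is an illustrative sketch rather than part of the package:

    country, freq = "GBR", "M"
    series, dataset_id, filt = "BCICP", "SDD.STES,DSD_STES@DF_CLI,", "....."
    data_url = (
        f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/"
        f"{country}.{freq}.{series}.{filt}?startPeriod=1950-01"
    )
    # A GET on data_url returns SDMX-ML; the <generic:Obs> elements are parsed into (date, value) pairs above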
+
+    def get_google_mobility_data(
+        self,
+        country="United Kingdom",
+        wc="mon",
+    ) -> pd.DataFrame:
+        """
+        Fetch Google Mobility data for the specified country.
+
+        Parameters
+        ----------
+        - country (str): The name of the country for which to fetch data.
+
+        Returns
+        -------
+        - pd.DataFrame: A DataFrame containing the Google Mobility data.
+
+        """
+        # URL of the Google Mobility Reports CSV file
+        url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
+
+        # Fetch the CSV file
+        response = requests.get(url)
+        if response.status_code != 200:
+            raise Exception(f"Failed to fetch data: {response.status_code}")
+
+        # Load the CSV file into a pandas DataFrame
+        csv_data = StringIO(response.text)
+        df = pd.read_csv(csv_data, low_memory=False)
+
+        # Filter the DataFrame for the specified country
+        country_df = df[df["country_region"] == country]
+
+        final_covid = ims_proc.aggregate_daily_to_wc_wide(
+            country_df,
+            "date",
+            [],
+            [
+                "retail_and_recreation_percent_change_from_baseline",
+                "grocery_and_pharmacy_percent_change_from_baseline",
+                "parks_percent_change_from_baseline",
+                "transit_stations_percent_change_from_baseline",
+                "workplaces_percent_change_from_baseline",
+                "residential_percent_change_from_baseline",
+            ],
+            wc,
+            "average",
+        )
+
+        final_covid1 = ims_proc.rename_cols(final_covid, "covid_")
+        return final_covid1
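A usage sketch for the mobility pull (illustrative only; dp is the datapull instance from the earlier example):

    mobility_weekly = dp.get_google_mobility_data(country="United Kingdom", wc="mon")
    # Weekly averages of the six percent-change-from-baseline series, columns prefixed "covid_"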
|
|
554
|
+
|
|
555
|
+
############################################################### Seasonality ##########################################################################
|
|
556
|
+
|
|
557
|
+
def pull_seasonality(self, week_commencing, start_date, countries):
|
|
558
|
+
"""
|
|
559
|
+
Generates a DataFrame with weekly seasonality features.
|
|
560
|
+
|
|
561
|
+
Args:
|
|
562
|
+
week_commencing (str): The starting day of the week ('mon', 'tue', ..., 'sun').
|
|
563
|
+
start_date (str): The start date in 'YYYY-MM-DD' format.
|
|
564
|
+
countries (list): A list of country codes (e.g., ['GB', 'US']) for holidays.
|
|
565
|
+
|
|
566
|
+
Returns:
|
|
567
|
+
pd.DataFrame: A DataFrame indexed by week start date, containing various
|
|
568
|
+
seasonal dummy variables, holidays, trend, and constant.
|
|
569
|
+
The date column is named 'OBS'.
|
|
570
|
+
|
|
571
|
+
"""
|
|
572
|
+
# ---------------------------------------------------------------------
|
|
573
|
+
# 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
|
|
574
|
+
# ---------------------------------------------------------------------
|
|
575
|
+
day_dict = {
|
|
576
|
+
"mon": 0,
|
|
577
|
+
"tue": 1,
|
|
578
|
+
"wed": 2,
|
|
579
|
+
"thu": 3,
|
|
580
|
+
"fri": 4,
|
|
581
|
+
"sat": 5,
|
|
582
|
+
"sun": 6,
|
|
583
|
+
}
|
|
584
|
+
if week_commencing not in day_dict:
|
|
585
|
+
raise ValueError(
|
|
586
|
+
f"Invalid week_commencing value: {week_commencing}. Use one of {list(day_dict.keys())}",
|
|
587
|
+
)
|
|
588
|
+
|
|
589
|
+
# ---------------------------------------------------------------------
|
|
590
|
+
# 0.2 Setup: dictionary continents and countries
|
|
591
|
+
# ---------------------------------------------------------------------
|
|
592
|
+
COUNTRY_TO_CONTINENT = {
|
|
593
|
+
# Europe
|
|
594
|
+
"Austria": "europe",
|
|
595
|
+
"Belarus": "europe",
|
|
596
|
+
"Belgium": "europe",
|
|
597
|
+
"Bulgaria": "europe",
|
|
598
|
+
"Croatia": "europe",
|
|
599
|
+
"Cyprus": "europe",
|
|
600
|
+
"Czechia": "europe",
|
|
601
|
+
"CzechRepublic": "europe",
|
|
602
|
+
"Denmark": "europe",
|
|
603
|
+
"Estonia": "europe",
|
|
604
|
+
"EuropeanCentralBank": "europe",
|
|
605
|
+
"Finland": "europe",
|
|
606
|
+
"France": "europe",
|
|
607
|
+
"FranceAlsaceMoselle": "europe",
|
|
608
|
+
"Germany": "europe",
|
|
609
|
+
"GermanyBaden": "europe",
|
|
610
|
+
"GermanyBavaria": "europe",
|
|
611
|
+
"GermanyBerlin": "europe",
|
|
612
|
+
"GermanyBrandenburg": "europe",
|
|
613
|
+
"GermanyBremen": "europe",
|
|
614
|
+
"GermanyHamburg": "europe",
|
|
615
|
+
"GermanyHesse": "europe",
|
|
616
|
+
"GermanyLowerSaxony": "europe",
|
|
617
|
+
"GermanyMecklenburgVorpommern": "europe",
|
|
618
|
+
"GermanyNorthRhineWestphalia": "europe",
|
|
619
|
+
"GermanyRhinelandPalatinate": "europe",
|
|
620
|
+
"GermanySaarland": "europe",
|
|
621
|
+
"GermanySaxony": "europe",
|
|
622
|
+
"GermanySaxonyAnhalt": "europe",
|
|
623
|
+
"GermanySchleswigHolstein": "europe",
|
|
624
|
+
"GermanyThuringia": "europe",
|
|
625
|
+
"Greece": "europe",
|
|
626
|
+
"Hungary": "europe",
|
|
627
|
+
"Iceland": "europe",
|
|
628
|
+
"Ireland": "europe",
|
|
629
|
+
"Italy": "europe",
|
|
630
|
+
"Latvia": "europe",
|
|
631
|
+
"Lithuania": "europe",
|
|
632
|
+
"Luxembourg": "europe",
|
|
633
|
+
"Malta": "europe",
|
|
634
|
+
"Monaco": "europe",
|
|
635
|
+
"Netherlands": "europe",
|
|
636
|
+
"Norway": "europe",
|
|
637
|
+
"Poland": "europe",
|
|
638
|
+
"Portugal": "europe",
|
|
639
|
+
"Romania": "europe",
|
|
640
|
+
"Russia": "europe",
|
|
641
|
+
"Serbia": "europe",
|
|
642
|
+
"Slovakia": "europe",
|
|
643
|
+
"Slovenia": "europe",
|
|
644
|
+
"Spain": "europe",
|
|
645
|
+
"SpainAndalusia": "europe",
|
|
646
|
+
"SpainAragon": "europe",
|
|
647
|
+
"SpainAsturias": "europe",
|
|
648
|
+
"SpainBalearicIslands": "europe",
|
|
649
|
+
"SpainBasqueCountry": "europe",
|
|
650
|
+
"SpainCanaryIslands": "europe",
|
|
651
|
+
"SpainCantabria": "europe",
|
|
652
|
+
"SpainCastileAndLeon": "europe",
|
|
653
|
+
"SpainCastillaLaMancha": "europe",
|
|
654
|
+
"SpainCatalonia": "europe",
|
|
655
|
+
"SpainExtremadura": "europe",
|
|
656
|
+
"SpainGalicia": "europe",
|
|
657
|
+
"SpainLaRioja": "europe",
|
|
658
|
+
"SpainMadrid": "europe",
|
|
659
|
+
"SpainMurcia": "europe",
|
|
660
|
+
"SpainNavarre": "europe",
|
|
661
|
+
"SpainValencia": "europe",
|
|
662
|
+
"Sweden": "europe",
|
|
663
|
+
"Switzerland": "europe",
|
|
664
|
+
"Ukraine": "europe",
|
|
665
|
+
"UnitedKingdom": "europe",
|
|
666
|
+
# Americas
|
|
667
|
+
"Argentina": "america",
|
|
668
|
+
"Barbados": "america",
|
|
669
|
+
"Brazil": "america",
|
|
670
|
+
"Canada": "america",
|
|
671
|
+
"Chile": "america",
|
|
672
|
+
"Colombia": "america",
|
|
673
|
+
"Mexico": "america",
|
|
674
|
+
"Panama": "america",
|
|
675
|
+
"Paraguay": "america",
|
|
676
|
+
"Peru": "america",
|
|
677
|
+
"UnitedStates": "usa",
|
|
678
|
+
# US States
|
|
679
|
+
"Alabama": "usa.states",
|
|
680
|
+
"Alaska": "usa.states",
|
|
681
|
+
"Arizona": "usa.states",
|
|
682
|
+
"Arkansas": "usa.states",
|
|
683
|
+
"California": "usa.states",
|
|
684
|
+
"Colorado": "usa.states",
|
|
685
|
+
"Connecticut": "usa.states",
|
|
686
|
+
"Delaware": "usa.states",
|
|
687
|
+
"DistrictOfColumbia": "usa.states",
|
|
688
|
+
"Florida": "usa.states",
|
|
689
|
+
"Georgia": "usa.states",
|
|
690
|
+
"Hawaii": "usa.states",
|
|
691
|
+
"Idaho": "usa.states",
|
|
692
|
+
"Illinois": "usa.states",
|
|
693
|
+
"Indiana": "usa.states",
|
|
694
|
+
"Iowa": "usa.states",
|
|
695
|
+
"Kansas": "usa.states",
|
|
696
|
+
"Kentucky": "usa.states",
|
|
697
|
+
"Louisiana": "usa.states",
|
|
698
|
+
"Maine": "usa.states",
|
|
699
|
+
"Maryland": "usa.states",
|
|
700
|
+
"Massachusetts": "usa.states",
|
|
701
|
+
"Michigan": "usa.states",
|
|
702
|
+
"Minnesota": "usa.states",
|
|
703
|
+
"Mississippi": "usa.states",
|
|
704
|
+
"Missouri": "usa.states",
|
|
705
|
+
"Montana": "usa.states",
|
|
706
|
+
"Nebraska": "usa.states",
|
|
707
|
+
"Nevada": "usa.states",
|
|
708
|
+
"NewHampshire": "usa.states",
|
|
709
|
+
"NewJersey": "usa.states",
|
|
710
|
+
"NewMexico": "usa.states",
|
|
711
|
+
"NewYork": "usa.states",
|
|
712
|
+
"NorthCarolina": "usa.states",
|
|
713
|
+
"NorthDakota": "usa.states",
|
|
714
|
+
"Ohio": "usa.states",
|
|
715
|
+
"Oklahoma": "usa.states",
|
|
716
|
+
"Oregon": "usa.states",
|
|
717
|
+
"Pennsylvania": "usa.states",
|
|
718
|
+
"RhodeIsland": "usa.states",
|
|
719
|
+
"SouthCarolina": "usa.states",
|
|
720
|
+
"SouthDakota": "usa.states",
|
|
721
|
+
"Tennessee": "usa.states",
|
|
722
|
+
"Texas": "usa.states",
|
|
723
|
+
"Utah": "usa.states",
|
|
724
|
+
"Vermont": "usa.states",
|
|
725
|
+
"Virginia": "usa.states",
|
|
726
|
+
"Washington": "usa.states",
|
|
727
|
+
"WestVirginia": "usa.states",
|
|
728
|
+
"Wisconsin": "usa.states",
|
|
729
|
+
"Wyoming": "usa.states",
|
|
730
|
+
# Oceania
|
|
731
|
+
"Australia": "oceania",
|
|
732
|
+
"AustraliaCapitalTerritory": "oceania",
|
|
733
|
+
"AustraliaNewSouthWales": "oceania",
|
|
734
|
+
"AustraliaNorthernTerritory": "oceania",
|
|
735
|
+
"AustraliaQueensland": "oceania",
|
|
736
|
+
"AustraliaSouthAustralia": "oceania",
|
|
737
|
+
"AustraliaTasmania": "oceania",
|
|
738
|
+
"AustraliaVictoria": "oceania",
|
|
739
|
+
"AustraliaWesternAustralia": "oceania",
|
|
740
|
+
"MarshallIslands": "oceania",
|
|
741
|
+
"NewZealand": "oceania",
|
|
742
|
+
# Asia
|
|
743
|
+
"China": "asia",
|
|
744
|
+
"HongKong": "asia",
|
|
745
|
+
"India": "asia",
|
|
746
|
+
"Israel": "asia",
|
|
747
|
+
"Japan": "asia",
|
|
748
|
+
"Kazakhstan": "asia",
|
|
749
|
+
"Malaysia": "asia",
|
|
750
|
+
"Qatar": "asia",
|
|
751
|
+
"Singapore": "asia",
|
|
752
|
+
"SouthKorea": "asia",
|
|
753
|
+
"Taiwan": "asia",
|
|
754
|
+
"Turkey": "asia",
|
|
755
|
+
"Vietnam": "asia",
|
|
756
|
+
# Africa
|
|
757
|
+
"Algeria": "africa",
|
|
758
|
+
"Angola": "africa",
|
|
759
|
+
"Benin": "africa",
|
|
760
|
+
"IvoryCoast": "africa",
|
|
761
|
+
"Kenya": "africa",
|
|
762
|
+
"Madagascar": "africa",
|
|
763
|
+
"Nigeria": "africa",
|
|
764
|
+
"SaoTomeAndPrincipe": "africa",
|
|
765
|
+
"SouthAfrica": "africa",
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
# Dictionary mapping ISO country codes to their corresponding workalendar country names
|
|
769
|
+
holiday_country = {
|
|
770
|
+
# Major countries with required formats
|
|
771
|
+
"GB": "UnitedKingdom",
|
|
772
|
+
"US": "UnitedStates",
|
|
773
|
+
"USA": "UnitedStates", # Alternative code for US
|
|
774
|
+
"CA": "Canada",
|
|
775
|
+
"ZA": "SouthAfrica",
|
|
776
|
+
"FR": "France",
|
|
777
|
+
"DE": "Germany",
|
|
778
|
+
"AU": "Australia",
|
|
779
|
+
"AUS": "Australia", # Alternative code for Australia
|
|
780
|
+
# European countries
|
|
781
|
+
"AT": "Austria",
|
|
782
|
+
"BY": "Belarus",
|
|
783
|
+
"BE": "Belgium",
|
|
784
|
+
"BG": "Bulgaria",
|
|
785
|
+
"HR": "Croatia",
|
|
786
|
+
"CY": "Cyprus",
|
|
787
|
+
"CZ": "Czechia",
|
|
788
|
+
"DK": "Denmark",
|
|
789
|
+
"EE": "Estonia",
|
|
790
|
+
"FI": "Finland",
|
|
791
|
+
"GR": "Greece",
|
|
792
|
+
"HU": "Hungary",
|
|
793
|
+
"IS": "Iceland",
|
|
794
|
+
"IE": "Ireland",
|
|
795
|
+
"IT": "Italy",
|
|
796
|
+
"LV": "Latvia",
|
|
797
|
+
"LT": "Lithuania",
|
|
798
|
+
"LU": "Luxembourg",
|
|
799
|
+
"MT": "Malta",
|
|
800
|
+
"MC": "Monaco",
|
|
801
|
+
"NL": "Netherlands",
|
|
802
|
+
"NO": "Norway",
|
|
803
|
+
"PL": "Poland",
|
|
804
|
+
"PT": "Portugal",
|
|
805
|
+
"RO": "Romania",
|
|
806
|
+
"RU": "Russia",
|
|
807
|
+
"RS": "Serbia",
|
|
808
|
+
"SK": "Slovakia",
|
|
809
|
+
"SI": "Slovenia",
|
|
810
|
+
"ES": "Spain",
|
|
811
|
+
"SE": "Sweden",
|
|
812
|
+
"CH": "Switzerland",
|
|
813
|
+
"UA": "Ukraine",
|
|
814
|
+
# Americas
|
|
815
|
+
"AR": "Argentina",
|
|
816
|
+
"BB": "Barbados",
|
|
817
|
+
"BR": "Brazil",
|
|
818
|
+
"CL": "Chile",
|
|
819
|
+
"CO": "Colombia",
|
|
820
|
+
"MX": "Mexico",
|
|
821
|
+
"PA": "Panama",
|
|
822
|
+
"PY": "Paraguay",
|
|
823
|
+
"PE": "Peru",
|
|
824
|
+
# USA States (using common abbreviations)
|
|
825
|
+
"AL": "Alabama",
|
|
826
|
+
"AK": "Alaska",
|
|
827
|
+
"AZ": "Arizona",
|
|
828
|
+
"AR": "Arkansas",
|
|
829
|
+
"CA_US": "California",
|
|
830
|
+
"CO_US": "Colorado",
|
|
831
|
+
"CT": "Connecticut",
|
|
832
|
+
"DE_US": "Delaware",
|
|
833
|
+
"DC": "DistrictOfColumbia",
|
|
834
|
+
"FL": "Florida",
|
|
835
|
+
"GA": "Georgia",
|
|
836
|
+
"HI": "Hawaii",
|
|
837
|
+
"ID": "Idaho",
|
|
838
|
+
"IL": "Illinois",
|
|
839
|
+
"IN": "Indiana",
|
|
840
|
+
"IA": "Iowa",
|
|
841
|
+
"KS": "Kansas",
|
|
842
|
+
"KY": "Kentucky",
|
|
843
|
+
"LA": "Louisiana",
|
|
844
|
+
"ME": "Maine",
|
|
845
|
+
"MD": "Maryland",
|
|
846
|
+
"MA": "Massachusetts",
|
|
847
|
+
"MI": "Michigan",
|
|
848
|
+
"MN": "Minnesota",
|
|
849
|
+
"MS": "Mississippi",
|
|
850
|
+
"MO": "Missouri",
|
|
851
|
+
"MT": "Montana",
|
|
852
|
+
"NE": "Nebraska",
|
|
853
|
+
"NV": "Nevada",
|
|
854
|
+
"NH": "NewHampshire",
|
|
855
|
+
"NJ": "NewJersey",
|
|
856
|
+
"NM": "NewMexico",
|
|
857
|
+
"NY": "NewYork",
|
|
858
|
+
"NC": "NorthCarolina",
|
|
859
|
+
"ND": "NorthDakota",
|
|
860
|
+
"OH": "Ohio",
|
|
861
|
+
"OK": "Oklahoma",
|
|
862
|
+
"OR": "Oregon",
|
|
863
|
+
"PA_US": "Pennsylvania",
|
|
864
|
+
"RI": "RhodeIsland",
|
|
865
|
+
"SC": "SouthCarolina",
|
|
866
|
+
"SD": "SouthDakota",
|
|
867
|
+
"TN": "Tennessee",
|
|
868
|
+
"TX": "Texas",
|
|
869
|
+
"UT": "Utah",
|
|
870
|
+
"VT": "Vermont",
|
|
871
|
+
"VA": "Virginia",
|
|
872
|
+
"WA": "Washington",
|
|
873
|
+
"WV": "WestVirginia",
|
|
874
|
+
"WI": "Wisconsin",
|
|
875
|
+
"WY": "Wyoming",
|
|
876
|
+
# Australia territories
|
|
877
|
+
"ACT": "AustraliaCapitalTerritory",
|
|
878
|
+
"NSW": "AustraliaNewSouthWales",
|
|
879
|
+
"NT": "AustraliaNorthernTerritory",
|
|
880
|
+
"QLD": "AustraliaQueensland",
|
|
881
|
+
"SA": "AustraliaSouthAustralia",
|
|
882
|
+
"TAS": "AustraliaTasmania",
|
|
883
|
+
"VIC": "AustraliaVictoria",
|
|
884
|
+
"WA_AU": "AustraliaWesternAustralia",
|
|
885
|
+
# Asian countries
|
|
886
|
+
"CN": "China",
|
|
887
|
+
"HK": "HongKong",
|
|
888
|
+
"IN": "India",
|
|
889
|
+
"IL": "Israel",
|
|
890
|
+
"JP": "Japan",
|
|
891
|
+
"KZ": "Kazakhstan",
|
|
892
|
+
"MY": "Malaysia",
|
|
893
|
+
"QA": "Qatar",
|
|
894
|
+
"SG": "Singapore",
|
|
895
|
+
"KR": "SouthKorea",
|
|
896
|
+
"TW": "Taiwan",
|
|
897
|
+
"TR": "Turkey",
|
|
898
|
+
"VN": "Vietnam",
|
|
899
|
+
# Other Oceania countries
|
|
900
|
+
"MH": "MarshallIslands",
|
|
901
|
+
"NZ": "NewZealand",
|
|
902
|
+
# African countries
|
|
903
|
+
"DZ": "Algeria",
|
|
904
|
+
"AO": "Angola",
|
|
905
|
+
"BJ": "Benin",
|
|
906
|
+
"CI": "IvoryCoast",
|
|
907
|
+
"KE": "Kenya",
|
|
908
|
+
"MG": "Madagascar",
|
|
909
|
+
"NG": "Nigeria",
|
|
910
|
+
"ST": "SaoTomeAndPrincipe",
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
# ---------------------------------------------------------------------
|
|
914
|
+
# 1. Create daily date range from start_date to today
|
|
915
|
+
# ---------------------------------------------------------------------
|
|
916
|
+
try:
|
|
917
|
+
start_dt = pd.to_datetime(start_date)
|
|
918
|
+
except ValueError:
|
|
919
|
+
raise ValueError(
|
|
920
|
+
f"Invalid start_date format: {start_date}. Use 'YYYY-MM-DD'",
|
|
921
|
+
)
|
|
922
|
+
|
|
923
|
+
end_dt = datetime.today()
|
|
924
|
+
# Ensure end date is not before start date
|
|
925
|
+
if end_dt < start_dt:
|
|
926
|
+
end_dt = start_dt + timedelta(days=1) # Or handle as error if preferred
|
|
927
|
+
|
|
928
|
+
date_range = pd.date_range(start=start_dt, end=end_dt, freq="D")
|
|
929
|
+
df_daily = pd.DataFrame(date_range, columns=["Date"])
|
|
930
|
+
|
|
931
|
+
# ---------------------------------------------------------------------
|
|
932
|
+
# 1.1 Identify "week_start" for each daily row, based on week_commencing
|
|
933
|
+
# ---------------------------------------------------------------------
|
|
934
|
+
start_day_int = day_dict[week_commencing]
|
|
935
|
+
df_daily["week_start"] = df_daily["Date"].apply(
|
|
936
|
+
lambda x: x - pd.Timedelta(days=(x.weekday() - start_day_int) % 7),
|
|
937
|
+
)
|
|
938
|
+
|
|
939
|
+
# ---------------------------------------------------------------------
|
|
940
|
+
# 1.2 Calculate ISO week number for each DAY (for later aggregation)
|
|
941
|
+
# Also calculate Year for each DAY to handle year transitions correctly
|
|
942
|
+
# ---------------------------------------------------------------------
|
|
943
|
+
df_daily["iso_week_daily"] = df_daily["Date"].dt.isocalendar().week.astype(int)
|
|
944
|
+
df_daily["iso_year_daily"] = df_daily["Date"].dt.isocalendar().year.astype(int)
|
|
945
|
+
|
|
946
|
+
# ---------------------------------------------------------------------
|
|
947
|
+
# 2. Build a weekly index (df_weekly_start) based on unique week_start dates
|
|
948
|
+
# ---------------------------------------------------------------------
|
|
949
|
+
df_weekly_start = (
|
|
950
|
+
df_daily[["week_start"]]
|
|
951
|
+
.drop_duplicates()
|
|
952
|
+
.sort_values("week_start")
|
|
953
|
+
.reset_index(drop=True)
|
|
954
|
+
)
|
|
955
|
+
df_weekly_start.rename(columns={"week_start": "Date"}, inplace=True)
|
|
956
|
+
df_weekly_start.set_index("Date", inplace=True)
|
|
957
|
+
|
|
958
|
+
# Create individual weekly dummies (optional, uncomment if needed)
|
|
959
|
+
dummy_columns = {}
|
|
960
|
+
for i, date_index in enumerate(df_weekly_start.index):
|
|
961
|
+
col_name = f"dum_{date_index.strftime('%Y_%m_%d')}"
|
|
962
|
+
dummy_columns[col_name] = [0] * len(df_weekly_start)
|
|
963
|
+
dummy_columns[col_name][i] = 1
|
|
964
|
+
df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
|
|
965
|
+
df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
|
|
966
|
+
|
|
967
|
+
# ---------------------------------------------------------------------
|
|
968
|
+
# 3. Public holidays (daily) using WorkCalendar
|
|
969
|
+
# ---------------------------------------------------------------------
|
|
970
|
+
start_year = start_dt.year
|
|
971
|
+
end_year = end_dt.year
|
|
972
|
+
years_range = range(start_year, end_year + 1)
|
|
973
|
+
|
|
974
|
+
# Dictionary to store holiday dummies for each country
|
|
975
|
+
country_holiday_dummies = {}
|
|
976
|
+
|
|
977
|
+
for country_code in countries:
|
|
978
|
+
# Skip if country code not found in holiday_country dictionary
|
|
979
|
+
if country_code not in holiday_country:
|
|
980
|
+
print(
|
|
981
|
+
f"Warning: Country code '{country_code}' not found in country code dictionary. Skipping.",
|
|
982
|
+
)
|
|
983
|
+
continue
|
|
984
|
+
|
|
985
|
+
country = holiday_country[country_code]
|
|
986
|
+
|
|
987
|
+
# Skip if country not found in continent lookup dictionary
|
|
988
|
+
if country not in COUNTRY_TO_CONTINENT:
|
|
989
|
+
print(
|
|
990
|
+
f"Warning: Country '{country}' not found in continent lookup dictionary. Skipping.",
|
|
991
|
+
)
|
|
992
|
+
continue
|
|
993
|
+
|
|
994
|
+
continent = COUNTRY_TO_CONTINENT[country]
|
|
995
|
+
module_path = f"workalendar.{continent}"
|
|
996
|
+
try:
|
|
997
|
+
module = importlib.import_module(module_path)
|
|
998
|
+
calendar_class = getattr(module, country)
|
|
999
|
+
cal = calendar_class()
|
|
1000
|
+
except (ImportError, AttributeError) as e:
|
|
1001
|
+
print(f"Error importing calendar for {country}: {e}. Skipping.")
|
|
1002
|
+
continue
|
|
1003
|
+
|
|
1004
|
+
# Collect holidays
|
|
1005
|
+
holidays_list = []
|
|
1006
|
+
for year in years_range:
|
|
1007
|
+
holidays_list.extend(cal.holidays(year))
|
|
1008
|
+
|
|
1009
|
+
holidays_df = pd.DataFrame(holidays_list, columns=["Date", "Holiday"])
|
|
1010
|
+
holidays_df["Date"] = pd.to_datetime(holidays_df["Date"])
|
|
1011
|
+
|
|
1012
|
+
# Filter out any holidays with "shift" or "substitute" in their name
|
|
1013
|
+
holidays_df = holidays_df[
|
|
1014
|
+
~(
|
|
1015
|
+
holidays_df["Holiday"].str.lower().str.contains("shift")
|
|
1016
|
+
| holidays_df["Holiday"].str.lower().str.contains("substitute")
|
|
1017
|
+
)
|
|
1018
|
+
]
|
|
1019
|
+
|
|
1020
|
+
# Filter by date range
|
|
1021
|
+
holidays_df = holidays_df[
|
|
1022
|
+
(holidays_df["Date"] >= start_dt) & (holidays_df["Date"] <= end_dt)
|
|
1023
|
+
]
|
|
1024
|
+
# ---------------------------------------------------------------------
|
|
1025
|
+
# 3.1 Additional Public Holidays for Canada due to poor API data
|
|
1026
|
+
# ---------------------------------------------------------------------
|
|
1027
|
+
if country_code == "CA":
|
|
1028
|
+
# Add Canada Day (July 1st) if not already in the list
|
|
1029
|
+
for year in years_range:
|
|
1030
|
+
canada_day = pd.Timestamp(f"{year}-07-01")
|
|
1031
|
+
if canada_day >= start_dt and canada_day <= end_dt:
|
|
1032
|
+
if not (
|
|
1033
|
+
(holidays_df["Date"] == canada_day)
|
|
1034
|
+
& (
|
|
1035
|
+
holidays_df["Holiday"]
|
|
1036
|
+
.str.lower()
|
|
1037
|
+
.str.contains("canada day")
|
|
1038
|
+
)
|
|
1039
|
+
).any():
|
|
1040
|
+
holidays_df = pd.concat(
|
|
1041
|
+
[
|
|
1042
|
+
holidays_df,
|
|
1043
|
+
pd.DataFrame(
|
|
1044
|
+
{
|
|
1045
|
+
"Date": [canada_day],
|
|
1046
|
+
"Holiday": ["Canada Day"],
|
|
1047
|
+
},
|
|
1048
|
+
),
|
|
1049
|
+
],
|
|
1050
|
+
ignore_index=True,
|
|
1051
|
+
)
|
|
1052
|
+
|
|
1053
|
+
# Add Labour Day (first Monday in September)
|
|
1054
|
+
for year in years_range:
|
|
1055
|
+
# Get first day of September
|
|
1056
|
+
first_day = pd.Timestamp(f"{year}-09-01")
|
|
1057
|
+
# Calculate days until first Monday (Monday is weekday 0)
|
|
1058
|
+
days_until_monday = (7 - first_day.weekday()) % 7
|
|
1059
|
+
if days_until_monday == 0: # If first day is already Monday
|
|
1060
|
+
labour_day = first_day
|
|
1061
|
+
else:
|
|
1062
|
+
labour_day = first_day + pd.Timedelta(days=days_until_monday)
|
|
1063
|
+
|
|
1064
|
+
if labour_day >= start_dt and labour_day <= end_dt:
|
|
1065
|
+
if not (
|
|
1066
|
+
(holidays_df["Date"] == labour_day)
|
|
1067
|
+
& (
|
|
1068
|
+
holidays_df["Holiday"]
|
|
1069
|
+
.str.lower()
|
|
1070
|
+
.str.contains("labour day")
|
|
1071
|
+
)
|
|
1072
|
+
).any():
|
|
1073
|
+
holidays_df = pd.concat(
|
|
1074
|
+
[
|
|
1075
|
+
holidays_df,
|
|
1076
|
+
pd.DataFrame(
|
|
1077
|
+
{
|
|
1078
|
+
"Date": [labour_day],
|
|
1079
|
+
"Holiday": ["Labour Day"],
|
|
1080
|
+
},
|
|
1081
|
+
),
|
|
1082
|
+
],
|
|
1083
|
+
ignore_index=True,
|
|
1084
|
+
)
|
|
1085
|
+
|
|
1086
|
+
# Add Thanksgiving (second Monday in October)
|
|
1087
|
+
for year in years_range:
|
|
1088
|
+
# Get first day of October
|
|
1089
|
+
first_day = pd.Timestamp(f"{year}-10-01")
|
|
1090
|
+
# Calculate days until first Monday
|
|
1091
|
+
days_until_monday = (7 - first_day.weekday()) % 7
|
|
1092
|
+
if days_until_monday == 0: # If first day is already Monday
|
|
1093
|
+
first_monday = first_day
|
|
1094
|
+
else:
|
|
1095
|
+
first_monday = first_day + pd.Timedelta(days=days_until_monday)
|
|
1096
|
+
|
|
1097
|
+
# Second Monday is 7 days after first Monday
|
|
1098
|
+
thanksgiving = first_monday + pd.Timedelta(days=7)
|
|
1099
|
+
|
|
1100
|
+
if thanksgiving >= start_dt and thanksgiving <= end_dt:
|
|
1101
|
+
if not (
|
|
1102
|
+
(holidays_df["Date"] == thanksgiving)
|
|
1103
|
+
& (
|
|
1104
|
+
holidays_df["Holiday"]
|
|
1105
|
+
.str.lower()
|
|
1106
|
+
.str.contains("thanksgiving")
|
|
1107
|
+
)
|
|
1108
|
+
).any():
|
|
1109
|
+
holidays_df = pd.concat(
|
|
1110
|
+
[
|
|
1111
|
+
holidays_df,
|
|
1112
|
+
pd.DataFrame(
|
|
1113
|
+
{
|
|
1114
|
+
"Date": [thanksgiving],
|
|
1115
|
+
"Holiday": ["Thanksgiving"],
|
|
1116
|
+
},
|
|
1117
|
+
),
|
|
1118
|
+
],
|
|
1119
|
+
ignore_index=True,
|
|
1120
|
+
)
|
|
1121
|
+
|
|
1122
|
+
# Now process the collected holidays and add to df_daily
|
|
1123
|
+
for _, row in holidays_df.iterrows():
|
|
1124
|
+
holiday_date = row["Date"]
|
|
1125
|
+
# Create column name without modifying original holiday names
|
|
1126
|
+
holiday_name = row["Holiday"].lower().replace(" ", "_")
|
|
1127
|
+
|
|
1128
|
+
# Remove "_shift" or "_substitute" if they appear as standalone suffixes
|
|
1129
|
+
if holiday_name.endswith("_shift"):
|
|
1130
|
+
holiday_name = holiday_name[:-6]
|
|
1131
|
+
elif holiday_name.endswith("_substitute"):
|
|
1132
|
+
holiday_name = holiday_name[:-11]
|
|
1133
|
+
|
|
1134
|
+
column_name = f"seas_{holiday_name}_{country_code.lower()}"
|
|
1135
|
+
|
|
1136
|
+
if column_name not in df_daily.columns:
|
|
1137
|
+
df_daily[column_name] = 0
|
|
1138
|
+
|
|
1139
|
+
# Mark the specific holiday date
|
|
1140
|
+
df_daily.loc[df_daily["Date"] == holiday_date, column_name] = 1
|
|
1141
|
+
|
|
1142
|
+
# Also mark a general holiday indicator for each country
|
|
1143
|
+
holiday_indicator = f"seas_holiday_{country_code.lower()}"
|
|
1144
|
+
if holiday_indicator not in df_daily.columns:
|
|
1145
|
+
df_daily[holiday_indicator] = 0
|
|
1146
|
+
df_daily.loc[df_daily["Date"] == holiday_date, holiday_indicator] = 1
|
|
1147
|
+
|
|
1148
|
+
# ---------------------------------------------------------------------
|
|
1149
|
+
# 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
|
|
1150
|
+
# ---------------------------------------------------------------------
|
|
1151
|
+
extra_cols = [
|
|
1152
|
+
"seas_valentines_day",
|
|
1153
|
+
"seas_halloween",
|
|
1154
|
+
"seas_fathers_day_us_uk", # Note: UK/US is 3rd Sun Jun, others vary
|
|
1155
|
+
"seas_mothers_day_us", # Note: US is 2nd Sun May
|
|
1156
|
+
"seas_mothers_day_uk", # Note: UK Mothering Sunday varies with Easter
|
|
1157
|
+
"seas_good_friday",
|
|
1158
|
+
"seas_easter_monday",
|
|
1159
|
+
"seas_black_friday", # US-centric, but globally adopted
|
|
1160
|
+
"seas_cyber_monday", # US-centric, but globally adopted
|
|
1161
|
+
]
|
|
1162
|
+
for c in extra_cols:
|
|
1163
|
+
if (
|
|
1164
|
+
c not in df_daily.columns
|
|
1165
|
+
): # Avoid overwriting if already created by holidays pkg
|
|
1166
|
+
df_daily[c] = 0
|
|
1167
|
+
|
|
1168
|
+
# Helper: nth_weekday_of_month(year, month, weekday, nth)
|
|
1169
|
+
def nth_weekday_of_month(year, month, weekday, nth):
|
|
1170
|
+
d = datetime(year, month, 1)
|
|
1171
|
+
w = d.weekday()
|
|
1172
|
+
delta = (weekday - w + 7) % 7 # Ensure positive delta
|
|
1173
|
+
first_weekday = d + timedelta(days=delta)
|
|
1174
|
+
target_date = first_weekday + timedelta(days=7 * (nth - 1))
|
|
1175
|
+
# Check if the calculated date is still in the same month
|
|
1176
|
+
if target_date.month == month:
|
|
1177
|
+
return target_date
|
|
1178
|
+
# This can happen if nth is too large (e.g., 5th Friday)
|
|
1179
|
+
# Return the last occurrence of that weekday in the month instead
|
|
1180
|
+
return target_date - timedelta(days=7)
|
|
1181
|
+
|
|
1182
|
+
def get_good_friday(year):
|
|
1183
|
+
return easter(year) - timedelta(days=2)
|
|
1184
|
+
|
|
1185
|
+
def get_easter_monday(year):
|
|
1186
|
+
return easter(year) + timedelta(days=1)
|
|
1187
|
+
|
|
1188
|
+
def get_black_friday(year):
|
|
1189
|
+
# US Thanksgiving is 4th Thursday in November (weekday=3)
|
|
1190
|
+
thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
|
|
1191
|
+
return thanksgiving + timedelta(days=1)
|
|
1192
|
+
|
|
1193
|
+
def get_cyber_monday(year):
|
|
1194
|
+
# Monday after US Thanksgiving
|
|
1195
|
+
thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
|
|
1196
|
+
return thanksgiving + timedelta(days=4)
|
|
1197
|
+
|
|
1198
|
+
def get_mothering_sunday_uk(year):
|
|
1199
|
+
# Fourth Sunday in Lent (3 weeks before Easter Sunday)
|
|
1200
|
+
# Lent starts on Ash Wednesday, 46 days before Easter.
|
|
1201
|
+
# Easter Sunday is day 0. Sunday before is -7, etc.
|
|
1202
|
+
# 4th Sunday in Lent is 3 weeks before Easter.
|
|
1203
|
+
return easter(year) - timedelta(days=21)
|
|
1204
|
+
|
|
1205
|
+
# Loop over each year in range
|
|
1206
|
+
for yr in range(start_year, end_year + 1):
|
|
1207
|
+
try: # Wrap calculations in try-except for robustness
|
|
1208
|
+
# Valentines = Feb 14
|
|
1209
|
+
valentines_day = datetime(yr, 2, 14)
|
|
1210
|
+
# Halloween = Oct 31
|
|
1211
|
+
halloween_day = datetime(yr, 10, 31)
|
|
1212
|
+
# Father's Day (US & UK) = 3rd Sunday (6) in June
|
|
1213
|
+
fathers_day = nth_weekday_of_month(yr, 6, 6, 3)
|
|
1214
|
+
# Mother's Day US = 2nd Sunday (6) in May
|
|
1215
|
+
mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
|
|
1216
|
+
# Mother's Day UK (Mothering Sunday)
|
|
1217
|
+
mothering_sunday = get_mothering_sunday_uk(yr)
|
|
1218
|
+
|
|
1219
|
+
# Good Friday, Easter Monday
|
|
1220
|
+
gf = get_good_friday(yr)
|
|
1221
|
+
em = get_easter_monday(yr)
|
|
1222
|
+
|
|
1223
|
+
# Black Friday, Cyber Monday
|
|
1224
|
+
bf = get_black_friday(yr)
|
|
1225
|
+
cm = get_cyber_monday(yr)
|
|
1226
|
+
|
|
1227
|
+
# Mark them in df_daily if in range
|
|
1228
|
+
special_days_map = [
|
|
1229
|
+
(valentines_day, "seas_valentines_day"),
|
|
1230
|
+
(halloween_day, "seas_halloween"),
|
|
1231
|
+
(fathers_day, "seas_fathers_day_us_uk"),
|
|
1232
|
+
(mothers_day_us, "seas_mothers_day_us"),
|
|
1233
|
+
(mothering_sunday, "seas_mothers_day_uk"),
|
|
1234
|
+
(gf, "seas_good_friday"),
|
|
1235
|
+
(em, "seas_easter_monday"),
|
|
1236
|
+
(bf, "seas_black_friday"),
|
|
1237
|
+
(cm, "seas_cyber_monday"),
|
|
1238
|
+
]
|
|
1239
|
+
|
|
1240
|
+
for special_date, col in special_days_map:
|
|
1241
|
+
if (
|
|
1242
|
+
special_date is not None
|
|
1243
|
+
): # nth_weekday_of_month can return None edge cases
|
|
1244
|
+
special_ts = pd.Timestamp(special_date)
|
|
1245
|
+
# Only set if it's within the daily range AND column exists
|
|
1246
|
+
if (
|
|
1247
|
+
(special_ts >= df_daily["Date"].min())
|
|
1248
|
+
and (special_ts <= df_daily["Date"].max())
|
|
1249
|
+
and (col in df_daily.columns)
|
|
1250
|
+
):
|
|
1251
|
+
df_daily.loc[df_daily["Date"] == special_ts, col] = 1
|
|
1252
|
+
except Exception as e:
|
|
1253
|
+
print(f"Warning: Could not calculate special days for year {yr}: {e}")
|
|
1254
|
+
|
|
1255
|
+
# ---------------------------------------------------------------------
|
|
1256
|
+
# 4. Add daily indicators for last day & last Friday of month & payday
|
|
1257
|
+
# ---------------------------------------------------------------------
|
|
1258
|
+
df_daily["is_last_day_of_month"] = df_daily["Date"].dt.is_month_end
|
|
1259
|
+
|
|
1260
|
+
def is_last_friday(date):
|
|
1261
|
+
# Check if it's a Friday first
|
|
1262
|
+
if date.weekday() != 4: # Friday is 4
|
|
1263
|
+
return 0
|
|
1264
|
+
# Check if next Friday is in the next month
|
|
1265
|
+
next_friday = date + timedelta(days=7)
|
|
1266
|
+
return 1 if next_friday.month != date.month else 0
|
|
1267
|
+
|
|
1268
|
+
def is_payday(date):
|
|
1269
|
+
return 1 if date.day >= 25 else 0
|
|
1270
|
+
|
|
1271
|
+
df_daily["is_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
|
|
1272
|
+
|
|
1273
|
+
df_daily["is_payday"] = df_daily["Date"].apply(is_payday)
|
|
1274
|
+
|
|
1275
|
+
# Rename for clarity prefix
|
|
1276
|
+
df_daily.rename(
|
|
1277
|
+
columns={
|
|
1278
|
+
"is_last_day_of_month": "seas_last_day_of_month",
|
|
1279
|
+
"is_last_friday_of_month": "seas_last_friday_of_month",
|
|
1280
|
+
"is_payday": "seas_payday",
|
|
1281
|
+
},
|
|
1282
|
+
inplace=True,
|
|
1283
|
+
)
|
|
1284
|
+
|
|
1285
|
+
# ---------------------------------------------------------------------
|
|
1286
|
+
# 5. Weekly aggregation
|
|
1287
|
+
# ---------------------------------------------------------------------
|
|
1288
|
+
|
|
1289
|
+
# Select only columns that are indicators/flags (intended for max aggregation)
|
|
1290
|
+
flag_cols = [
|
|
1291
|
+
col
|
|
1292
|
+
for col in df_daily.columns
|
|
1293
|
+
if (col.startswith("seas_") or col.startswith("is_"))
|
|
1294
|
+
and col != "seas_payday"
|
|
1295
|
+
]
|
|
1296
|
+
# Ensure 'week_start' is present for grouping
|
|
1297
|
+
df_to_agg = df_daily[["week_start"] + flag_cols]
|
|
1298
|
+
|
|
1299
|
+
df_weekly_flags = (
|
|
1300
|
+
df_to_agg.groupby("week_start")
|
|
1301
|
+
.max() # if any day=1 in that week, entire week=1
|
|
1302
|
+
.reset_index()
|
|
1303
|
+
.rename(columns={"week_start": "Date"})
|
|
1304
|
+
.set_index("Date")
|
|
1305
|
+
)
|
|
1306
|
+
|
|
1307
|
+
# Do specific aggregation for payday
|
|
1308
|
+
# Make sure 'date' column exists in df_daily
|
|
1309
|
+
df_daily["month"] = df_daily["Date"].dt.month
|
|
1310
|
+
df_daily["year"] = df_daily["Date"].dt.year
|
|
1311
|
+
|
|
1312
|
+
# Sum of seas_payday flags per week
|
|
1313
|
+
week_payday_sum = df_daily.groupby("week_start")["seas_payday"].sum()
|
|
1314
|
+
|
|
1315
|
+
# Divide the number of payday flags by number of paydays per month
|
|
1316
|
+
payday_days_in_month = df_daily.groupby(["year", "month"])["seas_payday"].sum()
|
|
1317
|
+
week_month = df_daily.groupby("week_start").first()[["month", "year"]]
|
|
1318
|
+
week_days_in_month = week_month.apply(
|
|
1319
|
+
lambda row: payday_days_in_month.loc[(row["year"], row["month"])],
|
|
1320
|
+
axis=1,
|
|
1321
|
+
)
|
|
1322
|
+
df_weekly_flags["seas_payday"] = (
|
|
1323
|
+
(week_payday_sum / week_days_in_month).fillna(0).values
|
|
1324
|
+
)
|
|
1325
|
+
|
|
1326
|
+
# # Drop intermediate columns
|
|
1327
|
+
# df_weekly_flags = df_weekly_flags.drop(columns=["month", "year"])
|
|
1328
|
+
|
|
1329
|
+
# --- Aggregate Week Number using MODE ---
|
|
1330
|
+
# Define aggregation function for mode (handling potential multi-modal cases by taking the first)
|
|
1331
|
+
def get_mode(x):
|
|
1332
|
+
modes = pd.Series.mode(x)
|
|
1333
|
+
return modes[0] if not modes.empty else np.nan # Return first mode or NaN
|
|
1334
|
+
|
|
1335
|
+
df_weekly_iso_week_year = (
|
|
1336
|
+
df_daily[["week_start", "iso_week_daily", "iso_year_daily"]]
|
|
1337
|
+
.groupby("week_start")
|
|
1338
|
+
.agg(
|
|
1339
|
+
# Find the most frequent week number and year within the group
|
|
1340
|
+
Week=("iso_week_daily", get_mode),
|
|
1341
|
+
Year=("iso_year_daily", get_mode),
|
|
1342
|
+
)
|
|
1343
|
+
.reset_index()
|
|
1344
|
+
.rename(columns={"week_start": "Date"})
|
|
1345
|
+
.set_index("Date")
|
|
1346
|
+
)
|
|
1347
|
+
# Convert Week/Year back to integer type after aggregation
|
|
1348
|
+
df_weekly_iso_week_year["Week"] = df_weekly_iso_week_year["Week"].astype(int)
|
|
1349
|
+
df_weekly_iso_week_year["Year"] = df_weekly_iso_week_year["Year"].astype(int)
|
|
1350
|
+
|
|
1351
|
+
# --- Monthly dummies (spread evenly across week) ---
|
|
1352
|
+
df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
|
|
1353
|
+
df_monthly_dummies_daily = pd.get_dummies(
|
|
1354
|
+
df_daily[["week_start", "Month"]], # Only need these columns
|
|
1355
|
+
prefix="seas_month",
|
|
1356
|
+
columns=["Month"],
|
|
1357
|
+
dtype=float, # Use float for division
|
|
1358
|
+
)
|
|
1359
|
+
# Sum daily dummies within the week
|
|
1360
|
+
df_monthly_dummies_summed = df_monthly_dummies_daily.groupby("week_start").sum()
|
|
1361
|
+
# Divide by number of days in that specific week group (usually 7, except potentially start/end)
|
|
1362
|
+
days_in_week = df_daily.groupby("week_start").size()
|
|
1363
|
+
df_weekly_monthly_dummies = df_monthly_dummies_summed.div(days_in_week, axis=0)
|
|
1364
|
+
|
|
1365
|
+
# Reset index to merge
|
|
1366
|
+
df_weekly_monthly_dummies.reset_index(inplace=True)
|
|
1367
|
+
df_weekly_monthly_dummies.rename(columns={"week_start": "Date"}, inplace=True)
|
|
1368
|
+
df_weekly_monthly_dummies.set_index("Date", inplace=True)
|
|
1369
|
+
|
|
1370
|
+
# ---------------------------------------------------------------------
|
|
1371
|
+
# 6. Combine all weekly components
|
|
1372
|
+
# ---------------------------------------------------------------------
|
|
1373
|
+
# Start with the basic weekly index
|
|
1374
|
+
df_combined = df_weekly_start.copy()
|
|
1375
|
+
|
|
1376
|
+
# Join the other aggregated DataFrames
|
|
1377
|
+
df_combined = df_combined.join(df_weekly_flags, how="left")
|
|
1378
|
+
df_combined = df_combined.join(df_weekly_iso_week_year, how="left")
|
|
1379
|
+
df_combined = df_combined.join(df_weekly_monthly_dummies, how="left")
|
|
1380
|
+
|
|
1381
|
+
# Fill potential NaNs created by joins (e.g., if a flag column didn't exist) with 0
|
|
1382
|
+
# Exclude 'Week' and 'Year' which should always be present
|
|
1383
|
+
cols_to_fill = df_combined.columns.difference(["Week", "Year"])
|
|
1384
|
+
df_combined[cols_to_fill] = df_combined[cols_to_fill].fillna(0)
|
|
1385
|
+
|
|
1386
|
+
# Ensure correct types for flag columns (int)
|
|
1387
|
+
for col in df_weekly_flags.columns:
|
|
1388
|
+
if col != "seas_payday":
|
|
1389
|
+
if col in df_combined.columns:
|
|
1390
|
+
df_combined[col] = df_combined[col].astype(int)
|
|
1391
|
+
|
|
1392
|
+
# Ensure correct types for month columns (float)
|
|
1393
|
+
for col in df_weekly_monthly_dummies.columns:
|
|
1394
|
+
if col in df_combined.columns:
|
|
1395
|
+
df_combined[col] = df_combined[col].astype(float)
|
|
1396
|
+
|
|
1397
|
+
# ---------------------------------------------------------------------
|
|
1398
|
+
# 7. Create weekly dummies for Week of Year & yearly dummies from aggregated cols
|
|
1399
|
+
# ---------------------------------------------------------------------
|
|
1400
|
+
df_combined.reset_index(inplace=True) # 'Date', 'Week', 'Year' become columns
|
|
1401
|
+
|
|
1402
|
+
# Create dummies from the aggregated 'Week' column
|
|
1403
|
+
df_combined = pd.get_dummies(
|
|
1404
|
+
df_combined,
|
|
1405
|
+
prefix="seas",
|
|
1406
|
+
columns=["Week"],
|
|
1407
|
+
dtype=int,
|
|
1408
|
+
prefix_sep="_",
|
|
1409
|
+
)
|
|
1410
|
+
|
|
1411
|
+
# Create dummies from the aggregated 'Year' column
|
|
1412
|
+
df_combined = pd.get_dummies(
|
|
1413
|
+
df_combined,
|
|
1414
|
+
prefix="seas",
|
|
1415
|
+
columns=["Year"],
|
|
1416
|
+
dtype=int,
|
|
1417
|
+
prefix_sep="_",
|
|
1418
|
+
)
|
|
1419
|
+
|
|
1420
|
+
# ---------------------------------------------------------------------
|
|
1421
|
+
# 8. Add constant & trend
|
|
1422
|
+
# ---------------------------------------------------------------------
|
|
1423
|
+
df_combined["Constant"] = 1
|
|
1424
|
+
df_combined.reset_index(
|
|
1425
|
+
drop=True,
|
|
1426
|
+
inplace=True,
|
|
1427
|
+
) # Ensure index is 0, 1, 2... for trend
|
|
1428
|
+
df_combined["Trend"] = df_combined.index + 1
|
|
1429
|
+
|
|
1430
|
+
# ---------------------------------------------------------------------
|
|
1431
|
+
# 9. Rename Date -> OBS and select final columns
|
|
1432
|
+
# ---------------------------------------------------------------------
|
|
1433
|
+
df_combined.rename(columns={"Date": "OBS"}, inplace=True)
|
|
1434
|
+
|
|
1435
|
+
# Reorder columns - OBS first, then Constant, Trend, then seasonal features
|
|
1436
|
+
cols_order = (
|
|
1437
|
+
["OBS", "Constant", "Trend"]
|
|
1438
|
+
+ sorted([col for col in df_combined.columns if col.startswith("seas_")])
|
|
1439
|
+
+ sorted([col for col in df_combined.columns if col.startswith("dum_")])
|
|
1440
|
+
) # If individual week dummies were enabled
|
|
1441
|
+
|
|
1442
|
+
# Filter out columns not in the desired order list (handles case where dum_ cols are off)
|
|
1443
|
+
final_cols = [col for col in cols_order if col in df_combined.columns]
|
|
1444
|
+
df_combined = df_combined[final_cols]
|
|
1445
|
+
|
|
1446
|
+
return df_combined
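# Illustrative sketch (not part of the routine above) of how the monthly dummies
# are spread across a week: a week commencing Mon 2024-01-29 holds 3 January days
# and 4 February days, so its weekly row gets seas_month_january = 3/7 and
# seas_month_february = 4/7.
#
#   days = pd.DataFrame({"week_start": ["2024-01-29"] * 7,
#                        "Month": ["january"] * 3 + ["february"] * 4})
#   dummies = pd.get_dummies(days, prefix="seas_month", columns=["Month"], dtype=float)
#   weekly = dummies.groupby("week_start").sum().div(days.groupby("week_start").size(), axis=0)
#   # -> seas_month_january == 3/7, seas_month_february == 4/7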
|
|
1447
|
+
|
|
1448
|
+
def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
|
|
1449
|
+
"""
|
|
1450
|
+
Pull weather data for a given week-commencing day and one or more country codes.
|
|
1452
|
+
LOGIC:
|
|
1453
|
+
1) For non-US countries (AU, GB, DE, CA, ZA):
|
|
1454
|
+
- Mesonet => max_temp_f, min_temp_f -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', etc.
|
|
1455
|
+
- Open-Meteo => precipitation_sum => 'avg_rain_sum', snowfall_sum => 'avg_snow_sum'.
|
|
1456
|
+
- Merge, then rename columns with prefix 'seas_{country}_'.
|
|
1457
|
+
|
|
1458
|
+
2) For the US:
|
|
1459
|
+
- We have multiple <STATE>_ASOS networks (e.g. CA_ASOS, TX_ASOS).
|
|
1460
|
+
- For each state, fetch from Mesonet => max_temp_f, min_temp_f, precip_in, snow_in -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', 'avg_rain_sum', 'avg_snow_sum', etc.
|
|
1461
|
+
- Rename columns for each state with prefix 'seas_us_{state}_'.
|
|
1462
|
+
- Merge all states (and countries) into a single DataFrame.
|
|
1463
|
+
|
|
1464
|
+
:param week_commencing: A string in {"mon","tue","wed","thu","fri","sat","sun"}.
:param start_date: Start of the data pull as a "YYYY-MM-DD" string, e.g. "2020-01-01".
|
|
1465
|
+
:param country_codes: A list of 2-letter country codes or a single string, e.g. ["GB","US"].
|
|
1466
|
+
:return: A single Pandas DataFrame with weekly-aggregated data for all requested countries.
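
Example (illustrative sketch; assumes the class is instantiated directly):
    dp = datapull()
    weather_df = dp.pull_weather("mon", "2020-01-01", ["GB", "US"])
    # -> weekly rows keyed on 'OBS', with columns such as
    #    'seas_gb_avg_mean_temp_c' and 'seas_us_ca_avg_rain_sum'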
|
|
1467
|
+
"""
|
|
1468
|
+
# ------------------------------------------------------------------ #
|
|
1469
|
+
# 0) Handle either a single code or list of codes
|
|
1470
|
+
# ------------------------------------------------------------------ #
|
|
1471
|
+
if isinstance(country_codes, str):
|
|
1472
|
+
country_codes = [country_codes]
|
|
1473
|
+
elif not isinstance(country_codes, (list, tuple)):
|
|
1474
|
+
raise ValueError("country_codes must be a list/tuple or a single string.")
|
|
1475
|
+
|
|
1476
|
+
# --- Setup / Constants --- #
|
|
1477
|
+
day_dict = {
|
|
1478
|
+
"mon": 0,
|
|
1479
|
+
"tue": 1,
|
|
1480
|
+
"wed": 2,
|
|
1481
|
+
"thu": 3,
|
|
1482
|
+
"fri": 4,
|
|
1483
|
+
"sat": 5,
|
|
1484
|
+
"sun": 6,
|
|
1485
|
+
}
|
|
1486
|
+
# Map each 2-letter code to a key
|
|
1487
|
+
country_dict = {
|
|
1488
|
+
"US": "US_STATES",
|
|
1489
|
+
"CA": "Canada",
|
|
1490
|
+
"AU": "AU__ASOS",
|
|
1491
|
+
"GB": "GB__ASOS",
|
|
1492
|
+
"DE": "DE__ASOS",
|
|
1493
|
+
"ZA": "ZA__ASOS",
|
|
1494
|
+
}
|
|
1495
|
+
|
|
1496
|
+
# Station-based countries for Mesonet
|
|
1497
|
+
station_map = {
|
|
1498
|
+
"GB__ASOS": [
|
|
1499
|
+
"&stations=EGCC",
|
|
1500
|
+
"&stations=EGNM",
|
|
1501
|
+
"&stations=EGBB",
|
|
1502
|
+
"&stations=EGSH",
|
|
1503
|
+
"&stations=EGFF",
|
|
1504
|
+
"&stations=EGHI",
|
|
1505
|
+
"&stations=EGLC",
|
|
1506
|
+
"&stations=EGHQ",
|
|
1507
|
+
"&stations=EGAC",
|
|
1508
|
+
"&stations=EGPF",
|
|
1509
|
+
"&stations=EGGD",
|
|
1510
|
+
"&stations=EGPE",
|
|
1511
|
+
"&stations=EGNT",
|
|
1512
|
+
],
|
|
1513
|
+
"AU__ASOS": [
|
|
1514
|
+
"&stations=YPDN",
|
|
1515
|
+
"&stations=YBCS",
|
|
1516
|
+
"&stations=YBBN",
|
|
1517
|
+
"&stations=YSSY",
|
|
1519
|
+
"&stations=YMEN",
|
|
1520
|
+
"&stations=YPAD",
|
|
1521
|
+
"&stations=YPPH",
|
|
1522
|
+
],
|
|
1523
|
+
"DE__ASOS": [
|
|
1524
|
+
"&stations=EDDL",
|
|
1525
|
+
"&stations=EDDH",
|
|
1526
|
+
"&stations=EDDB",
|
|
1527
|
+
"&stations=EDDN",
|
|
1528
|
+
"&stations=EDDF",
|
|
1529
|
+
"&stations=EDDK",
|
|
1530
|
+
"&stations=EDLW",
|
|
1531
|
+
"&stations=EDDM",
|
|
1532
|
+
],
|
|
1533
|
+
# Example: if ZA is also station-based, add it here.
|
|
1534
|
+
"ZA__ASOS": [
|
|
1535
|
+
# If you know the station codes, add them here:
|
|
1536
|
+
# e.g. "&stations=FACT", "&stations=FAJS", ...
|
|
1537
|
+
],
|
|
1538
|
+
# "FR__ASOS" if you need France, etc.
|
|
1539
|
+
}
|
|
1540
|
+
|
|
1541
|
+
# Non-US countries that also fetch RAIN & SNOW from Open-Meteo
|
|
1542
|
+
rainfall_city_map = {
|
|
1543
|
+
"GB__ASOS": [
|
|
1544
|
+
"Manchester",
|
|
1545
|
+
"Leeds",
|
|
1546
|
+
"Birmingham",
|
|
1547
|
+
"London",
|
|
1548
|
+
"Glasgow",
|
|
1549
|
+
],
|
|
1550
|
+
"AU__ASOS": [
|
|
1551
|
+
"Darwin",
|
|
1552
|
+
"Cairns",
|
|
1553
|
+
"Brisbane",
|
|
1554
|
+
"Sydney",
|
|
1555
|
+
"Melbourne",
|
|
1556
|
+
"Adelaide",
|
|
1557
|
+
"Perth",
|
|
1558
|
+
],
|
|
1559
|
+
"DE__ASOS": [
|
|
1560
|
+
"Dortmund",
|
|
1561
|
+
"Düsseldorf",
|
|
1562
|
+
"Frankfurt",
|
|
1563
|
+
"Munich",
|
|
1564
|
+
"Cologne",
|
|
1565
|
+
"Berlin",
|
|
1566
|
+
"Hamburg",
|
|
1567
|
+
"Nuernberg",
|
|
1568
|
+
],
|
|
1569
|
+
"ZA__ASOS": ["Johannesburg", "Cape Town", "Durban", "Pretoria"],
|
|
1570
|
+
}
|
|
1571
|
+
|
|
1572
|
+
# Canada sub-networks
|
|
1573
|
+
institute_vector = [
|
|
1574
|
+
"CA_NB_ASOS",
|
|
1575
|
+
"CA_NF_ASOS",
|
|
1576
|
+
"CA_NT_ASOS",
|
|
1577
|
+
"CA_NS_ASOS",
|
|
1578
|
+
"CA_NU_ASOS",
|
|
1579
|
+
]
|
|
1580
|
+
stations_list_canada = [
|
|
1581
|
+
[
|
|
1582
|
+
"&stations=CYQM",
|
|
1583
|
+
"&stations=CERM",
|
|
1584
|
+
"&stations=CZCR",
|
|
1585
|
+
"&stations=CZBF",
|
|
1586
|
+
"&stations=CYFC",
|
|
1587
|
+
"&stations=CYCX",
|
|
1588
|
+
],
|
|
1589
|
+
[
|
|
1590
|
+
"&stations=CWZZ",
|
|
1591
|
+
"&stations=CYDP",
|
|
1592
|
+
"&stations=CYMH",
|
|
1593
|
+
"&stations=CYAY",
|
|
1594
|
+
"&stations=CWDO",
|
|
1595
|
+
"&stations=CXTP",
|
|
1596
|
+
"&stations=CYJT",
|
|
1597
|
+
"&stations=CYYR",
|
|
1598
|
+
"&stations=CZUM",
|
|
1599
|
+
"&stations=CYWK",
|
|
1601
|
+
],
|
|
1602
|
+
[
|
|
1603
|
+
"&stations=CYHI",
|
|
1604
|
+
"&stations=CZCP",
|
|
1605
|
+
"&stations=CWLI",
|
|
1606
|
+
"&stations=CWND",
|
|
1607
|
+
"&stations=CXTV",
|
|
1608
|
+
"&stations=CYVL",
|
|
1609
|
+
"&stations=CYCO",
|
|
1610
|
+
"&stations=CXDE",
|
|
1611
|
+
"&stations=CYWE",
|
|
1612
|
+
"&stations=CYLK",
|
|
1613
|
+
"&stations=CWID",
|
|
1614
|
+
"&stations=CYRF",
|
|
1615
|
+
"&stations=CXYH",
|
|
1616
|
+
"&stations=CYWY",
|
|
1617
|
+
"&stations=CWMT",
|
|
1618
|
+
],
|
|
1619
|
+
[
|
|
1620
|
+
"&stations=CWEF",
|
|
1621
|
+
"&stations=CXIB",
|
|
1622
|
+
"&stations=CYQY",
|
|
1623
|
+
"&stations=CYPD",
|
|
1624
|
+
"&stations=CXNP",
|
|
1625
|
+
"&stations=CXMY",
|
|
1626
|
+
"&stations=CYAW",
|
|
1627
|
+
"&stations=CWKG",
|
|
1628
|
+
"&stations=CWVU",
|
|
1629
|
+
"&stations=CXLB",
|
|
1630
|
+
"&stations=CWSA",
|
|
1631
|
+
"&stations=CWRN",
|
|
1632
|
+
],
|
|
1633
|
+
[
|
|
1634
|
+
"&stations=CYLT",
|
|
1635
|
+
"&stations=CWEU",
|
|
1636
|
+
"&stations=CWGZ",
|
|
1637
|
+
"&stations=CYIO",
|
|
1638
|
+
"&stations=CXSE",
|
|
1639
|
+
"&stations=CYCB",
|
|
1640
|
+
"&stations=CWIL",
|
|
1641
|
+
"&stations=CXWB",
|
|
1642
|
+
"&stations=CYZS",
|
|
1643
|
+
"&stations=CWJC",
|
|
1644
|
+
"&stations=CYFB",
|
|
1645
|
+
"&stations=CWUW",
|
|
1646
|
+
],
|
|
1647
|
+
]
|
|
1648
|
+
|
|
1649
|
+
# US states and stations - each sub-network
|
|
1650
|
+
us_state_networks = {
|
|
1651
|
+
state: f"{state}_ASOS"
|
|
1652
|
+
for state in [
|
|
1653
|
+
"AL",
|
|
1654
|
+
"AR",
|
|
1655
|
+
"AZ",
|
|
1656
|
+
"CA",
|
|
1657
|
+
"CO",
|
|
1658
|
+
"CT",
|
|
1659
|
+
"DE",
|
|
1660
|
+
"FL",
|
|
1661
|
+
"GA",
|
|
1662
|
+
"IA",
|
|
1663
|
+
"ID",
|
|
1664
|
+
"IL",
|
|
1665
|
+
"IN",
|
|
1666
|
+
"KS",
|
|
1667
|
+
"KY",
|
|
1668
|
+
"LA",
|
|
1669
|
+
"MA",
|
|
1670
|
+
"MD",
|
|
1671
|
+
"ME",
|
|
1672
|
+
"MI",
|
|
1673
|
+
"MN",
|
|
1674
|
+
"MO",
|
|
1675
|
+
"MS",
|
|
1676
|
+
"MT",
|
|
1677
|
+
"NC",
|
|
1678
|
+
"ND",
|
|
1679
|
+
"NE",
|
|
1680
|
+
"NH",
|
|
1681
|
+
"NJ",
|
|
1682
|
+
"NM",
|
|
1683
|
+
"NV",
|
|
1684
|
+
"NY",
|
|
1685
|
+
"OH",
|
|
1686
|
+
"OK",
|
|
1687
|
+
"OR",
|
|
1688
|
+
"PA",
|
|
1689
|
+
"RI",
|
|
1690
|
+
"SC",
|
|
1691
|
+
"SD",
|
|
1692
|
+
"TN",
|
|
1693
|
+
"TX",
|
|
1694
|
+
"UT",
|
|
1695
|
+
"VA",
|
|
1696
|
+
"VT",
|
|
1697
|
+
"WA",
|
|
1698
|
+
"WI",
|
|
1699
|
+
"WV",
|
|
1700
|
+
"WY",
|
|
1701
|
+
]
|
|
1702
|
+
}
|
|
1703
|
+
|
|
1704
|
+
us_stations_map = {
|
|
1705
|
+
"AL_ASOS": [
|
|
1706
|
+
"&stations=BHM",
|
|
1707
|
+
"&stations=HSV",
|
|
1708
|
+
"&stations=MGM",
|
|
1709
|
+
"&stations=MOB",
|
|
1710
|
+
"&stations=TCL",
|
|
1711
|
+
],
|
|
1712
|
+
"AR_ASOS": [
|
|
1713
|
+
"&stations=LIT",
|
|
1714
|
+
"&stations=FSM",
|
|
1715
|
+
"&stations=TXK",
|
|
1716
|
+
"&stations=HOT",
|
|
1717
|
+
"&stations=FYV",
|
|
1718
|
+
],
|
|
1719
|
+
"AZ_ASOS": [
|
|
1720
|
+
"&stations=PHX",
|
|
1721
|
+
"&stations=TUS",
|
|
1722
|
+
"&stations=FLG",
|
|
1723
|
+
"&stations=YUM",
|
|
1724
|
+
"&stations=PRC",
|
|
1725
|
+
],
|
|
1726
|
+
"CA_ASOS": [
|
|
1727
|
+
"&stations=LAX",
|
|
1728
|
+
"&stations=SAN",
|
|
1729
|
+
"&stations=SJC",
|
|
1730
|
+
"&stations=SFO",
|
|
1731
|
+
"&stations=FAT",
|
|
1732
|
+
],
|
|
1733
|
+
"CO_ASOS": [
|
|
1734
|
+
"&stations=DEN",
|
|
1735
|
+
"&stations=COS",
|
|
1736
|
+
"&stations=GJT",
|
|
1737
|
+
"&stations=PUB",
|
|
1738
|
+
"&stations=ASE",
|
|
1739
|
+
],
|
|
1740
|
+
"CT_ASOS": [
|
|
1741
|
+
"&stations=BDL",
|
|
1742
|
+
"&stations=HVN",
|
|
1743
|
+
"&stations=BDR",
|
|
1744
|
+
"&stations=GON",
|
|
1745
|
+
"&stations=HFD",
|
|
1746
|
+
],
|
|
1747
|
+
"DE_ASOS": ["&stations=ILG", "&stations=GED", "&stations=DOV"],
|
|
1748
|
+
"FL_ASOS": [
|
|
1749
|
+
"&stations=MIA",
|
|
1750
|
+
"&stations=TPA",
|
|
1751
|
+
"&stations=ORL",
|
|
1752
|
+
"&stations=JAX",
|
|
1753
|
+
"&stations=TLH",
|
|
1754
|
+
],
|
|
1755
|
+
"GA_ASOS": [
|
|
1756
|
+
"&stations=ATL",
|
|
1757
|
+
"&stations=SAV",
|
|
1758
|
+
"&stations=CSG",
|
|
1759
|
+
"&stations=MCN",
|
|
1760
|
+
"&stations=AGS",
|
|
1761
|
+
],
|
|
1762
|
+
"IA_ASOS": [
|
|
1763
|
+
"&stations=DSM",
|
|
1764
|
+
"&stations=CID",
|
|
1765
|
+
"&stations=DBQ",
|
|
1766
|
+
"&stations=ALO",
|
|
1767
|
+
"&stations=SUX",
|
|
1768
|
+
],
|
|
1769
|
+
"ID_ASOS": [
|
|
1770
|
+
"&stations=BOI",
|
|
1771
|
+
"&stations=IDA",
|
|
1772
|
+
"&stations=PIH",
|
|
1773
|
+
"&stations=SUN",
|
|
1774
|
+
"&stations=COE",
|
|
1775
|
+
],
|
|
1776
|
+
"IL_ASOS": [
|
|
1777
|
+
"&stations=ORD",
|
|
1778
|
+
"&stations=MDW",
|
|
1779
|
+
"&stations=PIA",
|
|
1780
|
+
"&stations=SPI",
|
|
1781
|
+
"&stations=MLI",
|
|
1782
|
+
],
|
|
1783
|
+
"IN_ASOS": [
|
|
1784
|
+
"&stations=IND",
|
|
1785
|
+
"&stations=FWA",
|
|
1786
|
+
"&stations=SBN",
|
|
1787
|
+
"&stations=EVV",
|
|
1788
|
+
"&stations=HUF",
|
|
1789
|
+
],
|
|
1790
|
+
"KS_ASOS": [
|
|
1791
|
+
"&stations=ICT",
|
|
1792
|
+
"&stations=FOE",
|
|
1793
|
+
"&stations=GCK",
|
|
1794
|
+
"&stations=HYS",
|
|
1795
|
+
"&stations=SLN",
|
|
1796
|
+
],
|
|
1797
|
+
"KY_ASOS": [
|
|
1798
|
+
"&stations=SDF",
|
|
1799
|
+
"&stations=LEX",
|
|
1800
|
+
"&stations=CVG",
|
|
1801
|
+
"&stations=PAH",
|
|
1802
|
+
"&stations=BWG",
|
|
1803
|
+
],
|
|
1804
|
+
"LA_ASOS": [
|
|
1805
|
+
"&stations=MSY",
|
|
1806
|
+
"&stations=SHV",
|
|
1807
|
+
"&stations=LFT",
|
|
1808
|
+
"&stations=BTR",
|
|
1809
|
+
"&stations=MLU",
|
|
1810
|
+
],
|
|
1811
|
+
"MA_ASOS": [
|
|
1812
|
+
"&stations=BOS",
|
|
1813
|
+
"&stations=ORH",
|
|
1814
|
+
"&stations=HYA",
|
|
1815
|
+
"&stations=ACK",
|
|
1816
|
+
"&stations=BED",
|
|
1817
|
+
],
|
|
1818
|
+
"MD_ASOS": [
|
|
1819
|
+
"&stations=BWI",
|
|
1820
|
+
"&stations=MTN",
|
|
1821
|
+
"&stations=SBY",
|
|
1822
|
+
"&stations=HGR",
|
|
1823
|
+
"&stations=ADW",
|
|
1824
|
+
],
|
|
1825
|
+
"ME_ASOS": [
|
|
1826
|
+
"&stations=PWM",
|
|
1827
|
+
"&stations=BGR",
|
|
1828
|
+
"&stations=CAR",
|
|
1829
|
+
"&stations=PQI",
|
|
1830
|
+
"&stations=RKD",
|
|
1831
|
+
],
|
|
1832
|
+
"MI_ASOS": [
|
|
1833
|
+
"&stations=DTW",
|
|
1834
|
+
"&stations=GRR",
|
|
1835
|
+
"&stations=FNT",
|
|
1836
|
+
"&stations=LAN",
|
|
1837
|
+
"&stations=MKG",
|
|
1838
|
+
],
|
|
1839
|
+
"MN_ASOS": [
|
|
1840
|
+
"&stations=MSP",
|
|
1841
|
+
"&stations=DLH",
|
|
1842
|
+
"&stations=RST",
|
|
1843
|
+
"&stations=STC",
|
|
1844
|
+
"&stations=INL",
|
|
1845
|
+
],
|
|
1846
|
+
"MO_ASOS": [
|
|
1847
|
+
"&stations=STL",
|
|
1848
|
+
"&stations=MCI",
|
|
1849
|
+
"&stations=SGF",
|
|
1850
|
+
"&stations=COU",
|
|
1851
|
+
"&stations=JLN",
|
|
1852
|
+
],
|
|
1853
|
+
"MS_ASOS": [
|
|
1854
|
+
"&stations=JAN",
|
|
1855
|
+
"&stations=GPT",
|
|
1856
|
+
"&stations=MEI",
|
|
1857
|
+
"&stations=PIB",
|
|
1858
|
+
"&stations=GLH",
|
|
1859
|
+
],
|
|
1860
|
+
"MT_ASOS": [
|
|
1861
|
+
"&stations=BIL",
|
|
1862
|
+
"&stations=MSO",
|
|
1863
|
+
"&stations=GTF",
|
|
1864
|
+
"&stations=HLN",
|
|
1865
|
+
"&stations=BZN",
|
|
1866
|
+
],
|
|
1867
|
+
"NC_ASOS": [
|
|
1868
|
+
"&stations=CLT",
|
|
1869
|
+
"&stations=RDU",
|
|
1870
|
+
"&stations=GSO",
|
|
1871
|
+
"&stations=ILM",
|
|
1872
|
+
"&stations=AVL",
|
|
1873
|
+
],
|
|
1874
|
+
"ND_ASOS": [
|
|
1875
|
+
"&stations=BIS",
|
|
1876
|
+
"&stations=FAR",
|
|
1877
|
+
"&stations=GFK",
|
|
1878
|
+
"&stations=ISN",
|
|
1879
|
+
"&stations=JMS",
|
|
1880
|
+
],
|
|
1881
|
+
"NE_ASOS": ["&stations=OMA"],
|
|
1882
|
+
"NH_ASOS": [
|
|
1883
|
+
"&stations=MHT",
|
|
1884
|
+
"&stations=PSM",
|
|
1885
|
+
"&stations=CON",
|
|
1886
|
+
"&stations=LEB",
|
|
1887
|
+
"&stations=ASH",
|
|
1888
|
+
],
|
|
1889
|
+
"NJ_ASOS": [
|
|
1890
|
+
"&stations=EWR",
|
|
1891
|
+
"&stations=ACY",
|
|
1892
|
+
"&stations=TTN",
|
|
1893
|
+
"&stations=MMU",
|
|
1894
|
+
"&stations=TEB",
|
|
1895
|
+
],
|
|
1896
|
+
"NM_ASOS": [
|
|
1897
|
+
"&stations=ABQ",
|
|
1898
|
+
"&stations=SAF",
|
|
1899
|
+
"&stations=ROW",
|
|
1900
|
+
"&stations=HOB",
|
|
1901
|
+
"&stations=FMN",
|
|
1902
|
+
],
|
|
1903
|
+
"NV_ASOS": ["&stations=LAS"],
|
|
1904
|
+
"NY_ASOS": [
|
|
1905
|
+
"&stations=JFK",
|
|
1906
|
+
"&stations=LGA",
|
|
1907
|
+
"&stations=BUF",
|
|
1908
|
+
"&stations=ALB",
|
|
1909
|
+
"&stations=SYR",
|
|
1910
|
+
],
|
|
1911
|
+
"OH_ASOS": ["&stations=CMH"],
|
|
1912
|
+
"OK_ASOS": [
|
|
1913
|
+
"&stations=OKC",
|
|
1914
|
+
"&stations=TUL",
|
|
1915
|
+
"&stations=LAW",
|
|
1916
|
+
"&stations=SWO",
|
|
1917
|
+
"&stations=PNC",
|
|
1918
|
+
],
|
|
1919
|
+
"OR_ASOS": ["&stations=PDX"],
|
|
1920
|
+
"PA_ASOS": [
|
|
1921
|
+
"&stations=PHL",
|
|
1922
|
+
"&stations=PIT",
|
|
1923
|
+
"&stations=ERI",
|
|
1924
|
+
"&stations=MDT",
|
|
1925
|
+
"&stations=AVP",
|
|
1926
|
+
],
|
|
1927
|
+
"RI_ASOS": ["&stations=PVD", "&stations=WST", "&stations=UUU"],
|
|
1928
|
+
"SC_ASOS": [
|
|
1929
|
+
"&stations=CHS",
|
|
1930
|
+
"&stations=CAE",
|
|
1931
|
+
"&stations=GSP",
|
|
1932
|
+
"&stations=MYR",
|
|
1933
|
+
"&stations=FLO",
|
|
1934
|
+
],
|
|
1935
|
+
"SD_ASOS": [
|
|
1936
|
+
"&stations=FSD",
|
|
1937
|
+
"&stations=RAP",
|
|
1938
|
+
"&stations=PIR",
|
|
1939
|
+
"&stations=ABR",
|
|
1940
|
+
"&stations=YKN",
|
|
1941
|
+
],
|
|
1942
|
+
"TN_ASOS": [
|
|
1943
|
+
"&stations=BNA",
|
|
1944
|
+
"&stations=MEM",
|
|
1945
|
+
"&stations=TYS",
|
|
1946
|
+
"&stations=CHA",
|
|
1947
|
+
"&stations=TRI",
|
|
1948
|
+
],
|
|
1949
|
+
"TX_ASOS": [
|
|
1950
|
+
"&stations=DFW",
|
|
1951
|
+
"&stations=IAH",
|
|
1952
|
+
"&stations=AUS",
|
|
1953
|
+
"&stations=SAT",
|
|
1954
|
+
"&stations=ELP",
|
|
1955
|
+
],
|
|
1956
|
+
"UT_ASOS": [
|
|
1957
|
+
"&stations=SLC",
|
|
1958
|
+
"&stations=OGD",
|
|
1959
|
+
"&stations=PVU",
|
|
1960
|
+
"&stations=SGU",
|
|
1961
|
+
"&stations=CNY",
|
|
1962
|
+
],
|
|
1963
|
+
"VA_ASOS": [
|
|
1964
|
+
"&stations=DCA",
|
|
1965
|
+
"&stations=RIC",
|
|
1966
|
+
"&stations=ROA",
|
|
1967
|
+
"&stations=ORF",
|
|
1968
|
+
"&stations=SHD",
|
|
1969
|
+
],
|
|
1970
|
+
"VT_ASOS": [
|
|
1971
|
+
"&stations=BTV",
|
|
1972
|
+
"&stations=MPV",
|
|
1973
|
+
"&stations=RUT",
|
|
1974
|
+
"&stations=VSF",
|
|
1975
|
+
"&stations=MVL",
|
|
1976
|
+
],
|
|
1977
|
+
"WA_ASOS": [
|
|
1978
|
+
"&stations=SEA",
|
|
1979
|
+
"&stations=GEG",
|
|
1980
|
+
"&stations=TIW",
|
|
1981
|
+
"&stations=VUO",
|
|
1982
|
+
"&stations=BFI",
|
|
1983
|
+
],
|
|
1984
|
+
"WI_ASOS": [
|
|
1985
|
+
"&stations=MKE",
|
|
1986
|
+
"&stations=MSN",
|
|
1987
|
+
"&stations=GRB",
|
|
1988
|
+
"&stations=EAU",
|
|
1989
|
+
"&stations=LSE",
|
|
1990
|
+
],
|
|
1991
|
+
"WV_ASOS": [
|
|
1992
|
+
"&stations=CRW",
|
|
1993
|
+
"&stations=CKB",
|
|
1994
|
+
"&stations=HTS",
|
|
1995
|
+
"&stations=MGW",
|
|
1996
|
+
"&stations=BKW",
|
|
1997
|
+
],
|
|
1998
|
+
"WY_ASOS": [
|
|
1999
|
+
"&stations=CPR",
|
|
2000
|
+
"&stations=JAC",
|
|
2001
|
+
"&stations=SHR",
|
|
2002
|
+
"&stations=COD",
|
|
2003
|
+
"&stations=RKS",
|
|
2004
|
+
],
|
|
2005
|
+
}
|
|
2006
|
+
# --- Date setup --- #
|
|
2007
|
+
date_object = datetime.strptime(start_date, "%Y-%m-%d")
|
|
2008
|
+
start_day = date_object.day
|
|
2009
|
+
start_month = date_object.month
|
|
2010
|
+
start_year = date_object.year
|
|
2011
|
+
formatted_date = f"{start_year:04d}-01-01"  # Open-Meteo start is pinned to 1 Jan of the start year, e.g. "2000-01-01"
|
|
2012
|
+
today = datetime.now()
|
|
2013
|
+
end_day, end_month, end_year = today.day, today.month, today.year
|
|
2014
|
+
|
|
2015
|
+
# ------------------------------------------------------------------ #
|
|
2016
|
+
# Utility functions
|
|
2017
|
+
# ------------------------------------------------------------------ #
|
|
2018
|
+
def convert_f_to_c(series_f: pd.Series) -> pd.Series:
|
|
2019
|
+
"""Convert Fahrenheit to Celsius."""
|
|
2020
|
+
return (series_f - 32) * 5.0 / 9.0
|
|
2021
|
+
|
|
2022
|
+
def fetch_mesonet_data(network: str, stations: list) -> pd.DataFrame:
|
|
2023
|
+
"""Fetch station-based data (daily) from Iowa Mesonet."""
|
|
2024
|
+
import csv
|
|
2025
|
+
|
|
2026
|
+
station_query = "".join(stations)
|
|
2027
|
+
url = (
|
|
2028
|
+
"https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
|
|
2029
|
+
f"network={network}{station_query}"
|
|
2030
|
+
f"&year1={start_year}&month1={start_month}&day1={start_day}"
|
|
2031
|
+
f"&year2={end_year}&month2={end_month}&day2={end_day}"
|
|
2032
|
+
)
|
|
2033
|
+
with urllib.request.urlopen(url) as f:
|
|
2034
|
+
df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
|
|
2035
|
+
return df
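# For the GB network, for instance, the assembled request looks roughly like:
#   https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=GB__ASOS
#     &stations=EGCC&stations=EGNM...&year1=2020&month1=1&day1=1
#     &year2=<today's year>&month2=<today's month>&day2=<today's day>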
|
|
2036
|
+
|
|
2037
|
+
def fetch_canada_data() -> pd.DataFrame:
|
|
2038
|
+
"""Canada uses multiple sub-networks. Combine them all."""
|
|
2039
|
+
import csv
|
|
2040
|
+
|
|
2041
|
+
final_df = pd.DataFrame()
|
|
2042
|
+
for i, institute_temp in enumerate(institute_vector):
|
|
2043
|
+
station_query_temp = "".join(stations_list_canada[i])
|
|
2044
|
+
mesonet_url = (
|
|
2045
|
+
"https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
|
|
2046
|
+
f"network={institute_temp}{station_query_temp}"
|
|
2047
|
+
f"&year1={start_year}&month1={start_month}&day1={start_day}"
|
|
2048
|
+
f"&year2={end_year}&month2={end_month}&day2={end_day}"
|
|
2049
|
+
)
|
|
2050
|
+
with urllib.request.urlopen(mesonet_url) as f:
|
|
2051
|
+
temp_df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
|
|
2052
|
+
|
|
2053
|
+
if not temp_df.empty:
|
|
2054
|
+
final_df = pd.concat([final_df, temp_df], ignore_index=True)
|
|
2055
|
+
return final_df
|
|
2056
|
+
|
|
2057
|
+
def fetch_openmeteo_rain_snow(cities: list) -> pd.DataFrame:
|
|
2058
|
+
"""
|
|
2059
|
+
Fetch daily precipitation_sum (rain) and snowfall_sum (snow) from Open-Meteo.
|
|
2060
|
+
Returns columns: ["date", "rain_sum", "snow_sum", "city"] for each day.
|
|
2061
|
+
We'll then do a weekly aggregator that yields avg_rain_sum, avg_snow_sum.
|
|
2062
|
+
"""
|
|
2063
|
+
weather_data_list = []
|
|
2064
|
+
geolocator = Nominatim(user_agent="MyApp")
|
|
2065
|
+
|
|
2066
|
+
for city in cities:
|
|
2067
|
+
loc = geolocator.geocode(city)
|
|
2068
|
+
if not loc:
|
|
2069
|
+
print(f"Could not find location for {city}, skipping.")
|
|
2070
|
+
continue
|
|
2071
|
+
|
|
2072
|
+
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
2073
|
+
params = {
|
|
2074
|
+
"latitude": loc.latitude,
|
|
2075
|
+
"longitude": loc.longitude,
|
|
2076
|
+
"start_date": formatted_date,
|
|
2077
|
+
"end_date": today.strftime("%Y-%m-%d"),
|
|
2078
|
+
"daily": "precipitation_sum,snowfall_sum",
|
|
2079
|
+
"timezone": "auto",
|
|
2080
|
+
}
|
|
2081
|
+
resp = requests.get(url, params=params)
|
|
2082
|
+
if resp.status_code != 200:
|
|
2083
|
+
print(
|
|
2084
|
+
f"[ERROR] open-meteo returned status {resp.status_code} for city={city}",
|
|
2085
|
+
)
|
|
2086
|
+
continue
|
|
2087
|
+
try:
|
|
2088
|
+
data_json = resp.json()
|
|
2089
|
+
except ValueError:
|
|
2090
|
+
print(f"[ERROR] invalid JSON from open-meteo for city={city}")
|
|
2091
|
+
continue
|
|
2092
|
+
|
|
2093
|
+
daily_block = data_json.get("daily", {})
|
|
2094
|
+
if not {"time", "precipitation_sum", "snowfall_sum"}.issubset(
|
|
2095
|
+
daily_block.keys(),
|
|
2096
|
+
):
|
|
2097
|
+
print(
|
|
2098
|
+
f"[ERROR] missing required keys in open-meteo for city={city}",
|
|
2099
|
+
)
|
|
2100
|
+
continue
|
|
2101
|
+
|
|
2102
|
+
df_temp = pd.DataFrame(
|
|
2103
|
+
{
|
|
2104
|
+
"date": daily_block["time"],
|
|
2105
|
+
"rain_sum": daily_block["precipitation_sum"],
|
|
2106
|
+
"snow_sum": daily_block["snowfall_sum"],
|
|
2107
|
+
},
|
|
2108
|
+
)
|
|
2109
|
+
df_temp["city"] = city
|
|
2110
|
+
weather_data_list.append(df_temp)
|
|
2111
|
+
|
|
2112
|
+
if weather_data_list:
|
|
2113
|
+
return pd.concat(weather_data_list, ignore_index=True)
|
|
2114
|
+
return pd.DataFrame()
|
|
2115
|
+
|
|
2116
|
+
def weekly_aggregate_temp_mesonet(df: pd.DataFrame) -> pd.DataFrame:
|
|
2117
|
+
"""
|
|
2118
|
+
For NON-US mesonet data, we only keep max_temp_f, min_temp_f,
|
|
2119
|
+
then compute mean_temp_f, plus Celsius, and do weekly average.
|
|
2120
|
+
"""
|
|
2121
|
+
import pandas as pd
|
|
2122
|
+
|
|
2123
|
+
# Convert day col
|
|
2124
|
+
if "day" not in df.columns:
|
|
2125
|
+
return pd.DataFrame()
|
|
2126
|
+
|
|
2127
|
+
# Only keep relevant columns
|
|
2128
|
+
keep_cols = []
|
|
2129
|
+
for c in ["day", "max_temp_f", "min_temp_f"]:
|
|
2130
|
+
if c in df.columns:
|
|
2131
|
+
keep_cols.append(c)
|
|
2132
|
+
df = df[keep_cols].copy()
|
|
2133
|
+
|
|
2134
|
+
# Convert "None" => numeric
|
|
2135
|
+
for c in ["max_temp_f", "min_temp_f"]:
|
|
2136
|
+
if c in df.columns:
|
|
2137
|
+
df[c] = df[c].replace("None", pd.NA)
|
|
2138
|
+
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
2139
|
+
|
|
2140
|
+
df["day"] = pd.to_datetime(df["day"], errors="coerce")
|
|
2141
|
+
df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
|
|
2142
|
+
df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
|
|
2143
|
+
df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
|
|
2144
|
+
df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
|
|
2145
|
+
|
|
2146
|
+
# Group by "week_starting"
|
|
2147
|
+
df["week_starting"] = df["day"].apply(
|
|
2148
|
+
lambda x: x
|
|
2149
|
+
- pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
2150
|
+
if pd.notnull(x)
|
|
2151
|
+
else pd.NaT,
|
|
2152
|
+
)
|
|
2153
|
+
numeric_cols = df.select_dtypes(include="number").columns
|
|
2154
|
+
weekly = df.groupby("week_starting")[numeric_cols].mean()
|
|
2155
|
+
|
|
2156
|
+
# Rename columns
|
|
2157
|
+
rename_map = {
|
|
2158
|
+
"max_temp_f": "avg_max_temp_f",
|
|
2159
|
+
"min_temp_f": "avg_min_temp_f",
|
|
2160
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
2161
|
+
"max_temp_c": "avg_max_temp_c",
|
|
2162
|
+
"min_temp_c": "avg_min_temp_c",
|
|
2163
|
+
"mean_temp_c": "avg_mean_temp_c",
|
|
2164
|
+
}
|
|
2165
|
+
weekly.rename(columns=rename_map, inplace=True)
|
|
2166
|
+
|
|
2167
|
+
# Return as a DataFrame w/ index = week_starting
|
|
2168
|
+
return weekly
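# Worked example of the week_starting snap above: with week_commencing="mon"
# (day_dict value 0), a reading on Wednesday 2024-01-03 has weekday() == 2, so it
# is shifted back (2 - 0) % 7 = 2 days and lands on Monday 2024-01-01.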
|
|
2169
|
+
|
|
2170
|
+
def weekly_aggregate_rain_snow_openmeteo(df: pd.DataFrame) -> pd.DataFrame:
|
|
2171
|
+
"""
|
|
2172
|
+
For NON-US, from open-meteo, we have daily columns 'date','rain_sum','snow_sum'.
|
|
2173
|
+
We'll do weekly average of each. -> 'avg_rain_sum', 'avg_snow_sum'.
|
|
2174
|
+
"""
|
|
2175
|
+
import pandas as pd
|
|
2176
|
+
|
|
2177
|
+
if "date" not in df.columns:
|
|
2178
|
+
return pd.DataFrame()
|
|
2179
|
+
|
|
2180
|
+
df["date"] = pd.to_datetime(df["date"], errors="coerce")
|
|
2181
|
+
df["week_starting"] = df["date"].apply(
|
|
2182
|
+
lambda x: x
|
|
2183
|
+
- pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
2184
|
+
if pd.notnull(x)
|
|
2185
|
+
else pd.NaT,
|
|
2186
|
+
)
|
|
2187
|
+
|
|
2188
|
+
# Convert to numeric
|
|
2189
|
+
for c in ["rain_sum", "snow_sum"]:
|
|
2190
|
+
if c in df.columns:
|
|
2191
|
+
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
2192
|
+
|
|
2193
|
+
numeric_cols = df.select_dtypes(include="number").columns
|
|
2194
|
+
weekly = df.groupby("week_starting")[numeric_cols].mean()
|
|
2195
|
+
|
|
2196
|
+
rename_map = {"rain_sum": "avg_rain_sum", "snow_sum": "avg_snow_sum"}
|
|
2197
|
+
weekly.rename(columns=rename_map, inplace=True)
|
|
2198
|
+
return weekly
|
|
2199
|
+
|
|
2200
|
+
def weekly_aggregate_us(df: pd.DataFrame) -> pd.DataFrame:
|
|
2201
|
+
"""
|
|
2202
|
+
For US Mesonet data (per state), we keep max_temp_f, min_temp_f, precip_in, snow_in,
|
|
2203
|
+
then compute mean_temp_f & convert to celsius, group weekly.
|
|
2204
|
+
We'll rename:
|
|
2205
|
+
max_temp_f -> avg_max_temp_f
|
|
2206
|
+
min_temp_f -> avg_min_temp_f
|
|
2207
|
+
mean_temp_f -> avg_mean_temp_f
|
|
2208
|
+
precip_in -> avg_rain_sum
|
|
2209
|
+
snow_in -> avg_snow_sum
|
|
2210
|
+
"""
|
|
2211
|
+
import pandas as pd
|
|
2212
|
+
|
|
2213
|
+
if "day" not in df.columns:
|
|
2214
|
+
return pd.DataFrame()
|
|
2215
|
+
|
|
2216
|
+
# Convert day
|
|
2217
|
+
df["day"] = pd.to_datetime(df["day"], errors="coerce")
|
|
2218
|
+
|
|
2219
|
+
# Convert "None" => numeric
|
|
2220
|
+
for c in ["max_temp_f", "min_temp_f", "precip_in", "snow_in"]:
|
|
2221
|
+
if c in df.columns:
|
|
2222
|
+
df[c] = df[c].replace("None", pd.NA)
|
|
2223
|
+
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
2224
|
+
|
|
2225
|
+
# Compute mean_temp_f, celsius
|
|
2226
|
+
df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
|
|
2227
|
+
df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
|
|
2228
|
+
df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
|
|
2229
|
+
df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
|
|
2230
|
+
|
|
2231
|
+
# Weekly grouping
|
|
2232
|
+
df["week_starting"] = df["day"].apply(
|
|
2233
|
+
lambda x: x
|
|
2234
|
+
- pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
2235
|
+
if pd.notnull(x)
|
|
2236
|
+
else pd.NaT,
|
|
2237
|
+
)
|
|
2238
|
+
numeric_cols = df.select_dtypes(include="number").columns
|
|
2239
|
+
weekly = df.groupby("week_starting")[numeric_cols].mean()
|
|
2240
|
+
|
|
2241
|
+
rename_map = {
|
|
2242
|
+
"max_temp_f": "avg_max_temp_f",
|
|
2243
|
+
"min_temp_f": "avg_min_temp_f",
|
|
2244
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
2245
|
+
"max_temp_c": "avg_max_temp_c",
|
|
2246
|
+
"min_temp_c": "avg_min_temp_c",
|
|
2247
|
+
"mean_temp_c": "avg_mean_temp_c",
|
|
2248
|
+
"precip_in": "avg_rain_sum",
|
|
2249
|
+
"snow_in": "avg_snow_sum",
|
|
2250
|
+
}
|
|
2251
|
+
weekly.rename(columns=rename_map, inplace=True)
|
|
2252
|
+
return weekly
|
|
2253
|
+
|
|
2254
|
+
def rename_with_prefix(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
|
|
2255
|
+
"""Rename all columns except 'week_starting' or 'OBS' with the given prefix."""
|
|
2256
|
+
df2 = df.copy()
|
|
2257
|
+
new_cols = {}
|
|
2258
|
+
for col in df2.columns:
|
|
2259
|
+
if col not in ["week_starting", "OBS"]:
|
|
2260
|
+
new_cols[col] = prefix + col
|
|
2261
|
+
df2.rename(columns=new_cols, inplace=True)
|
|
2262
|
+
return df2
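# e.g. rename_with_prefix(weekly, "seas_gb_") turns 'avg_max_temp_c' into
# 'seas_gb_avg_max_temp_c' while leaving the 'week_starting' / 'OBS' columns as-is.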
|
|
2263
|
+
|
|
2264
|
+
# ------------------------------------------------------------------ #
|
|
2265
|
+
# The final combined DataFrame
|
|
2266
|
+
# ------------------------------------------------------------------ #
|
|
2267
|
+
combined_df = pd.DataFrame()
|
|
2268
|
+
|
|
2269
|
+
# ------------------------------------------------------------------ #
|
|
2270
|
+
# 1) Loop over each requested country
|
|
2271
|
+
# ------------------------------------------------------------------ #
|
|
2272
|
+
for country_code in country_codes:
|
|
2273
|
+
net = country_dict.get(country_code)
|
|
2274
|
+
if net is None:
|
|
2275
|
+
print(f"Warning: Invalid country_code '{country_code}' – skipping.")
|
|
2276
|
+
continue
|
|
2277
|
+
|
|
2278
|
+
# =========================
|
|
2279
|
+
# 2) Special Logic for US
|
|
2280
|
+
# =========================
|
|
2281
|
+
if net == "US_STATES":
|
|
2282
|
+
for state_code, network_code in us_state_networks.items():
|
|
2283
|
+
stations = us_stations_map.get(network_code, [])
|
|
2284
|
+
if not stations:
|
|
2285
|
+
print(f"[DEBUG] No stations for {network_code}, skipping.")
|
|
2286
|
+
continue
|
|
2287
|
+
|
|
2288
|
+
raw_df = fetch_mesonet_data(network_code, stations)
|
|
2289
|
+
if raw_df.empty:
|
|
2290
|
+
print(f"[DEBUG] DataFrame empty for {network_code}, skipping.")
|
|
2291
|
+
continue
|
|
2292
|
+
|
|
2293
|
+
weekly_state = weekly_aggregate_us(raw_df)
|
|
2294
|
+
if weekly_state.empty:
|
|
2295
|
+
print(
|
|
2296
|
+
f"[DEBUG] Aggregated weekly DataFrame empty for {network_code}, skipping.",
|
|
2297
|
+
)
|
|
2298
|
+
continue
|
|
2299
|
+
|
|
2300
|
+
weekly_state.reset_index(inplace=True)
|
|
2301
|
+
weekly_state.rename(columns={"week_starting": "OBS"}, inplace=True)
|
|
2302
|
+
|
|
2303
|
+
# Now rename columns with prefix: seas_us_{statecode}_
|
|
2304
|
+
prefix = f"seas_us_{state_code.lower()}_"
|
|
2305
|
+
weekly_state = rename_with_prefix(weekly_state, prefix)
|
|
2306
|
+
|
|
2307
|
+
# Merge into combined
|
|
2308
|
+
if combined_df.empty:
|
|
2309
|
+
combined_df = weekly_state
|
|
2310
|
+
else:
|
|
2311
|
+
combined_df = pd.merge(
|
|
2312
|
+
combined_df,
|
|
2313
|
+
weekly_state,
|
|
2314
|
+
on="OBS",
|
|
2315
|
+
how="outer",
|
|
2316
|
+
)
|
|
2317
|
+
|
|
2318
|
+
# Done with the US. Move on to the next country in the loop
|
|
2319
|
+
continue
|
|
2320
|
+
|
|
2321
|
+
# =======================================
|
|
2322
|
+
# 3) Logic for Non-US (AU, GB, DE, CA, ZA)
|
|
2323
|
+
# =======================================
|
|
2324
|
+
# A) Fetch temperature data from Mesonet
|
|
2325
|
+
if net == "Canada":
|
|
2326
|
+
raw_temp = fetch_canada_data()
|
|
2327
|
+
else:
|
|
2328
|
+
# e.g. "GB__ASOS", "AU__ASOS", "DE__ASOS", "ZA__ASOS" (if added)
|
|
2329
|
+
stations = station_map.get(net, [])
|
|
2330
|
+
if not stations and net != "ZA__ASOS":
|
|
2331
|
+
# If we have no stations for net and it's not ZA,
|
|
2332
|
+
# there's no data. (If ZA has stations, add them above.)
|
|
2333
|
+
raw_temp = pd.DataFrame()
|
|
2334
|
+
else:
|
|
2335
|
+
raw_temp = fetch_mesonet_data(net, stations)
|
|
2336
|
+
|
|
2337
|
+
weekly_temp = pd.DataFrame()
|
|
2338
|
+
if not raw_temp.empty:
|
|
2339
|
+
# For these countries, we only keep max_temp_f, min_temp_f, mean_temp_f
|
|
2340
|
+
weekly_temp = weekly_aggregate_temp_mesonet(raw_temp)
|
|
2341
|
+
|
|
2342
|
+
# B) Fetch rain+snow from Open-Meteo (only if we have an entry in rainfall_city_map)
|
|
2343
|
+
weekly_precip = pd.DataFrame()
|
|
2344
|
+
if net in rainfall_city_map:
|
|
2345
|
+
city_list = rainfall_city_map[net]
|
|
2346
|
+
df_rain_snow = fetch_openmeteo_rain_snow(city_list)
|
|
2347
|
+
if not df_rain_snow.empty:
|
|
2348
|
+
weekly_precip = weekly_aggregate_rain_snow_openmeteo(df_rain_snow)
|
|
2349
|
+
|
|
2350
|
+
# C) Merge the temperature data + precip/snow data on the weekly index
|
|
2351
|
+
if not weekly_temp.empty and not weekly_precip.empty:
|
|
2352
|
+
merged_df = pd.merge(
|
|
2353
|
+
weekly_temp,
|
|
2354
|
+
weekly_precip,
|
|
2355
|
+
left_index=True,
|
|
2356
|
+
right_index=True,
|
|
2357
|
+
how="outer",
|
|
2358
|
+
)
|
|
2359
|
+
elif not weekly_temp.empty:
|
|
2360
|
+
merged_df = weekly_temp
|
|
2361
|
+
else:
|
|
2362
|
+
merged_df = weekly_precip
|
|
2363
|
+
|
|
2364
|
+
if merged_df.empty:
|
|
2365
|
+
print(f"No data retrieved for country: {country_code}")
|
|
2366
|
+
continue
|
|
2367
|
+
|
|
2368
|
+
# D) Convert index -> a column OBS
|
|
2369
|
+
merged_df.reset_index(inplace=True)
|
|
2370
|
+
merged_df.rename(columns={"week_starting": "OBS"}, inplace=True)
|
|
2371
|
+
|
|
2372
|
+
# E) Rename with prefix = "seas_{country_code}_"
|
|
2373
|
+
prefix = f"seas_{country_code.lower()}_"
|
|
2374
|
+
merged_df = rename_with_prefix(merged_df, prefix)
|
|
2375
|
+
|
|
2376
|
+
# F) Merge into combined_df
|
|
2377
|
+
if combined_df.empty:
|
|
2378
|
+
combined_df = merged_df
|
|
2379
|
+
else:
|
|
2380
|
+
combined_df = pd.merge(combined_df, merged_df, on="OBS", how="outer")
|
|
2381
|
+
|
|
2382
|
+
# ------------------------------------------------------------------ #
|
|
2383
|
+
# 4) Sort final by OBS (optional)
|
|
2384
|
+
# ------------------------------------------------------------------ #
|
|
2385
|
+
if not combined_df.empty:
|
|
2386
|
+
combined_df.sort_values(by="OBS", inplace=True)
|
|
2387
|
+
|
|
2388
|
+
return combined_df
|
|
2389
|
+
|
|
2390
|
+
def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
|
|
2391
|
+
"""
|
|
2392
|
+
Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
|
|
2393
|
+
aggregates it to weekly averages, and renames variables based on specified rules.
|
|
2394
|
+
|
|
2395
|
+
Parameters
|
|
2396
|
+
----------
|
|
2397
|
+
cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
|
|
2398
|
+
week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
|
|
2399
|
+
sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
|
|
2400
|
+
(e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
|
|
2401
|
+
|
|
2402
|
+
Returns
|
|
2403
|
+
-------
|
|
2404
|
+
pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
|
|
2405
|
+
and all series as renamed columns (e.g., 'macro_retail_sales_uk').
|
|
2406
|
+
Returns an empty DataFrame if no data is fetched or processed.
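
Example (illustrative sketch; assumes the class is instantiated directly):
    dp = datapull()
    ons_df = dp.pull_macro_ons_uk(cdid_list=["JP9Z"], week_start_day="mon", sector="fast_food")
    # -> weekly 'OBS' rows with cleaned column names such as 'macro_<series>_uk_<CDID>'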
|
|
2407
|
+
|
|
2408
|
+
"""
|
|
2409
|
+
# Define CDIDs for sectors and defaults
|
|
2410
|
+
sector_cdids_map = {
|
|
2411
|
+
"fast_food": ["L7TD", "L78Q", "DOAD"],
|
|
2412
|
+
"clothing_footwear": ["D7BW", "D7GO", "CHBJ"],
|
|
2413
|
+
"fuel": ["A9FS", "L7FP", "CHOL"],
|
|
2414
|
+
"cars": ["D7E8", "D7E9", "D7CO"],
|
|
2415
|
+
"default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
|
|
2416
|
+
}
|
|
2417
|
+
|
|
2418
|
+
default_cdids = sector_cdids_map["default"]
|
|
2419
|
+
sector_specific_cdids = [] # Initialize empty list for sector CDIDs
|
|
2420
|
+
|
|
2421
|
+
if sector: # Check if sector is not None or empty
|
|
2422
|
+
if isinstance(sector, str):
|
|
2423
|
+
# If it's a single string, wrap it in a list
|
|
2424
|
+
sector_list = [sector]
|
|
2425
|
+
elif isinstance(sector, list):
|
|
2426
|
+
# If it's already a list, use it directly
|
|
2427
|
+
sector_list = sector
|
|
2428
|
+
else:
|
|
2429
|
+
raise TypeError(
|
|
2430
|
+
"`sector` parameter must be a string or a list of strings.",
|
|
2431
|
+
)
|
|
2432
|
+
|
|
2433
|
+
# Iterate through the list of sectors and collect their CDIDs
|
|
2434
|
+
for sec in sector_list:
|
|
2435
|
+
sector_specific_cdids.extend(
|
|
2436
|
+
sector_cdids_map.get(sec, []),
|
|
2437
|
+
) # Use extend to add items from the list
|
|
2438
|
+
|
|
2439
|
+
# Combine standard CDIDs and any additional user-provided CDIDs
|
|
2440
|
+
standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
|
|
2441
|
+
if cdid_list is None:
|
|
2442
|
+
cdid_list = []
|
|
2443
|
+
final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
|
|
2444
|
+
|
|
2445
|
+
base_search_url = (
|
|
2446
|
+
"https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
|
|
2447
|
+
)
|
|
2448
|
+
base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
|
|
2449
|
+
combined_df = pd.DataFrame()
|
|
2450
|
+
|
|
2451
|
+
# Map week start day to pandas weekday convention
|
|
2452
|
+
days_map = {
|
|
2453
|
+
"mon": 0,
|
|
2454
|
+
"tue": 1,
|
|
2455
|
+
"wed": 2,
|
|
2456
|
+
"thu": 3,
|
|
2457
|
+
"fri": 4,
|
|
2458
|
+
"sat": 5,
|
|
2459
|
+
"sun": 6,
|
|
2460
|
+
}
|
|
2461
|
+
if week_start_day.lower() not in days_map:
|
|
2462
|
+
raise ValueError(
|
|
2463
|
+
"Invalid week start day. Choose from: " + ", ".join(days_map.keys()),
|
|
2464
|
+
)
|
|
2465
|
+
week_start = days_map[
|
|
2466
|
+
week_start_day.lower()
|
|
2467
|
+
] # Use lower() for case-insensitivity
|
|
2468
|
+
|
|
2469
|
+
for cdid in final_cdid_list: # Use the final combined list
|
|
2470
|
+
try:
|
|
2471
|
+
# Search for the series
|
|
2472
|
+
search_url = f"{base_search_url}{cdid}"
|
|
2473
|
+
search_response = requests.get(search_url, timeout=30) # Add timeout
|
|
2474
|
+
search_response.raise_for_status()
|
|
2475
|
+
search_data = search_response.json()
|
|
2476
|
+
|
|
2477
|
+
items = search_data.get("items", [])
|
|
2478
|
+
if not items:
|
|
2479
|
+
print(f"Warning: No data found for CDID: {cdid}")
|
|
2480
|
+
continue
|
|
2481
|
+
|
|
2482
|
+
# Extract series name and latest release URI
|
|
2483
|
+
# Find the item with the most recent release_date
|
|
2484
|
+
latest_item = None
|
|
2485
|
+
latest_date = None
|
|
2486
|
+
for item in items:
|
|
2487
|
+
if "release_date" in item:
|
|
2488
|
+
try:
|
|
2489
|
+
# Ensure timezone awareness for comparison
|
|
2490
|
+
current_date = datetime.fromisoformat(
|
|
2491
|
+
item["release_date"].replace("Z", "+00:00"),
|
|
2492
|
+
)
|
|
2493
|
+
if latest_date is None or current_date > latest_date:
|
|
2494
|
+
latest_date = current_date
|
|
2495
|
+
latest_item = item
|
|
2496
|
+
except ValueError:
|
|
2497
|
+
print(
|
|
2498
|
+
f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}",
|
|
2499
|
+
)
|
|
2500
|
+
continue # Skip this item if date is invalid
|
|
2501
|
+
|
|
2502
|
+
if latest_item is None:
|
|
2503
|
+
print(f"Warning: No valid release date found for CDID: {cdid}")
|
|
2504
|
+
continue
|
|
2505
|
+
|
|
2506
|
+
series_name = latest_item.get(
|
|
2507
|
+
"title",
|
|
2508
|
+
f"Series_{cdid}",
|
|
2509
|
+
) # Use title from the latest item
|
|
2510
|
+
latest_uri = latest_item.get("uri")
|
|
2511
|
+
if not latest_uri:
|
|
2512
|
+
print(
|
|
2513
|
+
f"Warning: No URI found for the latest release of CDID: {cdid}",
|
|
2514
|
+
)
|
|
2515
|
+
continue
|
|
2516
|
+
|
|
2517
|
+
# Fetch the dataset
|
|
2518
|
+
data_url = f"{base_data_url}{latest_uri}"
|
|
2519
|
+
data_response = requests.get(data_url, timeout=30) # Add timeout
|
|
2520
|
+
data_response.raise_for_status()
|
|
2521
|
+
data_json = data_response.json()
|
|
2522
|
+
|
|
2523
|
+
# Detect the frequency and process accordingly
|
|
2524
|
+
frequency_key = None
|
|
2525
|
+
if data_json.get("months"):
|
|
2526
|
+
frequency_key = "months"
|
|
2527
|
+
elif data_json.get("quarters"):
|
|
2528
|
+
frequency_key = "quarters"
|
|
2529
|
+
elif data_json.get("years"):
|
|
2530
|
+
frequency_key = "years"
|
|
2531
|
+
else:
|
|
2532
|
+
print(
|
|
2533
|
+
f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}",
|
|
2534
|
+
)
|
|
2535
|
+
continue
|
|
2536
|
+
|
|
2537
|
+
# Prepare the DataFrame
|
|
2538
|
+
if not data_json[frequency_key]: # Check if the list of values is empty
|
|
2539
|
+
print(
|
|
2540
|
+
f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}",
|
|
2541
|
+
)
|
|
2542
|
+
continue
|
|
2543
|
+
|
|
2544
|
+
df = pd.DataFrame(data_json[frequency_key])
|
|
2545
|
+
|
|
2546
|
+
# Check if essential columns exist
|
|
2547
|
+
if "date" not in df.columns or "value" not in df.columns:
|
|
2548
|
+
print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
|
|
2549
|
+
continue
|
|
2550
|
+
|
|
2551
|
+
# Parse the 'date' field based on frequency
|
|
2552
|
+
try:
|
|
2553
|
+
if frequency_key == "months":
|
|
2554
|
+
# Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
|
|
2555
|
+
df["date"] = pd.to_datetime(
|
|
2556
|
+
df["date"],
|
|
2557
|
+
format="%Y %b",
|
|
2558
|
+
errors="coerce",
|
|
2559
|
+
)
|
|
2560
|
+
elif frequency_key == "quarters":
|
|
2561
|
+
|
|
2562
|
+
def parse_quarter(quarter_str):
|
|
2563
|
+
try:
|
|
2564
|
+
year, qtr = quarter_str.split(" Q")
|
|
2565
|
+
month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
|
|
2566
|
+
return datetime(int(year), month, 1)
|
|
2567
|
+
except (ValueError, KeyError):
|
|
2568
|
+
return pd.NaT # Return Not a Time for parsing errors
|
|
2569
|
+
|
|
2570
|
+
df["date"] = df["date"].apply(parse_quarter)
|
|
2571
|
+
elif frequency_key == "years":
|
|
2572
|
+
df["date"] = pd.to_datetime(
|
|
2573
|
+
df["date"],
|
|
2574
|
+
format="%Y",
|
|
2575
|
+
errors="coerce",
|
|
2576
|
+
)
|
|
2577
|
+
except Exception as e:
|
|
2578
|
+
print(
|
|
2579
|
+
f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}",
|
|
2580
|
+
)
|
|
2581
|
+
continue # Skip this series if date parsing fails
|
|
2582
|
+
|
|
2583
|
+
# Coerce value to numeric, handle potential errors
|
|
2584
|
+
df["value"] = pd.to_numeric(df["value"], errors="coerce")
|
|
2585
|
+
|
|
2586
|
+
# Drop rows where date or value parsing failed
|
|
2587
|
+
df.dropna(subset=["date", "value"], inplace=True)
|
|
2588
|
+
|
|
2589
|
+
if df.empty:
|
|
2590
|
+
print(
|
|
2591
|
+
f"Warning: No valid data points after processing for CDID: {cdid}",
|
|
2592
|
+
)
|
|
2593
|
+
continue
|
|
2594
|
+
|
|
2595
|
+
df.rename(columns={"value": series_name}, inplace=True)
|
|
2596
|
+
|
|
2597
|
+
# Combine data
|
|
2598
|
+
df_subset = df.loc[:, ["date", series_name]].reset_index(
|
|
2599
|
+
drop=True,
|
|
2600
|
+
) # Explicitly select columns
|
|
2601
|
+
if combined_df.empty:
|
|
2602
|
+
combined_df = df_subset
|
|
2603
|
+
else:
|
|
2604
|
+
# Use outer merge to keep all dates, sort afterwards
|
|
2605
|
+
combined_df = pd.merge(
|
|
2606
|
+
combined_df,
|
|
2607
|
+
df_subset,
|
|
2608
|
+
on="date",
|
|
2609
|
+
how="outer",
|
|
2610
|
+
)
|
|
2611
|
+
|
|
2612
|
+
except requests.exceptions.RequestException as e:
|
|
2613
|
+
print(f"Error fetching data for CDID {cdid}: {e}")
|
|
2614
|
+
except (KeyError, ValueError, TypeError) as e: # Added TypeError
|
|
2615
|
+
print(f"Error processing data for CDID {cdid}: {e}")
|
|
2616
|
+
except Exception as e: # Catch unexpected errors
|
|
2617
|
+
print(f"An unexpected error occurred for CDID {cdid}: {e}")
|
|
2618
|
+
|
|
2619
|
+
if not combined_df.empty:
|
|
2620
|
+
# Sort by date after merging to ensure correct forward fill
|
|
2621
|
+
combined_df.sort_values(by="date", inplace=True)
|
|
2622
|
+
combined_df.reset_index(drop=True, inplace=True)
|
|
2623
|
+
|
|
2624
|
+
# Create a complete daily date range
|
|
2625
|
+
min_date = combined_df["date"].min()
|
|
2626
|
+
# Ensure max_date is timezone-naive if min_date is, or consistent otherwise
|
|
2627
|
+
max_date = pd.Timestamp(
|
|
2628
|
+
datetime.today().date(),
|
|
2629
|
+
) # Use today's date, timezone-naive
|
|
2630
|
+
|
|
2631
|
+
if pd.isna(min_date):
|
|
2632
|
+
print("Error: Minimum date is NaT, cannot create date range.")
|
|
2633
|
+
return pd.DataFrame()
|
|
2634
|
+
|
|
2635
|
+
# Make sure min_date is not NaT before creating the range
|
|
2636
|
+
date_range = pd.date_range(start=min_date, end=max_date, freq="D")
|
|
2637
|
+
daily_df = pd.DataFrame(date_range, columns=["date"])
|
|
2638
|
+
|
|
2639
|
+
# Merge with original data and forward fill
|
|
2640
|
+
daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
|
|
2641
|
+
daily_df = daily_df.ffill()
|
|
2642
|
+
|
|
2643
|
+
# Drop rows before the first valid data point after ffill
|
|
2644
|
+
first_valid_index = daily_df.dropna(
|
|
2645
|
+
subset=daily_df.columns.difference(["date"]),
|
|
2646
|
+
).index.min()
|
|
2647
|
+
if pd.notna(first_valid_index):
|
|
2648
|
+
daily_df = daily_df.loc[first_valid_index:]
|
|
2649
|
+
else:
|
|
2650
|
+
print("Warning: No valid data points found after forward filling.")
|
|
2651
|
+
return pd.DataFrame() # Return empty if ffill results in no data
|
|
2652
|
+
|
|
2653
|
+
# Aggregate to weekly frequency
|
|
2654
|
+
# Ensure 'date' column is datetime type before dt accessor
|
|
2655
|
+
daily_df["date"] = pd.to_datetime(daily_df["date"])
|
|
2656
|
+
daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta(
|
|
2657
|
+
(daily_df["date"].dt.weekday - week_start + 7) % 7,
|
|
2658
|
+
unit="D",
|
|
2659
|
+
) # Corrected logic for week start
|
|
2660
|
+
# Group by week_commencing and calculate mean for numeric columns only
|
|
2661
|
+
weekly_df = (
|
|
2662
|
+
daily_df.groupby("week_commencing")
|
|
2663
|
+
.mean(numeric_only=True)
|
|
2664
|
+
.reset_index()
|
|
2665
|
+
)
|
|
2666
|
+
|
|
2667
|
+
def clean_column_name(name):
|
|
2668
|
+
# Remove content within parentheses
|
|
2669
|
+
name = re.sub(r"\(.*?\)", "", name)
|
|
2670
|
+
|
|
2671
|
+
# Special handling for ANY CPI items (not just CPI INDEX)
|
|
2672
|
+
if "CPI" in name.upper():
|
|
2673
|
+
# Extract the description part after the colon for CPI items
|
|
2674
|
+
if ":" in name:
|
|
2675
|
+
parts = name.split(":")
|
|
2676
|
+
if len(parts) >= 2:
|
|
2677
|
+
# Take the description part (usually the second part)
|
|
2678
|
+
description = parts[1].strip()
|
|
2679
|
+
# Remove any remaining colons and everything after
|
|
2680
|
+
description = description.split(":")[0].strip()
|
|
2681
|
+
name = f"CPI {description}"
|
|
2682
|
+
|
|
2683
|
+
# Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
|
|
2684
|
+
name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
|
|
2685
|
+
|
|
2686
|
+
else:
|
|
2687
|
+
# For non-CPI items, take only the part before the first colon
|
|
2688
|
+
name = re.split(r":", name)[0]
|
|
2689
|
+
# Remove all digits for non-CPI items too
|
|
2690
|
+
name = re.sub(r"\d+", "", name)
|
|
2691
|
+
|
|
2692
|
+
# Remove year references like "2015=100"
|
|
2693
|
+
name = re.sub(r"\d{4}=\d+", "", name)
|
|
2694
|
+
|
|
2695
|
+
# Remove specific words case-insensitively
|
|
2696
|
+
name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
|
|
2697
|
+
|
|
2698
|
+
# Replace "%" with the word "percent"
|
|
2699
|
+
name = re.sub(r"%", "percent", name)
|
|
2700
|
+
|
|
2701
|
+
# Remove non-alphanumeric characters (except underscore and space)
|
|
2702
|
+
name = re.sub(r"[^\w\s]", "", name)
|
|
2703
|
+
|
|
2704
|
+
# Replace spaces with underscores
|
|
2705
|
+
name = name.strip().replace(" ", "_")
|
|
2706
|
+
|
|
2707
|
+
# Replace multiple underscores with a single one
|
|
2708
|
+
name = re.sub(r"_+", "_", name)
|
|
2709
|
+
|
|
2710
|
+
# Remove leading/trailing underscores
|
|
2711
|
+
name = name.strip("_")
|
|
2712
|
+
|
|
2713
|
+
# Truncate very long names (optional)
|
|
2714
|
+
if len(name) > 50:
|
|
2715
|
+
words = name.split("_")
|
|
2716
|
+
# Keep first few meaningful words
|
|
2717
|
+
name = "_".join(words[:4])
|
|
2718
|
+
|
|
2719
|
+
return f"macro_{name.lower()}_uk"
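# For instance, a hypothetical ONS title such as "CPI INDEX 00: ALL ITEMS 2015=100"
# comes out of this cleaner as "macro_cpi_all_items_uk"; the CDID suffix is appended
# separately further below.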
|
|
2720
|
+
|
|
2721
|
+
# Apply cleaning function to relevant columns
|
|
2722
|
+
weekly_df.columns = [
|
|
2723
|
+
clean_column_name(col) if col != "week_commencing" else col
|
|
2724
|
+
for col in weekly_df.columns
|
|
2725
|
+
]
|
|
2726
|
+
weekly_df.rename(
|
|
2727
|
+
columns={"week_commencing": "OBS"},
|
|
2728
|
+
inplace=True,
|
|
2729
|
+
) # Rename week commencing col
|
|
2730
|
+
|
|
2731
|
+
# Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
|
|
2732
|
+
# Consider if 0 is the appropriate fill value for your use case
|
|
2733
|
+
# weekly_df = weekly_df.fillna(0)
|
|
2734
|
+
|
|
2735
|
+
# Get only the data columns (excluding OBS)
|
|
2736
|
+
data_columns = [col for col in weekly_df.columns if col != "OBS"]
|
|
2737
|
+
|
|
2738
|
+
new_columns = ["OBS"]
|
|
2739
|
+
for i, col in enumerate(data_columns):
|
|
2740
|
+
if i < len(final_cdid_list):
|
|
2741
|
+
new_columns.append(f"{col}_{final_cdid_list[i]}")
|
|
2742
|
+
else:
|
|
2743
|
+
new_columns.append(col) # Keep original if no matching CDID
|
|
2744
|
+
|
|
2745
|
+
# Apply the new column names to the DataFrame
|
|
2746
|
+
weekly_df.columns = new_columns
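# Note: this positional pairing assumes every CDID in final_cdid_list yielded a
# column; if any series failed to download, the suffixes can drift out of alignment.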
|
|
2747
|
+
|
|
2748
|
+
return weekly_df
|
|
2749
|
+
print("No data successfully fetched or processed.")
|
|
2750
|
+
return pd.DataFrame()
|
|
2751
|
+
|
|
2752
|
+
def pull_yfinance(self, tickers=None, week_start_day="mon"):
|
|
2753
|
+
"""
|
|
2754
|
+
Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
|
|
2755
|
+
aggregates it to weekly averages, and renames variables.
|
|
2756
|
+
|
|
2757
|
+
Parameters
|
|
2758
|
+
----------
|
|
2759
|
+
tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
|
|
2760
|
+
week_start_day (str): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat' or 'sun'). Defaults to 'mon'.
|
|
2761
|
+
|
|
2762
|
+
Returns
|
|
2763
|
+
-------
|
|
2764
|
+
pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
|
|
2765
|
+
and aggregated stock data for the specified tickers, with NaN values filled with 0.
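
Example (illustrative sketch; assumes the class is instantiated directly):
    dp = datapull()
    fx_df = dp.pull_yfinance(tickers=["AAPL"], week_start_day="mon")
    # -> weekly 'OBS' rows with columns such as 'macro_ftse' and 'macro_gbpusdx'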
|
|
2766
|
+
|
|
2767
|
+
"""
|
|
2768
|
+
# Define default tickers
|
|
2769
|
+
default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
|
|
2770
|
+
|
|
2771
|
+
# Combine default tickers with additional ones
|
|
2772
|
+
if tickers is None:
|
|
2773
|
+
tickers = []
|
|
2774
|
+
tickers = list(set(default_tickers + tickers)) # Ensure no duplicates
|
|
2775
|
+
|
|
2776
|
+
# Automatically set end_date to today
|
|
2777
|
+
end_date = datetime.today().strftime("%Y-%m-%d")
|
|
2778
|
+
|
|
2779
|
+
# Mapping week start day to pandas weekday convention
|
|
2780
|
+
days_map = {
|
|
2781
|
+
"mon": 0,
|
|
2782
|
+
"tue": 1,
|
|
2783
|
+
"wed": 2,
|
|
2784
|
+
"thu": 3,
|
|
2785
|
+
"fri": 4,
|
|
2786
|
+
"sat": 5,
|
|
2787
|
+
"sun": 6,
|
|
2788
|
+
}
|
|
2789
|
+
if week_start_day not in days_map:
|
|
2790
|
+
raise ValueError(
|
|
2791
|
+
"Invalid week start day. Choose from: " + ", ".join(days_map.keys()),
|
|
2792
|
+
)
|
|
2793
|
+
week_start = days_map[week_start_day]
|
|
2794
|
+
|
|
2795
|
+
# Fetch data for all tickers without specifying a start date to get all available data
|
|
2796
|
+
data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
|
|
2797
|
+
|
|
2798
|
+
# Process the data
|
|
2799
|
+
combined_df = pd.DataFrame()
|
|
2800
|
+
for ticker in tickers:
|
|
2801
|
+
try:
|
|
2802
|
+
# Extract the ticker's data
|
|
2803
|
+
ticker_data = data[ticker] if len(tickers) > 1 else data
|
|
2804
|
+
ticker_data = ticker_data.reset_index()
|
|
2805
|
+
|
|
2806
|
+
# Ensure necessary columns are present
|
|
2807
|
+
if "Close" not in ticker_data.columns:
|
|
2808
|
+
raise ValueError(
|
|
2809
|
+
f"Ticker {ticker} does not have 'Close' price data.",
|
|
2810
|
+
)
|
|
2811
|
+
|
|
2812
|
+
# Keep only relevant columns
|
|
2813
|
+
ticker_data = ticker_data[["Date", "Close"]]
|
|
2814
|
+
ticker_data.rename(columns={"Close": ticker}, inplace=True)
|
|
2815
|
+
|
|
2816
|
+
# Merge data
|
|
2817
|
+
if combined_df.empty:
|
|
2818
|
+
combined_df = ticker_data
|
|
2819
|
+
else:
|
|
2820
|
+
combined_df = pd.merge(
|
|
2821
|
+
combined_df,
|
|
2822
|
+
ticker_data,
|
|
2823
|
+
on="Date",
|
|
2824
|
+
how="outer",
|
|
2825
|
+
)
|
|
2826
|
+
|
|
2827
|
+
except KeyError:
|
|
2828
|
+
print(f"Data for ticker {ticker} not available.")
|
|
2829
|
+
except Exception as e:
|
|
2830
|
+
print(f"Error processing ticker {ticker}: {e}")
|
|
2831
|
+
|
|
2832
|
+
if not combined_df.empty:
|
|
2833
|
+
# Convert to daily frequency
|
|
2834
|
+
combined_df["Date"] = pd.to_datetime(combined_df["Date"])
|
|
2835
|
+
combined_df.set_index("Date", inplace=True)
|
|
2836
|
+
|
|
2837
|
+
# Fill missing dates
|
|
2838
|
+
min_date = combined_df.index.min()
|
|
2839
|
+
max_date = combined_df.index.max()
|
|
2840
|
+
daily_index = pd.date_range(start=min_date, end=max_date, freq="D")
|
|
2841
|
+
combined_df = combined_df.reindex(daily_index)
|
|
2842
|
+
combined_df.index.name = "Date"
|
|
2843
|
+
combined_df = combined_df.ffill()
|
|
2844
|
+
|
|
2845
|
+
# Aggregate to weekly frequency
|
|
2846
|
+
combined_df["OBS"] = combined_df.index - pd.to_timedelta(
|
|
2847
|
+
(combined_df.index.weekday - week_start) % 7,
|
|
2848
|
+
unit="D",
|
|
2849
|
+
)
|
|
2850
|
+
weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
|
|
2851
|
+
|
|
2852
|
+
# Fill NaN values with 0
|
|
2853
|
+
weekly_df = weekly_df.fillna(0)
|
|
2854
|
+
|
|
2855
|
+
# Clean column names
|
|
2856
|
+
def clean_column_name(name):
|
|
2857
|
+
name = re.sub(r"[^\w\s]", "", name)
|
|
2858
|
+
return f"macro_{name.lower()}"
|
|
2859
|
+
|
|
2860
|
+
weekly_df.columns = [
|
|
2861
|
+
clean_column_name(col) if col != "OBS" else col
|
|
2862
|
+
for col in weekly_df.columns
|
|
2863
|
+
]
|
|
2864
|
+
|
|
2865
|
+
return weekly_df
|
|
2866
|
+
|
|
2867
|
+
print("No data available to process.")
|
|
2868
|
+
return pd.DataFrame()
|
|
2869
|
+
|
|
2870
|
+
def pull_sports_events(self, start_date="2020-01-01", week_commencing="mon"):
|
|
2871
|
+
"""
|
|
2872
|
+
Combines scraping logic for:
|
|
2873
|
+
- UEFA Champions League and NFL from TheSportsDB (website-scraping approach)
|
|
2874
|
+
- FIFA World Cup, UEFA Euro, Rugby World Cup, Six Nations (via TheSportsDB API)
|
|
2875
|
+
|
|
2876
|
+
Returns a single merged DataFrame with all event dummy variables.
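
Example (illustrative sketch; assumes the class is instantiated directly):
    dp = datapull()
    sports_df = dp.pull_sports_events(start_date="2020-01-01", week_commencing="mon")
    # -> weekly dummy columns such as 'seas_nfl' and 'seas_uefa_champions_league'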
|
|
2877
|
+
"""
|
|
2878
|
+
|
|
2879
|
+
############################################################
|
|
2880
|
+
# 1) Scrape UEFA Champions League & NFL fixtures from TheSportsDB season pages
|
|
2881
|
+
############################################################
|
|
2882
|
+
def scrape_sports_events(
|
|
2883
|
+
start_date=start_date,
|
|
2884
|
+
week_commencing=week_commencing,
|
|
2885
|
+
):
|
|
2886
|
+
sports = {
|
|
2887
|
+
"uefa_champions_league": {
|
|
2888
|
+
"league_id": "4480",
|
|
2889
|
+
"seasons_url": "https://www.thesportsdb.com/league/4480-UEFA-Champions-League?a=1#allseasons",
|
|
2890
|
+
"season_url_template": "https://www.thesportsdb.com/season/4480-UEFA-Champions-League/{season}&all=1&view=",
|
|
2891
|
+
"round_filters": ["quarter", "semi", "final"],
|
|
2892
|
+
},
|
|
2893
|
+
"nfl": {
|
|
2894
|
+
"league_id": "4391",
|
|
2895
|
+
"seasons_url": "https://www.thesportsdb.com/league/4391-NFL?a=1#allseasons",
|
|
2896
|
+
"season_url_template": "https://www.thesportsdb.com/season/4391-NFL/{season}&all=1&view=",
|
|
2897
|
+
"round_filters": ["quarter", "semi", "final"],
|
|
2898
|
+
},
|
|
2899
|
+
}
|
|
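            # The numeric IDs above are TheSportsDB league IDs; round_filters means
            # only rounds whose name contains "quarter", "semi" or "final" are kept,
            # so group-stage and regular-season fixtures are ignored.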

            headers = {"User-Agent": "Mozilla/5.0"}
            start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")

            # Create a full date range DataFrame
            full_date_range = pd.date_range(
                start=start_date,
                end=pd.to_datetime("today"),
            )
            time_series_df = pd.DataFrame({"date": full_date_range})
            time_series_df["seas_uefa_champions_league"] = 0
            time_series_df["seas_nfl"] = 0

            for sport, details in sports.items():
                # Get available seasons
                response = requests.get(details["seasons_url"], headers=headers)
                if response.status_code != 200:
                    continue  # Skip this sport if the request fails

                soup = BeautifulSoup(response.text, "html.parser")

                # Extract season names
                seasons = []
                for link in soup.find_all("a", href=True):
                    href = link["href"]
                    if "season" in href and sport.replace("_", "-") in href.lower():
                        season_name = href.split("/")[-1]  # e.g. "2023-2024"
                        try:
                            season_start_year = int(season_name.split("-")[0])
                            season_start_date = datetime(season_start_year, 1, 1)
                            if season_start_date >= start_date_dt:
                                seasons.append(season_name)
                        except ValueError:
                            continue

                # Scrape matches for filtered seasons
                filtered_matches = []
                for season in seasons:
                    season_url = details["season_url_template"].format(season=season)
                    season_response = requests.get(season_url, headers=headers)
                    if season_response.status_code != 200:
                        continue

                    season_soup = BeautifulSoup(season_response.text, "html.parser")
                    for row in season_soup.find_all("tr"):
                        cols = row.find_all("td")
                        if len(cols) >= 5:
                            match_date = cols[0].text.strip()
                            round_name = cols[1].text.strip().lower()
                            try:
                                match_date_dt = datetime.strptime(
                                    match_date,
                                    "%d %b %y",
                                )
                                if match_date_dt >= start_date_dt and any(
                                    r in round_name for r in details["round_filters"]
                                ):
                                    filtered_matches.append(match_date_dt)
                            except ValueError:
                                continue

                # Convert matches into time series format
                df_sport = pd.DataFrame({"date": filtered_matches})
                if df_sport.empty:
                    continue

                col_name = (
                    "seas_nfl" if sport == "nfl" else "seas_uefa_champions_league"
                )
                time_series_df.loc[
                    time_series_df["date"].isin(df_sport["date"]),
                    col_name,
                ] = 1

            # Aggregate by week commencing
            day_offsets = {
                "mon": "W-MON",
                "tue": "W-TUE",
                "wed": "W-WED",
                "thu": "W-THU",
                "fri": "W-FRI",
                "sat": "W-SAT",
                "sun": "W-SUN",
            }
            if week_commencing.lower() not in day_offsets:
                raise ValueError(
                    f"Invalid week_commencing value: {week_commencing}. Must be one of {list(day_offsets.keys())}.",
                )

            time_series_df = (
                time_series_df.set_index("date")
                .resample(day_offsets[week_commencing.lower()])
                .max()
                .reset_index()
            )
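            # Note: with a weekly rule such as "W-MON", pandas labels each bin by its
            # right edge (the day that closes the window) by default. If a strict
            # "week commencing" label is required, passing label="left", closed="left"
            # to resample() would be one way to achieve it.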

            time_series_df.rename(columns={"date": "OBS"}, inplace=True)
            time_series_df.fillna(0, inplace=True)

            return time_series_df

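        # scrape_sports_events() returns one row per week: an OBS date plus 0/1 flags
        # for seas_uefa_champions_league and seas_nfl.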
        ############################################################
        # 2) FETCH FIFA WORLD CUP, UEFA EURO, RUGBY WORLD CUP & SIX NATIONS (TheSportsDB API)
        ############################################################
        def fetch_events(start_date=start_date, week_commencing=week_commencing):
            # Initialize date range
            start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
            end_date_obj = datetime.today()
            date_range = pd.date_range(start=start_date_obj, end=end_date_obj)
            df = pd.DataFrame({"OBS": date_range}).set_index("OBS")

            # Define columns for sports
            event_columns = {
                "seas_fifa_world_cup": {
                    "league_id": 4429,
                    "start_year": 1950,
                    "interval": 4,
                },
                "seas_uefa_european_championship": {
                    "league_id": 4502,
                    "start_year": 1960,
                    "interval": 4,
                    "extra_years": [2021],
                },
                "seas_rugby_world_cup": {
                    "league_id": 4574,
                    "start_year": 1987,
                    "interval": 4,
                },
                "seas_six_nations": {
                    "league_id": 4714,
                    "start_year": 2000,
                    "interval": 1,
                },
            }
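            # Each entry flags seasons from start_year onwards at the given interval
            # (e.g. the FIFA World Cup every 4 years from 1950, the Six Nations every
            # year from 2000); extra_years catches off-cycle editions such as the
            # European Championship played in 2021.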

            # Initialize columns
            for col in event_columns:
                df[col] = 0

            def fetch_league_events(
                league_id,
                column_name,
                start_year,
                interval,
                extra_years=None,
            ):
                extra_years = extra_years or []
                # Fetch seasons
                seasons_url = f"https://www.thesportsdb.com/api/v1/json/3/search_all_seasons.php?id={league_id}"
                seasons_response = requests.get(seasons_url)
                if seasons_response.status_code != 200:
                    return  # Skip on failure

                seasons_data = seasons_response.json().get("seasons", [])
                for season in seasons_data:
                    season_name = season.get("strSeason", "")
                    if not season_name.isdigit():
                        continue

                    year = int(season_name)
                    # Check if the year is valid for this competition
                    if year in extra_years or (
                        year >= start_year and (year - start_year) % interval == 0
                    ):
                        # Fetch events
                        events_url = f"https://www.thesportsdb.com/api/v1/json/3/eventsseason.php?id={league_id}&s={season_name}"
                        events_response = requests.get(events_url)
                        if events_response.status_code != 200:
                            continue

                        events_data = events_response.json().get("events", [])
                        for event in events_data:
                            event_date_str = event.get("dateEvent")
                            if event_date_str:
                                event_date = datetime.strptime(
                                    event_date_str,
                                    "%Y-%m-%d",
                                )
                                if event_date in df.index:
                                    df.loc[event_date, column_name] = 1

            # Fetch events for all defined leagues
            for column_name, params in event_columns.items():
                fetch_league_events(
                    league_id=params["league_id"],
                    column_name=column_name,
                    start_year=params["start_year"],
                    interval=params["interval"],
                    extra_years=params.get("extra_years", []),
                )

            # Resample by week
            day_offsets = {
                "mon": "W-MON",
                "tue": "W-TUE",
                "wed": "W-WED",
                "thu": "W-THU",
                "fri": "W-FRI",
                "sat": "W-SAT",
                "sun": "W-SUN",
            }

            if week_commencing.lower() not in day_offsets:
                raise ValueError(
                    f"Invalid week_commencing value: {week_commencing}. "
                    f"Must be one of {list(day_offsets.keys())}.",
                )

            df = df.resample(day_offsets[week_commencing.lower()]).max()
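            # The same right-edge week-labelling caveat noted in scrape_sports_events
            # applies to this resample call as well.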
            df = df.reset_index()
            return df

        ###################################################
        # 3) CALL BOTH, THEN MERGE ON "OBS" & FILL WITH 0s
        ###################################################
        df_uefa_nfl = scrape_sports_events(start_date, week_commencing)
        df_other_events = fetch_events(start_date, week_commencing)

        # Merge on "OBS" column (outer join to preserve all dates in range)
        final_df = pd.merge(df_uefa_nfl, df_other_events, on="OBS", how="outer")

        # Fill any NaNs with 0 for the event columns (everything except the "OBS" date column)
        for col in final_df.columns:
            if col != "OBS":
                final_df[col] = final_df[col].fillna(0)

        # Sort by week-commencing date
        final_df.sort_values(by="OBS", inplace=True)
        final_df.reset_index(drop=True, inplace=True)

        return final_df
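
A minimal usage sketch for the new method (not taken from the package itself; the import path and the example dates are assumptions for illustration, and a network connection to TheSportsDB is required):

    from imsciences.pull import datapull  # assumed import location of the class above

    dp = datapull()
    sports_df = dp.pull_sports_events(start_date="2021-01-01", week_commencing="mon")
    # sports_df holds one row per week: an "OBS" date plus 0/1 seas_* dummy columns,
    # ready to be merged onto other weekly series on "OBS".
    print(sports_df.head())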