imsciences 0.5.4.7__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/pull.py ADDED
@@ -0,0 +1,1483 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import re
4
+ from fredapi import Fred
5
+ import time
6
+ from datetime import datetime, timedelta
7
+ from io import StringIO
8
+ import requests
9
+ import xml.etree.ElementTree as ET
10
+ from bs4 import BeautifulSoup
11
+ import yfinance as yf
12
+ import holidays
13
+ from dateutil.easter import easter
14
+
15
+ from imsciences.mmm import dataprocessing
16
+
17
+ ims_proc = dataprocessing()
18
+
19
+ class datapull:
20
+
21
+ def help(self):
22
+ print("This is the help section. The functions in the package are as follows:")
23
+
24
+ print("\n1. pull_fred_data")
25
+ print(" - Description: Get data from FRED by using series id tokens.")
26
+ print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
27
+ print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
28
+
29
+ print("\n2. pull_boe_data")
30
+ print(" - Description: Fetch and process Bank of England interest rate data.")
31
+ print(" - Usage: pull_boe_data(week_commencing)")
32
+ print(" - Example: pull_boe_data('mon')")
33
+
34
+ print("\n3. pull_oecd")
35
+ print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
36
+ print("   - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')")
37
+ print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
38
+
39
+ print("\n4. get_google_mobility_data")
40
+ print(" - Description: Fetch Google Mobility data for the specified country.")
41
+ print(" - Usage: get_google_mobility_data(country, wc)")
42
+ print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
43
+
44
+ print("\n5. pull_seasonality")
45
+ print("   - Description: Generate combined dummy variables for weekly, monthly, and yearly seasonality, public holidays, key retail events, plus constant and trend terms.")
46
+ print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
47
+ print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
48
+
49
+ print("\n6. pull_weather")
50
+ print(" - Description: Fetch and process historical weather data for the specified country.")
51
+ print(" - Usage: pull_weather(week_commencing, country)")
52
+ print(" - Example: pull_weather('mon', 'GBR')")
53
+
54
+ print("\n7. pull_macro_ons_uk")
55
+ print(" - Description: Fetch and process time series data from the Beta ONS API.")
56
+ print("   - Usage: pull_macro_ons_uk(cdid_list, week_start_day, sector)")
57
+ print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
58
+
59
+ print("\n8. pull_yfinance")
60
+ print("   - Description: Fetch and process time series data from Yahoo Finance.")
61
+ print(" - Usage: pull_yfinance(tickers, week_start_day)")
62
+ print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
63
+
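Taken together, the help text above describes the pull-style API exposed by the datapull class. A minimal usage sketch (assuming the package is installed as imsciences and the underlying data sources are reachable over the network):

from imsciences.pull import datapull

puller = datapull()
puller.help()  # prints the summary above

# e.g. weekly FRED series, week commencing Monday
fred_weekly = puller.pull_fred_data("mon", ["GPDIC1"])
print(fred_weekly.head())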
64
+ ############################################################### MACRO ##########################################################################
65
+
66
+ def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
67
+ '''
68
+ Parameters
69
+ ----------
70
+ week_commencing : str
71
+ specify the day for the week commencing; the default is 'mon' (options: 'mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun')
72
+
73
+ series_id_list : list[str]
74
+ provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
75
+ ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
76
+
77
+ Returns
78
+ ----------
79
+ pd.DataFrame
80
+ Return a data frame with FRED data according to the series IDs provided
81
+ '''
82
+ # Fred API
83
+ fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
84
+
85
+ # Fetch the metadata for each series to get the full names
86
+ series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}
87
+
88
+ # Download data from series id list
89
+ fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}
90
+
91
+ # Data processing
92
+ date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
93
+ fred_series_df = pd.DataFrame(date_range)
94
+
95
+ for series_id, series_data in fred_series.items():
96
+ series_data = series_data.reset_index()
97
+ series_data.columns = ['OBS', series_names[series_id]] # Use the series name as the column header
98
+ fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')
99
+
100
+ # Handle duplicate columns
101
+ for col in fred_series_df.columns:
102
+ if '_x' in col:
103
+ base_col = col.replace('_x', '')
104
+ fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
105
+ fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)
106
+
107
+ # Ensure sum_columns are present in the DataFrame
108
+ sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]
109
+
110
+ # Aggregate results by week
111
+ fred_df_final = ims_proc.aggregate_daily_to_wc_wide(df=fred_series_df,
112
+ date_column="OBS",
113
+ group_columns=[],
114
+ sum_columns=sum_columns,
115
+ wc=week_commencing,
116
+ aggregation="average")
117
+
118
+ # Remove anything after the instance of any ':' in the column names and rename, except for 'OBS'
119
+ fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]
120
+
121
+ return fred_df_final
122
+
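The key step above is pd.merge_asof(..., direction='backward'), which carries each sparsely published FRED observation forward onto a daily calendar before the weekly aggregation. A small self-contained illustration of that pattern with toy data (not FRED output):

import pandas as pd

daily = pd.DataFrame({"OBS": pd.date_range("2024-01-01", "2024-01-10", freq="D")})
series = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-01", "2024-01-07"]), "gdp": [100.0, 105.0]})

filled = pd.merge_asof(daily, series, on="OBS", direction="backward")
print(filled.tail(3))  # 8-10 January all carry the value published on the 7th (105.0)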
123
+ def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
124
+ """
125
+ Fetch and process Bank of England interest rate data.
126
+
127
+ Args:
128
+ week_commencing (str): The starting day of the week for aggregation.
129
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
130
+ Default is "mon".
131
+ max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
132
+ delay (int): Delay in seconds between retry attempts. Default is 5.
133
+
134
+ Returns:
135
+ pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
136
+ The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
137
+ and 'macro_boe_intr_rate' contains the average interest rate for the week.
138
+ """
139
+ # Week commencing dictionary
140
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
141
+
142
+ # URL of the Bank of England data page
143
+ url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
144
+
145
+ # Retry logic for HTTP request
146
+ for attempt in range(max_retries):
147
+ try:
148
+ # Set up headers to mimic a browser request
149
+ headers = {
150
+ "User-Agent": (
151
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
152
+ "Chrome/91.0.4472.124 Safari/537.36"
153
+ )
154
+ }
155
+ response = requests.get(url, headers=headers)
156
+ response.raise_for_status() # Raise an exception for HTTP errors
157
+ break
158
+ except requests.exceptions.RequestException as e:
159
+ print(f"Attempt {attempt + 1} failed: {e}")
160
+ if attempt < max_retries - 1:
161
+ time.sleep(delay)
162
+ else:
163
+ raise
164
+
165
+ # Parse the HTML page
166
+ soup = BeautifulSoup(response.content, "html.parser")
167
+
168
+ # Find the table on the page
169
+ table = soup.find("table") # Locate the first table
170
+ table_html = str(table) # Convert table to string
171
+ df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
172
+
173
+ # Rename and clean up columns
174
+ df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
175
+ df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
176
+ df.sort_values("OBS", inplace=True)
177
+
178
+ # Create a daily date range
179
+ date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
180
+ df_daily = pd.DataFrame(date_range, columns=["OBS"])
181
+
182
+ # Adjust each date to the specified week commencing day
183
+ df_daily["Week_Commencing"] = df_daily["OBS"].apply(
184
+ lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
185
+ )
186
+
187
+ # Merge and forward-fill missing rates
188
+ df_daily = df_daily.merge(df, on="OBS", how="left")
189
+ df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
190
+
191
+ # Group by week commencing and calculate the average rate
192
+ df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
193
+ df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
194
+ df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
195
+
196
+ return df_final
197
+
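The week-commencing logic above hinges on the expression x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7), which snaps any date back to the most recent occurrence of the chosen weekday. A quick self-contained check of that arithmetic:

from datetime import datetime, timedelta

day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
d = datetime(2024, 3, 14)  # a Thursday
week_start = d - timedelta(days=(d.weekday() - day_dict["mon"]) % 7)
print(week_start.date())  # 2024-03-11, the Monday that starts that week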
198
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
199
+ """
200
+ Fetch and process time series data from the OECD API.
201
+
202
+ Args:
203
+ country (str): A 3-letter code for the country of interest (e.g., "GBR", "FRA", "USA", "DEU")
204
+ week_commencing (str): The starting day of the week for aggregation.
205
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
206
+ start_date (str): Dataset start date in the format "YYYY-MM-DD"
207
+
208
+ Returns:
209
+ pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
210
+ commencing dates, and other columns contain the aggregated time series values.
211
+ """
212
+
213
+ def parse_quarter(date_str):
214
+ """Parses a string in 'YYYY-Q#' format into a datetime object."""
215
+ year, quarter = date_str.split('-')
216
+ quarter_number = int(quarter[1])
217
+ month = (quarter_number - 1) * 3 + 1
218
+ return pd.Timestamp(f"{year}-{month:02d}-01")
219
+
220
+ # Generate a daily date range from start_date to today
221
+ date_range = pd.date_range(start=start_date, end=datetime.today(), freq='D')
222
+
223
+ url_details = [
224
+ ["BCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_business_confidence_index"],
225
+ ["CCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_consumer_confidence_index"],
226
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA._T.N.GY", "macro_cpi_total"],
227
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP041T043.N.GY", "macro_cpi_housing"],
228
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP01.N.GY", "macro_cpi_food"],
229
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP045_0722.N.GY", "macro_cpi_energy"],
230
+ ["UNE_LF_M", "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,", "._Z.Y._T.Y_GE15.", "macro_unemployment_rate"],
231
+ ["EAR", "SDD.TPS,DSD_EAR@DF_HOU_EAR,", ".Y..S1D", "macro_private_hourly_earnings"],
232
+ ["RHP", "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0", "", "macro_real_house_prices"],
233
+ ["PRVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX.C..", "macro_manufacturing_production_volume"],
234
+ ["TOVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX...", "macro_retail_trade_volume"],
235
+ ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
236
+ ["IRLT", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_long_term_interest_rate"],
237
+ ["B1GQ", "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1", "._Z....GY.T0102", "macro_gdp_growth_yoy"]
238
+ ]
239
+
240
+ # Create empty final dataframe
241
+ oecd_df_final = pd.DataFrame()
242
+
243
+ daily_df = pd.DataFrame({'OBS': date_range})
244
+ value_columns = []
245
+
246
+ # Iterate for each variable of interest
247
+ for series_details in url_details:
248
+ series = series_details[0]
249
+ dataset_id = series_details[1]
250
+ filter = series_details[2]
251
+ col_name = series_details[3]
252
+
253
+ # check if request was successful and determine the most granular data available
254
+ for freq in ['M', 'Q', 'A']:
255
+
256
+ if series in ["UNE_LF_M", "EAR"]:
257
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
258
+ elif series in ["B1GQ"]:
259
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
260
+ else:
261
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
262
+
263
+ # Make the request to the OECD API for data
264
+ data_response = requests.get(data_url)
265
+
266
+ # Check if the request was successful
267
+ if data_response.status_code != 200:
268
+ print(f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}")
269
+ url_test = False
270
+ continue
271
+ else:
272
+ url_test = True
273
+ break
274
+
275
+ # get data for the next variable if url doesn't exist
276
+ if url_test is False:
277
+ continue
278
+
279
+ root = ET.fromstring(data_response.content)
280
+
281
+ # Define namespaces if necessary (the namespace is included in the tags)
282
+ namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}
283
+
284
+ # Lists to store the data
285
+ dates = []
286
+ values = []
287
+
288
+ # Iterate over all <Obs> elements and extract date and value
289
+ for obs in root.findall('.//generic:Obs', namespaces):
290
+
291
+ # Extracting the time period (date)
292
+ time_period = obs.find('.//generic:ObsDimension', namespaces).get('value')
293
+
294
+ # Extracting the observation value
295
+ value = obs.find('.//generic:ObsValue', namespaces).get('value')
296
+
297
+ # Storing the data
298
+ if time_period and value:
299
+ dates.append(time_period)
300
+ values.append(float(value)) # Convert value to float
301
+
302
+ # Add variable names that were found to a list
303
+ value_columns.append(col_name)
304
+
305
+ # Creating a DataFrame
306
+ data = pd.DataFrame({'OBS': dates, col_name: values})
307
+
308
+ # Convert date strings into datetime format
309
+ if freq == 'Q':
310
+ data['OBS'] = data['OBS'].apply(parse_quarter)
311
+ else:
312
+ # Display the DataFrame
313
+ data['OBS'] = data['OBS'].apply(lambda x: datetime.strptime(x, '%Y-%m'))
314
+
315
+ # Sort data by chronological order
316
+ data.sort_values(by='OBS', inplace=True)
317
+
318
+ # Merge the data based on the observation date
319
+ daily_df = pd.merge_asof(daily_df, data[['OBS', col_name]], on='OBS', direction='backward')
320
+
321
+
322
+ # Ensure columns are numeric
323
+ for col in value_columns:
324
+ if col in daily_df.columns:
325
+ daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
326
+ else:
327
+ print(f"Column {col} not found in daily_df")
328
+
329
+ # Aggregate results by week
330
+ country_df = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
331
+ date_column="OBS",
332
+ group_columns=[],
333
+ sum_columns=value_columns,
334
+ wc=week_commencing,
335
+ aggregation="average")
336
+
337
+ oecd_df_final = pd.concat([oecd_df_final, country_df], axis=0, ignore_index=True)
338
+
339
+ return oecd_df_final
340
+
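For each OECD series the loop above tries monthly, then quarterly, then annual frequencies and keeps the first endpoint that responds, so the weekly output mixes series of different native frequencies that have all been forward-filled to daily first. A usage sketch (assumes the public OECD SDMX endpoints are reachable):

from imsciences.pull import datapull

puller = datapull()
uk_oecd = puller.pull_oecd(country="GBR", week_commencing="mon", start_date="2015-01-01")
print([c for c in uk_oecd.columns if c.startswith("macro_")])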
341
+ def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
342
+ """
343
+ Fetch Google Mobility data for the specified country.
344
+
345
+ Parameters:
346
+ - country (str): The name of the country for which to fetch data.
+ - wc (str): The week-commencing day used for aggregation (e.g., 'mon').
347
+
348
+ Returns:
349
+ - pd.DataFrame: A DataFrame containing the Google Mobility data.
350
+ """
351
+ # URL of the Google Mobility Reports CSV file
352
+ url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
353
+
354
+ # Fetch the CSV file
355
+ response = requests.get(url)
356
+ if response.status_code != 200:
357
+ raise Exception(f"Failed to fetch data: {response.status_code}")
358
+
359
+ # Load the CSV file into a pandas DataFrame
360
+ csv_data = StringIO(response.text)
361
+ df = pd.read_csv(csv_data, low_memory=False)
362
+
363
+ # Filter the DataFrame for the specified country
364
+ country_df = df[df['country_region'] == country]
365
+
366
+ final_covid = ims_proc.aggregate_daily_to_wc_wide(country_df, "date", [], ['retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
367
+ 'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
368
+ 'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline'], wc, "average")
369
+
370
+ final_covid1 = ims_proc.rename_cols(final_covid, 'covid_')
371
+ return final_covid1
372
+
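A usage sketch for the mobility pull; note that Google stopped updating the Community Mobility Reports in late 2022, so the CSV behind this call is effectively a fixed historical archive, and the returned metric columns are prefixed 'covid_' by rename_cols:

from imsciences.pull import datapull

puller = datapull()
uk_mobility = puller.get_google_mobility_data(country="United Kingdom", wc="mon")
print(uk_mobility.head())  # weekly rows keyed on 'OBS', six mobility metrics prefixed 'covid_'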
373
+ ############################################################### Seasonality ##########################################################################
374
+
375
+ def pull_seasonality(self, week_commencing, start_date, countries):
376
+ # ---------------------------------------------------------------------
377
+ # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
378
+ # ---------------------------------------------------------------------
379
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
380
+
381
+ # ---------------------------------------------------------------------
382
+ # 1. Create daily date range from start_date to today
383
+ # ---------------------------------------------------------------------
384
+ date_range = pd.date_range(
385
+ start=pd.to_datetime(start_date),
386
+ end=datetime.today(),
387
+ freq="D"
388
+ )
389
+ df_daily = pd.DataFrame(date_range, columns=["Date"])
390
+
391
+ # ---------------------------------------------------------------------
392
+ # 1.1 Identify "week_start" for each daily row, based on week_commencing
393
+ # ---------------------------------------------------------------------
394
+ df_daily['week_start'] = df_daily["Date"].apply(
395
+ lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
396
+ )
397
+
398
+ # ---------------------------------------------------------------------
399
+ # 2. Build a weekly index (df_weekly_start) with dummy columns
400
+ # ---------------------------------------------------------------------
401
+ df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
402
+ df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
403
+
404
+ # Set index to weekly "start of week"
405
+ df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
406
+ df_weekly_start.set_index("Date", inplace=True)
407
+
408
+ # Create individual weekly dummies
409
+ dummy_columns = {}
410
+ for i in range(len(df_weekly_start)):
411
+ col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
412
+ dummy_columns[col_name] = [0] * len(df_weekly_start)
413
+ dummy_columns[col_name][i] = 1
414
+
415
+ df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
416
+ df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
417
+
418
+ # ---------------------------------------------------------------------
419
+ # 3. Public holidays (daily) from 'holidays' package + each holiday name
420
+ # ---------------------------------------------------------------------
421
+ for country in countries:
422
+ country_holidays = holidays.CountryHoliday(
423
+ country,
424
+ years=range(int(start_date[:4]), datetime.today().year + 1)
425
+ )
426
+ # Daily indicator: 1 if that date is a holiday
427
+ df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
428
+ lambda x: 1 if x in country_holidays else 0
429
+ )
430
+ # Create columns for specific holiday names
431
+ for date_hol, name in country_holidays.items():
432
+ col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
433
+ if col_name not in df_daily.columns:
434
+ df_daily[col_name] = 0
435
+ df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
436
+
437
+ # ---------------------------------------------------------------------
438
+ # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
439
+ # We'll add daily columns for each.
440
+ # ---------------------------------------------------------------------
441
+ # Initialize columns
442
+ extra_cols = [
443
+ "seas_valentines_day",
444
+ "seas_halloween",
445
+ "seas_fathers_day_us_uk",
446
+ "seas_mothers_day_us",
447
+ "seas_mothers_day_uk",
448
+ "seas_good_friday",
449
+ "seas_easter_monday",
450
+ "seas_black_friday",
451
+ "seas_cyber_monday",
452
+ ]
453
+ for c in extra_cols:
454
+ df_daily[c] = 0 # default zero
455
+
456
+ # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
457
+ # weekday: Monday=0, Tuesday=1, ... Sunday=6
458
+ def nth_weekday_of_month(year, month, weekday, nth):
459
+ """
460
+ Returns date of the nth <weekday> in <month> of <year>.
461
+ E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
462
+ """
463
+ # 1st day of the month
464
+ d = datetime(year, month, 1)
465
+ # What is the weekday of day #1?
466
+ w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
467
+ # If we want, e.g. Sunday=6, we see how many days to add
468
+ delta = (weekday - w) % 7
469
+ # This is the first <weekday> in that month
470
+ first_weekday = d + timedelta(days=delta)
471
+ # Now add 7*(nth-1) days
472
+ return first_weekday + timedelta(days=7 * (nth-1))
473
+
474
+ def get_good_friday(year):
475
+ """Good Friday is 2 days before Easter Sunday."""
476
+ return easter(year) - timedelta(days=2)
477
+
478
+ def get_easter_monday(year):
479
+ """Easter Monday is 1 day after Easter Sunday."""
480
+ return easter(year) + timedelta(days=1)
481
+
482
+ def get_black_friday(year):
483
+ """
484
+ Black Friday = day after US Thanksgiving,
485
+ and US Thanksgiving is the 4th Thursday in November.
486
+ """
487
+ # 4th Thursday in November
488
+ fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
489
+ return fourth_thursday + timedelta(days=1)
490
+
491
+ def get_cyber_monday(year):
492
+ """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
493
+ # 4th Thursday in November
494
+ fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
495
+ return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
496
+
497
+ # Loop over each year in range
498
+ start_yr = int(start_date[:4])
499
+ end_yr = datetime.today().year
500
+
501
+ for yr in range(start_yr, end_yr + 1):
502
+ # Valentines = Feb 14
503
+ valentines_day = datetime(yr, 2, 14)
504
+ # Halloween = Oct 31
505
+ halloween_day = datetime(yr, 10, 31)
506
+ # Father's Day (US & UK) = 3rd Sunday in June
507
+ fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
508
+ # Mother's Day US = 2nd Sunday in May
509
+ mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
510
+ # Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday"
511
+ # We can approximate as: Easter Sunday - 21 days
512
+ # BUT we also must ensure it's actually Sunday
513
+ # (the 4th Sunday in Lent can shift. We'll do the official approach below.)
514
+ # Another approach: Easter Sunday - 7 * (4 weeks) is the 4th Sunday prior to Easter.
515
+ # But that might overshoot if Lent started mid-week.
516
+ # Let's do a quick approach:
517
+ # Officially: Mothering Sunday is the 4th Sunday of Lent, which falls 21 days (3 weeks) before Easter Sunday.
518
+ # So Easter - 21 days should be the Sunday, but let's confirm with weekday check.
519
+ mothering_sunday = easter(yr) - timedelta(days=21)
520
+ # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
521
+ while mothering_sunday.weekday() != 6: # Sunday=6
522
+ mothering_sunday -= timedelta(days=1)
523
+
524
+ # Good Friday, Easter Monday
525
+ gf = get_good_friday(yr)
526
+ em = get_easter_monday(yr)
527
+
528
+ # Black Friday, Cyber Monday
529
+ bf = get_black_friday(yr)
530
+ cm = get_cyber_monday(yr)
531
+
532
+ # Mark them in df_daily if in range
533
+ for special_date, col in [
534
+ (valentines_day, "seas_valentines_day"),
535
+ (halloween_day, "seas_halloween"),
536
+ (fathers_day, "seas_fathers_day_us_uk"),
537
+ (mothers_day_us, "seas_mothers_day_us"),
538
+ (mothering_sunday, "seas_mothers_day_uk"),
539
+ (gf, "seas_good_friday"),
540
+ (em, "seas_easter_monday"),
541
+ (bf, "seas_black_friday"),
542
+ (cm, "seas_cyber_monday"),
543
+ ]:
544
+ # Convert to pd.Timestamp:
545
+ special_ts = pd.Timestamp(special_date)
546
+
547
+ # Only set if it's within your daily range
548
+ if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
549
+ df_daily.loc[df_daily["Date"] == special_ts, col] = 1
550
+
551
+ # ---------------------------------------------------------------------
552
+ # 4. Add daily indicators for last day & last Friday of month
553
+ # Then aggregate them to weekly level using .max()
554
+ # ---------------------------------------------------------------------
555
+ # Last day of month (daily)
556
+ df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
557
+ lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
558
+ )
559
+
560
+ # Last Friday of month (daily)
561
+ def is_last_friday(date):
562
+ # last day of the month
563
+ last_day_of_month = date.to_period("M").to_timestamp("M")
564
+ last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
565
+ # Determine how many days we go back from the last day to get Friday (weekday=4)
566
+ if last_day_weekday >= 4:
567
+ days_to_subtract = last_day_weekday - 4
568
+ else:
569
+ days_to_subtract = last_day_weekday + 3
570
+ last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
571
+ return 1 if date == last_friday else 0
572
+
573
+ df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
574
+
575
+ # ---------------------------------------------------------------------
576
+ # 5. Weekly aggregation for holiday columns & monthly dummies
577
+ # ---------------------------------------------------------------------
578
+ # For monthly dummies, create a daily col "Month", then get_dummies
579
+ df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
580
+ df_monthly_dummies = pd.get_dummies(
581
+ df_daily,
582
+ prefix="seas",
583
+ columns=["Month"],
584
+ dtype=int
585
+ )
586
+ # Recalculate 'week_start' (already in df_daily, but just to be sure)
587
+ df_monthly_dummies['week_start'] = df_daily['week_start']
588
+
589
+ # Group monthly dummies by .sum() or .mean()—we often spread them across the week
590
+ df_monthly_dummies = (
591
+ df_monthly_dummies
592
+ .groupby('week_start')
593
+ .sum(numeric_only=True) # sum the daily flags
594
+ .reset_index()
595
+ .rename(columns={'week_start': "Date"})
596
+ .set_index("Date")
597
+ )
598
+ # Spread monthly dummies by 7 to distribute across that week
599
+ monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
600
+ df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
601
+
602
+ # Group holiday & special-day columns by .max() => binary at weekly level
603
+ df_holidays = (
604
+ df_daily
605
+ .groupby('week_start')
606
+ .max(numeric_only=True) # if any day=1 in that week, entire week=1
607
+ .reset_index()
608
+ .rename(columns={'week_start': "Date"})
609
+ .set_index("Date")
610
+ )
611
+
612
+ # ---------------------------------------------------------------------
613
+ # 6. Combine weekly start, monthly dummies, holiday flags
614
+ # ---------------------------------------------------------------------
615
+ df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
616
+ df_combined = pd.concat([df_combined, df_holidays], axis=1)
617
+ df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
618
+
619
+ # ---------------------------------------------------------------------
620
+ # 7. Create weekly dummies for Week of Year & yearly dummies
621
+ # ---------------------------------------------------------------------
622
+ df_combined.reset_index(inplace=True)
623
+ df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
624
+
625
+ df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
626
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)
627
+
628
+ df_combined["Year"] = df_combined["Date"].dt.year
629
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
630
+
631
+ # ---------------------------------------------------------------------
632
+ # 8. Add constant & trend
633
+ # ---------------------------------------------------------------------
634
+ df_combined["Constant"] = 1
635
+ df_combined["Trend"] = df_combined.index + 1
636
+
637
+ # ---------------------------------------------------------------------
638
+ # 9. Rename Date -> OBS and return
639
+ # ---------------------------------------------------------------------
640
+ df_combined.rename(columns={"Date": "OBS"}, inplace=True)
641
+
642
+ return df_combined
643
+
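The special-day helpers above are plain calendar arithmetic. A self-contained restatement of the nth-weekday logic, checked against 2023, shows which dates the dummies land on:

from datetime import datetime, timedelta

def nth_weekday_of_month(year, month, weekday, nth):
    first_of_month = datetime(year, month, 1)
    first_match = first_of_month + timedelta(days=(weekday - first_of_month.weekday()) % 7)
    return first_match + timedelta(days=7 * (nth - 1))

print(nth_weekday_of_month(2023, 6, 6, 3).date())   # 2023-06-18, Father's Day (3rd Sunday of June)
print((nth_weekday_of_month(2023, 11, 3, 4) + timedelta(days=1)).date())  # 2023-11-24, Black Friday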
644
+
645
+ def pull_weather(self, week_commencing, country) -> pd.DataFrame:
646
+ import pandas as pd
647
+ import urllib.request # noqa: F811
648
+ from datetime import datetime
649
+ import requests
650
+ from geopy.geocoders import Nominatim # noqa: F811
651
+
652
+ # Week commencing dictionary
653
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
654
+
655
+ # Country dictionary
656
+ country_dict = {"AUS": "AU__ASOS", "GBR": "GB__ASOS", "USA": "USCRN", "DEU": "DE__ASOS", "CAN": "Canada", "ZAF": "ZA__ASOS"}
657
+
658
+ # Function to flatten a list of nested lists into a list
659
+ def flatten_list(nested_list):
660
+ return [item for sublist in nested_list for item in sublist]
661
+
662
+ # Choose country
663
+ country = country_dict[country]
664
+
665
+ # Choose start and end dates
666
+ start_day = 1
667
+ start_month = 1
668
+ start_year = 2014
669
+ formatted_date = datetime(start_year, start_month, start_day).strftime("%Y-%m-%d")
670
+ today = datetime.now()
671
+ end_day = today.day
672
+ end_month = today.month
673
+ end_year = today.year
674
+
675
+ if country == "GB__ASOS":
676
+ stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
677
+ "&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
678
+ "&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
679
+ "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
680
+ "&stations=EGNT"]
681
+ elif country == "AU__ASOS":
682
+ stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
683
+ "&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
684
+ "&stations=YPAD", "&stations=YPPH"]
685
+ elif country == "USCRN":
686
+ stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797", # North
687
+ "&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301", # Midwest
688
+ "&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862", # South
689
+ "&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"] # West
690
+ elif country == "DE__ASOS":
691
+ stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
692
+ "&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
693
+ "&stations=EDLW", "&stations=EDDM"]
694
+ elif country == "FR__ASOS":
695
+ stations = ["&stations=LFPB"]
696
+ elif country == "Canada":
697
+ institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
698
+ "CA_NU_ASOS"]
699
+ stations_list = [[] for _ in range(5)]
700
+ stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
701
+ "&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])
702
+
703
+ stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
704
+ "&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
705
+ "&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
706
+ "&stations=CYWK", "&stations=CYWK"])
707
+
708
+ stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
709
+ "&stations=CWND", "&stations=CXTV", "&stations=CYVL",
710
+ "&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
711
+ "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
712
+ "&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])
713
+
714
+ stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
715
+ "&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
716
+ "&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
717
+ "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])
718
+
719
+ stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
720
+ "&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
721
+ "&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
722
+ "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])
723
+
724
+ elif country == "ZA__ASOS":
725
+ cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
726
+ stations = []
727
+
728
+ for city in cities:
729
+ geolocator = Nominatim(user_agent="MyApp")
730
+ location = geolocator.geocode(city)
731
+ stations.append(f"&latitude={location.latitude}&longitude={location.longitude}")
732
+
733
+ # Temperature
734
+ if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
735
+ # We start by making a data frame of the following weather stations
736
+ station_query = ''.join(stations)
737
+
738
+ raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
739
+ station_query,
740
+ "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
741
+ "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
742
+ raw_weather = urllib.request.urlopen(raw_weather_list)
743
+ raw_weather = pd.read_csv(raw_weather)
744
+
745
+ # Replace the occurrences of "None" with 0
746
+ raw_weather["max_temp_f"].replace("None", 0, inplace=True)
747
+ raw_weather["min_temp_f"].replace("None", 0, inplace=True)
748
+
749
+ # Remove any data that isn't temperature-related
750
+ weather = raw_weather.iloc[:, 0:4]
751
+
752
+ weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)
753
+
754
+ # Estimate mean temperature
755
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
756
+
757
+ # Convert Fahrenheit to Celsius for max_temp_f
758
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
759
+
760
+ # Convert Fahrenheit to Celsius for min_temp_f
761
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
762
+
763
+ # Convert Fahrenheit to Celsius for mean_temp_f
764
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
765
+
766
+ # Aggregate the data to the chosen week-commencing day, taking the average of the data
767
+ # Convert the date column to a Date type
768
+ weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
769
+
770
+ # Determine the starting chosen day for each date
771
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
772
+
773
+ # Group by week_starting and summarize
774
+ numeric_columns = weather.select_dtypes(include='number').columns
775
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
776
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
777
+ "min_temp_f": "avg_min_temp_f",
778
+ "mean_temp_f": "avg_mean_temp_f",
779
+ "max_temp_c": "avg_max_temp_c",
780
+ "min_temp_c": "avg_min_temp_c",
781
+ "mean_temp_c": "avg_mean_temp_c"}, inplace=True)
782
+ elif country == "Canada":
783
+ for i in range(len(institute_vector)):
784
+ station_query_temp = ''.join(flatten_list(stations_list[i]))
785
+ institute_temp = institute_vector[i]
786
+ raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
787
+ station_query_temp,
788
+ "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
789
+ "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
790
+ raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
791
+ raw_weather_temp = pd.read_csv(raw_weather_temp)
792
+
793
+ if len(raw_weather_temp.index) == 0:
794
+ continue
795
+ raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]
796
+
797
+ if i == 0:  # first network initialises raw_weather
798
+ raw_weather = raw_weather_temp
799
+ else:
800
+ raw_weather = pd.concat([raw_weather, raw_weather_temp])
801
+
802
+ # Drop error column if it exists
803
+ if 'ERROR: Invalid network specified' in list(raw_weather.columns):
804
+ raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)
805
+
806
+ # Replace none values
807
+ raw_weather["max_temp_f"].replace("None", 0, inplace=True)
808
+ raw_weather["min_temp_f"].replace("None", 0, inplace=True)
809
+ raw_weather["precip_in"].replace("None", 0, inplace=True)
810
+
811
+ weather = raw_weather
812
+ weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
813
+
814
+ # Estimate mean temperature
815
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
816
+
817
+ # Convert Fahrenheit to Celsius for max_temp_f
818
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
819
+
820
+ # Convert Fahrenheit to Celsius for min_temp_f
821
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
822
+
823
+ # Convert Fahrenheit to Celsius for mean_temp_f
824
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
825
+
826
+ # Aggregate the data to the chosen week-commencing day, taking the average of the data
827
+ # Convert the date column to a Date type
828
+ weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
829
+
830
+ # Determine the starting chosen day for each date
831
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
832
+
833
+ # Group by week_starting and summarize
834
+ numeric_columns = weather.select_dtypes(include='number').columns
835
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
836
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
837
+ "min_temp_f": "avg_min_temp_f",
838
+ "mean_temp_f": "avg_mean_temp_f",
839
+ "max_temp_c": "avg_max_temp_c",
840
+ "min_temp_c": "avg_min_temp_c",
841
+ "mean_temp_c": "avg_mean_temp_c",
842
+ "precip_in": "avg_mean_perc"}, inplace=True)
843
+ elif country == "ZA__ASOS":
844
+ weather_data_list = []
845
+
846
+ for city in cities:
847
+ geolocator = Nominatim(user_agent="MyApp")
848
+ location = geolocator.geocode(city)
849
+ url = "https://archive-api.open-meteo.com/v1/archive"
850
+
851
+ params = {
852
+ "latitude": location.latitude,
853
+ "longitude": location.longitude,
854
+ "start_date": formatted_date,
855
+ "end_date": today.strftime("%Y-%m-%d"),
856
+ "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
857
+ "timezone": "auto"
858
+ }
859
+
860
+ response = requests.get(url, params=params)
861
+ response_data = response.json()
862
+
863
+ daily_data = response_data["daily"]
864
+ dates = daily_data["time"]
865
+
866
+ data = pd.DataFrame({
867
+ "day": dates,
868
+ "max_temp_f": daily_data["temperature_2m_max"],
869
+ "min_temp_f": daily_data["temperature_2m_min"],
870
+ "precip_in": daily_data["precipitation_sum"]
871
+ })
872
+ data["city"] = city
873
+ weather_data_list.append(data)
874
+
875
+ weather = pd.concat(weather_data_list)
876
+
877
+ # Convert the date column to a Date type
878
+ weather["day"] = pd.to_datetime(weather["day"])
879
+
880
+ # Replace None values
881
+ weather["max_temp_f"].replace("None", 0, inplace=True)
882
+ weather["min_temp_f"].replace("None", 0, inplace=True)
883
+ weather["precip_in"].replace("None", 0, inplace=True)
884
+
885
+ weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
886
+
887
+ # Estimate mean temperature
888
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
889
+
890
+ # Convert Fahrenheit to Celsius for max_temp_f
891
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
892
+
893
+ # Convert Fahrenheit to Celsius for min_temp_f
894
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
895
+
896
+ # Convert Fahrenheit to Celsius for mean_temp_f
897
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
898
+
899
+ # Determine the starting chosen day for each date
900
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
901
+
902
+ # Group by week_starting and summarize
903
+ numeric_columns = weather.select_dtypes(include='number').columns
904
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
905
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
906
+ "min_temp_f": "avg_min_temp_f",
907
+ "mean_temp_f": "avg_mean_temp_f",
908
+ "max_temp_c": "avg_max_temp_c",
909
+ "min_temp_c": "avg_min_temp_c",
910
+ "mean_temp_c": "avg_mean_temp_c",
911
+ "precip_in": "avg_mean_perc"}, inplace=True)
912
+
913
+ else:
914
+ # We start by making a data frame of the following weather stations
915
+ station_query = ''.join(stations)
916
+
917
+ raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
918
+ station_query,
919
+ "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
920
+ "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
921
+ raw_weather = urllib.request.urlopen(raw_weather_list)
922
+ raw_weather = pd.read_csv(raw_weather)
923
+
924
+ raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]
925
+
926
+ # Replace the occurrences of "None" with 0
927
+ raw_weather["max_temp_f"].replace("None", 0, inplace=True)
928
+ raw_weather["min_temp_f"].replace("None", 0, inplace=True)
929
+ raw_weather["precip_in"].replace("None", 0, inplace=True)
930
+
931
+ # Remove any data that isn't temperature-related
932
+ weather = raw_weather
933
+
934
+ weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
935
+
936
+ # Estimate mean temperature
937
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
938
+
939
+ # Convert Fahrenheit to Celsius for max_temp_f
940
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
941
+
942
+ # Convert Fahrenheit to Celsius for min_temp_f
943
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
944
+
945
+ # Convert Fahrenheit to Celsius for mean_temp_f
946
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
947
+
948
+ # Aggregate the data to the chosen week-commencing day, taking the average of the data
949
+ # Convert the date column to a Date type
950
+ weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
951
+
952
+ # Determine the starting chosen day for each date
953
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
954
+
955
+ # Group by week_starting and summarize
956
+ numeric_columns = weather.select_dtypes(include='number').columns
957
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
958
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
959
+ "min_temp_f": "avg_min_temp_f",
960
+ "mean_temp_f": "avg_mean_temp_f",
961
+ "max_temp_c": "avg_max_temp_c",
962
+ "min_temp_c": "avg_min_temp_c",
963
+ "mean_temp_c": "avg_mean_temp_c",
964
+ "precip_in": "avg_mean_perc"}, inplace=True)
965
+
966
+ # Rainfall
967
+ if country == "GB__ASOS":
968
+ # Define cities and date range
969
+ cities = ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"]
970
+
971
+ start_date = formatted_date
972
+ end_date = today.strftime("%Y-%m-%d")
973
+
974
+ # Initialize an empty list to store the weather data for each city
975
+ weather_data_list = []
976
+
977
+ # Loop through each city and fetch weather data
978
+ for city in cities:
979
+ # Initialize Nominatim API
980
+ geolocator = Nominatim(user_agent="MyApp")
981
+ location = geolocator.geocode(city)
982
+ url = "https://archive-api.open-meteo.com/v1/archive"
983
+
984
+ params = {
985
+ "latitude": location.latitude,
986
+ "longitude": location.longitude,
987
+ "start_date": start_date,
988
+ "end_date": end_date,
989
+ "daily": "precipitation_sum",
990
+ "timezone": "auto"
991
+ }
992
+
993
+ response = requests.get(url, params=params)
994
+ response_data = response.json()
995
+
996
+ daily_data = response_data["daily"]["precipitation_sum"]
997
+ dates = response_data["daily"]["time"]
998
+
999
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1000
+ data["city"] = city
1001
+
1002
+ weather_data_list.append(data)
1003
+
1004
+ # Combine all city data into a single data frame
1005
+ all_weather_data = pd.concat(weather_data_list)
1006
+
1007
+ # Convert the date column to a Date type
1008
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1009
+
1010
+ # Set week commencing col up
1011
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1012
+
1013
+ # Group by week_starting and summarize
1014
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
1015
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1016
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1017
+
1018
+ # Change index to datetime
1019
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1020
+
1021
+ elif country == "AU__ASOS":
1022
+
1023
+ # Define cities and date range
1024
+ cities = ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"]
1025
+
1026
+ start_date = formatted_date
1027
+ end_date = today.strftime("%Y-%m-%d")
1028
+
1029
+ # Initialize an empty list to store the weather data for each city
1030
+ weather_data_list = []
1031
+
1032
+ # Loop through each city and fetch weather data
1033
+ for city in cities:
1034
+ # Initialize Nominatim API
1035
+ geolocator = Nominatim(user_agent="MyApp")
1036
+ location = geolocator.geocode(city)
1037
+ url = "https://archive-api.open-meteo.com/v1/archive"
1038
+
1039
+ params = {
1040
+ "latitude": location.latitude,
1041
+ "longitude": location.longitude,
1042
+ "start_date": start_date,
1043
+ "end_date": end_date,
1044
+ "daily": "precipitation_sum",
1045
+ "timezone": "auto"
1046
+ }
1047
+
1048
+ response = requests.get(url, params=params)
1049
+ response_data = response.json()
1050
+
1051
+ daily_data = response_data["daily"]["precipitation_sum"]
1052
+ dates = response_data["daily"]["time"]
1053
+
1054
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1055
+ data["city"] = city
1056
+
1057
+ weather_data_list.append(data)
1058
+
1059
+ # Combine all city data into a single data frame
1060
+ all_weather_data = pd.concat(weather_data_list)
1061
+
1062
+ # Convert the date column to a Date type
1063
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1064
+
1065
+ # Set week commencing col up
1066
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1067
+
1068
+ # Group by week_starting and summarize
1069
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
1070
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1071
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1072
+
1073
+ # Change index to datetime
1074
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1075
+
1076
+ elif country == "DE__ASOS":
1077
+
1078
+ # Define cities and date range
1079
+ cities = ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"]
1080
+
1081
+ start_date = formatted_date
1082
+ end_date = today.strftime("%Y-%m-%d")
1083
+
1084
+ # Initialize an empty list to store the weather data for each city
1085
+ weather_data_list = []
1086
+
1087
+ # Loop through each city and fetch weather data
1088
+ for city in cities:
1089
+ # Initialize Nominatim API
1090
+ geolocator = Nominatim(user_agent="MyApp")
1091
+ location = geolocator.geocode(city)
1092
+ url = "https://archive-api.open-meteo.com/v1/archive"
1093
+
1094
+ params = {
1095
+ "latitude": location.latitude,
1096
+ "longitude": location.longitude,
1097
+ "start_date": start_date,
1098
+ "end_date": end_date,
1099
+ "daily": "precipitation_sum",
1100
+ "timezone": "auto"
1101
+ }
1102
+
1103
+ response = requests.get(url, params=params)
1104
+ response_data = response.json()
1105
+
1106
+ daily_data = response_data["daily"]["precipitation_sum"]
1107
+ dates = response_data["daily"]["time"]
1108
+
1109
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1110
+ data["city"] = city
1111
+
1112
+ weather_data_list.append(data)
1113
+
1114
+ # Combine all city data into a single data frame
1115
+ all_weather_data = pd.concat(weather_data_list)
1116
+
1117
+ # Convert the date column to a Date type
1118
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1119
+
1120
+ # Set week commencing col up
1121
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1122
+
1123
+ # Group by week_starting and summarize
1124
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
1125
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1126
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1127
+
1128
+ # Change index to datetime
1129
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1130
+
1131
+ elif country == "FR__ASOS":
1132
+
1133
+ # Define cities and date range
1134
+ cities = ["Paris"]
1135
+
1136
+ start_date = formatted_date
1137
+ end_date = today.strftime("%Y-%m-%d")
1138
+
1139
+ # Initialize an empty list to store the weather data for each city
1140
+ weather_data_list = []
1141
+
1142
+ # Loop through each city and fetch weather data
1143
+ for city in cities:
1144
+ # Initialize Nominatim API
1145
+ geolocator = Nominatim(user_agent="MyApp")
1146
+ location = geolocator.geocode(city)
1147
+ url = "https://archive-api.open-meteo.com/v1/archive"
1148
+
1149
+ params = {
1150
+ "latitude": location.latitude,
1151
+ "longitude": location.longitude,
1152
+ "start_date": start_date,
1153
+ "end_date": end_date,
1154
+ "daily": "precipitation_sum",
1155
+ "timezone": "auto"
1156
+ }
1157
+
1158
+ response = requests.get(url, params=params)
1159
+ response_data = response.json()
1160
+
1161
+ daily_data = response_data["daily"]["precipitation_sum"]
1162
+ dates = response_data["daily"]["time"]
1163
+
1164
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1165
+ data["city"] = city
1166
+
1167
+ weather_data_list.append(data)
1168
+
1169
+ # Combine all city data into a single data frame
1170
+ all_weather_data = pd.concat(weather_data_list)
1171
+
1172
+ # Convert the date column to a Date type
1173
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1174
+
1175
+ # Set week commencing col up
1176
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1177
+
1178
+ # Group by week_starting and summarize
1179
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
1180
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1181
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1182
+
1183
+ # Change index to datetime
1184
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1185
+
1186
+ elif country == "ZA__ASOS":
1187
+ cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
1188
+ start_date = formatted_date
1189
+ end_date = today.strftime("%Y-%m-%d")
1190
+
1191
+ weather_data_list = []
1192
+
1193
+ for city in cities:
1194
+ geolocator = Nominatim(user_agent="MyApp")
1195
+ location = geolocator.geocode(city)
1196
+ url = "https://archive-api.open-meteo.com/v1/archive"
1197
+
1198
+ params = {
1199
+ "latitude": location.latitude,
1200
+ "longitude": location.longitude,
1201
+ "start_date": start_date,
1202
+ "end_date": end_date,
1203
+ "daily": "precipitation_sum",
1204
+ "timezone": "auto"
1205
+ }
1206
+
1207
+ response = requests.get(url, params=params)
1208
+ response_data = response.json()
1209
+
1210
+ daily_data = response_data["daily"]["precipitation_sum"]
1211
+ dates = response_data["daily"]["time"]
1212
+
1213
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1214
+ data["city"] = city
1215
+
1216
+ weather_data_list.append(data)
1217
+
1218
+ # Combine all city data into a single data frame
1219
+ all_weather_data = pd.concat(weather_data_list)
1220
+
1221
+ # Convert the date column to a Date type
1222
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1223
+
1224
+ # Set week commencing col up
1225
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1226
+
1227
+ # Group by week_starting and summarize
1228
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
1229
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1230
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1231
+
1232
+ # Change index to datetime
1233
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1234
+
1235
+ # Merge the dataframes
1236
+ if country in ["AU__ASOS", "DE__ASOS", "FR__ASOS", "GB__ASOS", "ZA__ASOS"]:
1237
+ merged_df = weekly_avg_rain.merge(weekly_avg_temp, on="week_starting")
1238
+ else:
1239
+ merged_df = weekly_avg_temp
1240
+
1241
+ merged_df.reset_index(drop=False, inplace=True)
1242
+ merged_df.rename(columns={'week_starting': 'OBS'}, inplace=True)
1243
+
1244
+ final_weather = ims_proc.rename_cols(merged_df, 'seas_')
1245
+
1246
+ return final_weather
1247
+
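A usage sketch for the weather pull (the method imports geopy.geocoders.Nominatim at call time, so geopy must be installed, and the Iowa Environmental Mesonet / Open-Meteo endpoints must be reachable):

from imsciences.pull import datapull

puller = datapull()
uk_weather = puller.pull_weather(week_commencing="mon", country="GBR")
print(uk_weather.head())  # weekly rows keyed on 'OBS', columns prefixed 'seas_' (e.g. seas_avg_mean_temp_c, seas_avg_rainfall)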
1248
+ def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
1249
+ """
1250
+ Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
1251
+ aggregates it to weekly averages, and renames variables based on specified rules.
1252
+
1253
+ Parameters:
1254
+ cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
1255
+ week_start_day (str): The day the week starts on (e.g., 'mon', 'sun').
1256
+ sector (str): The sector whose standard CDIDs are fetched (currently 'fast_food'; unrecognised sectors fall back to the defaults).
1257
+
1258
+ Returns:
1259
+ pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
1260
+ and all series as renamed columns.
1261
+ """
1262
+ # Define CDIDs for sectors and defaults
1263
+ sector_cdids = {
1264
+ "fast_food": ["L7TD", "L78Q", "DOAD"],
1265
+ "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
1266
+ }
1267
+
1268
+ default_cdids = sector_cdids["default"]
1269
+ sector_specific_cdids = sector_cdids.get(sector, [])
1270
+ standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
1271
+
1272
+ # Combine standard CDIDs and additional CDIDs
1273
+ if cdid_list is None:
1274
+ cdid_list = []
1275
+ cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
1276
+
1277
+ base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
1278
+ base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
1279
+ combined_df = pd.DataFrame()
1280
+
1281
+ # Map week start day to pandas weekday convention
1282
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
1283
+ if week_start_day not in days_map:
1284
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
1285
+ week_start = days_map[week_start_day]
1286
+
1287
+ for cdid in cdid_list:
1288
+ try:
1289
+ # Search for the series
1290
+ search_url = f"{base_search_url}{cdid}"
1291
+ search_response = requests.get(search_url)
1292
+ search_response.raise_for_status()
1293
+ search_data = search_response.json()
1294
+
1295
+ items = search_data.get("items", [])
1296
+ if not items:
1297
+ print(f"No data found for CDID: {cdid}")
1298
+ continue
1299
+
1300
+ # Extract series name and latest release URI
1301
+ series_name = items[0].get("title", f"Series_{cdid}")
1302
+ latest_date = max(
1303
+ datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
1304
+ for item in items if "release_date" in item
1305
+ )
1306
+ latest_uri = next(
1307
+ item["uri"] for item in items
1308
+ if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
1309
+ )
1310
+
1311
+ # Fetch the dataset
1312
+ data_url = f"{base_data_url}{latest_uri}"
1313
+ data_response = requests.get(data_url)
1314
+ data_response.raise_for_status()
1315
+ data_json = data_response.json()
1316
+
1317
+ # Detect the frequency and process accordingly
1318
+ if "months" in data_json and data_json["months"]:
1319
+ frequency_key = "months"
1320
+ elif "quarters" in data_json and data_json["quarters"]:
1321
+ frequency_key = "quarters"
1322
+ elif "years" in data_json and data_json["years"]:
1323
+ frequency_key = "years"
1324
+ else:
1325
+ print(f"Unsupported frequency or no data for CDID: {cdid}")
1326
+ continue
1327
+
1328
+ # Prepare the DataFrame
1329
+ df = pd.DataFrame(data_json[frequency_key])
1330
+
1331
+ # Parse the 'date' field based on frequency
1332
+ if frequency_key == "months":
1333
+ df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
1334
+ elif frequency_key == "quarters":
1335
+ def parse_quarter(quarter_str):
1336
+ year, qtr = quarter_str.split(" Q")
1337
+ month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
1338
+ return datetime(int(year), month, 1)
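# For example, parse_quarter("2021 Q3") splits into year "2021" and quarter "3",
# returning datetime(2021, 7, 1), i.e. the first day of the third quarter.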
1339
+ df["date"] = df["date"].apply(parse_quarter)
1340
+ elif frequency_key == "years":
1341
+ df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
1342
+
1343
+ df["value"] = pd.to_numeric(df["value"], errors="coerce")
1344
+ df.rename(columns={"value": series_name}, inplace=True)
1345
+
1346
+ # Combine data
1347
+ df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
1348
+ if combined_df.empty:
1349
+ combined_df = df
1350
+ else:
1351
+ combined_df = pd.merge(combined_df, df, on="date", how="outer")
1352
+
1353
+ except requests.exceptions.RequestException as e:
1354
+ print(f"Error fetching data for CDID {cdid}: {e}")
1355
+ except (KeyError, ValueError) as e:
1356
+ print(f"Error processing data for CDID {cdid}: {e}")
1357
+
1358
+ if not combined_df.empty:
1359
+ min_date = combined_df["date"].min()
1360
+ max_date = datetime.today()
1361
+ date_range = pd.date_range(start=min_date, end=max_date, freq='D')
1362
+ daily_df = pd.DataFrame(date_range, columns=['date'])
1363
+ daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
1364
+ daily_df = daily_df.ffill()
1365
+
1366
+ # Aggregate to weekly frequency
1367
+ daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
1368
+ weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
1369
+
1370
+ def clean_column_name(name):
1371
+ name = re.sub(r"\(.*?\)", "", name)
1372
+ name = re.split(r":", name)[0]
1373
+ name = re.sub(r"\d+", "", name)
1374
+ name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
1375
+ name = re.sub(r"[^\w\s]", "", name)
1376
+ name = name.replace(" ", "_")
1377
+ name = re.sub(r"_+", "_", name)
1378
+ name = name.rstrip("_")
1379
+ return f"macro_{name.lower()}_uk"
1380
+
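# Illustrative example of the renaming (hypothetical series title, not a
# guaranteed ONS name): "Gross Domestic Product: chained volume measures (£m)"
# loses the bracketed text and everything after the colon, spaces become
# underscores, and the result is "macro_gross_domestic_product_uk".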
1381
+ weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
1382
+ weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
1383
+
1384
+ weekly_df = weekly_df.fillna(0)
1385
+
1386
+ return weekly_df
1387
+ else:
1388
+ print("No data available to process.")
1389
+ return pd.DataFrame()
1390
+
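# A minimal usage sketch for pull_macro_ons_uk; 'JP9Z' is the CDID quoted in the
# docstring above, and any other valid ONS CDIDs could be substituted:
#
#   puller = datapull()
#   ons_df = puller.pull_macro_ons_uk(cdid_list=["JP9Z"], week_start_day="mon", sector="fast_food")
#
# ons_df then has one row per week, an 'OBS' week-commencing column, and one
# 'macro_<series name>_uk' column per series, with remaining NaNs filled with 0.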
1391
+ def pull_yfinance(self, tickers=None, week_start_day="mon"):
1392
+ """
1393
+ Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
1394
+ aggregates it to weekly averages, and renames variables.
1395
+
1396
+ Parameters:
1397
+ tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
1398
+ week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
1399
+
1400
+ Returns:
1401
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
1402
+ and aggregated stock data for the specified tickers, with NaN values filled with 0.
1403
+ """
1404
+ # Define default tickers
1405
+ default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
1406
+
1407
+ # Combine default tickers with additional ones
1408
+ if tickers is None:
1409
+ tickers = []
1410
+ tickers = list(set(default_tickers + tickers)) # Ensure no duplicates
1411
+
1412
+ # Automatically set end_date to today
1413
+ end_date = datetime.today().strftime("%Y-%m-%d")
1414
+
1415
+ # Mapping week start day to pandas weekday convention
1416
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
1417
+ if week_start_day not in days_map:
1418
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
1419
+ week_start = days_map[week_start_day]
1420
+
1421
+ # Fetch data for all tickers without specifying a start date to get all available data
1422
+ data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
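# Note: with several tickers and group_by="ticker", yf.download returns a
# DataFrame whose columns are a MultiIndex keyed by ticker, which is why
# data[ticker] is used below; with a single ticker the columns are usually
# flat (exact behaviour differs between yfinance versions).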
1423
+
1424
+ # Process the data
1425
+ combined_df = pd.DataFrame()
1426
+ for ticker in tickers:
1427
+ try:
1428
+ # Extract the ticker's data
1429
+ ticker_data = data[ticker] if len(tickers) > 1 else data
1430
+ ticker_data = ticker_data.reset_index()
1431
+
1432
+ # Ensure necessary columns are present
1433
+ if "Close" not in ticker_data.columns:
1434
+ raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
1435
+
1436
+ # Keep only relevant columns
1437
+ ticker_data = ticker_data[["Date", "Close"]]
1438
+ ticker_data.rename(columns={"Close": ticker}, inplace=True)
1439
+
1440
+ # Merge data
1441
+ if combined_df.empty:
1442
+ combined_df = ticker_data
1443
+ else:
1444
+ combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")
1445
+
1446
+ except KeyError:
1447
+ print(f"Data for ticker {ticker} not available.")
1448
+ except Exception as e:
1449
+ print(f"Error processing ticker {ticker}: {e}")
1450
+
1451
+ if not combined_df.empty:
1452
+ # Convert to daily frequency
1453
+ combined_df["Date"] = pd.to_datetime(combined_df["Date"])
1454
+ combined_df.set_index("Date", inplace=True)
1455
+
1456
+ # Fill missing dates
1457
+ min_date = combined_df.index.min()
1458
+ max_date = combined_df.index.max()
1459
+ daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
1460
+ combined_df = combined_df.reindex(daily_index)
1461
+ combined_df.index.name = "Date"
1462
+ combined_df = combined_df.ffill()
1463
+
1464
+ # Aggregate to weekly frequency
1465
+ combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
1466
+ weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
1467
+
1468
+ # Fill NaN values with 0
1469
+ weekly_df = weekly_df.fillna(0)
1470
+
1471
+ # Clean column names
1472
+ def clean_column_name(name):
1473
+ name = re.sub(r"[^\w\s]", "", name)
1474
+ return f"macro_{name.lower()}"
1475
+
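# For example, clean_column_name("^FTSE") returns "macro_ftse" and
# clean_column_name("GBPUSD=X") returns "macro_gbpusdx".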
1476
+ weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
1477
+
1478
+ return weekly_df
1479
+
1480
+ else:
1481
+ print("No data available to process.")
1482
+ return pd.DataFrame()
1483
+
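# A minimal usage sketch for pull_yfinance; 'AAPL' is the example ticker quoted
# in the docstring above and is optional on top of the built-in defaults:
#
#   puller = datapull()
#   prices_df = puller.pull_yfinance(tickers=["AAPL"], week_start_day="mon")
#
# prices_df then has an 'OBS' week-commencing column plus one column per ticker
# named 'macro_<cleaned ticker>' (e.g. 'macro_ftse' for '^FTSE'), averaged over
# each week with gaps filled with 0.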