imsciences-1.1.10-py3-none-any.whl → imsciences-1.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3132 +0,0 @@
1
- import importlib
2
- import re
3
- import time
4
- import urllib.request
5
- import xml.etree.ElementTree as ET
6
- from datetime import datetime, timedelta
7
- from io import StringIO
8
-
9
- import numpy as np
10
- import pandas as pd
11
- import requests
12
- import yfinance as yf
13
- from bs4 import BeautifulSoup
14
- from dateutil.easter import easter
15
- from fredapi import Fred
16
- from geopy.geocoders import Nominatim
17
-
18
- from imsciences.mmm import dataprocessing
19
-
20
- ims_proc = dataprocessing()
21
-
22
-
23
- class datapull:
24
- def help(self):
25
- print("This is the help section. The functions in the package are as follows:")
26
-
27
- print("\n1. pull_fred_data")
28
- print(" - Description: Get data from FRED by using series id tokens.")
29
- print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
30
- print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
31
-
32
- print("\n2. pull_boe_data")
33
- print(" - Description: Fetch and process Bank of England interest rate data.")
34
- print(" - Usage: pull_boe_data(week_commencing)")
35
- print(" - Example: pull_boe_data('mon')")
36
-
37
- print("\n3. pull_oecd")
38
- print(
39
- " - Description: Fetch macroeconomic data from OECD for a specified country.",
40
- )
41
- print(
42
- " - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')",
43
- )
44
- print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
45
-
46
- print("\n4. get_google_mobility_data")
47
- print(" - Description: Fetch Google Mobility data for the specified country.")
48
- print(" - Usage: get_google_mobility_data(country, wc)")
49
- print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
50
-
51
- print("\n5. pull_seasonality")
52
- print(
53
- " - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.",
54
- )
55
- print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
56
- print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
57
-
58
- print("\n6. pull_weather")
59
- print(
60
- " - Description: Fetch and process historical weather data for the specified country.",
61
- )
62
- print(" - Usage: pull_weather(week_commencing, start_date, country)")
63
- print(" - Example: pull_weather('mon', '2020-01-01', ['GBR'])")
64
-
65
- print("\n7. pull_macro_ons_uk")
66
- print(
67
- " - Description: Fetch and process time series data from the Beta ONS API.",
68
- )
69
- print(" - Usage: pull_macro_ons_uk(aditional_list, week_commencing, sector)")
70
- print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
71
-
72
- print("\n8. pull_yfinance")
73
- print(
74
- " - Description: Fetch and process time series data from the Beta ONS API.",
75
- )
76
- print(" - Usage: pull_yfinance(tickers, week_start_day)")
77
- print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
78
-
79
- print("\n9. pull_sports_events")
80
- print(
81
- " - Description: Pull a veriety of sports events primaraly football and rugby.",
82
- )
83
- print(" - Usage: pull_sports_events(start_date, week_commencing)")
84
- print(" - Example: pull_sports_events('2020-01-01', 'mon')")
85
-
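The help text above doubles as the package's API summary. A minimal usage sketch, assuming the datapull class defined below is in scope and network access is available (the pull_* methods return weekly DataFrames keyed on an 'OBS' column):

pull = datapull()  # assumes the datapull class from this module is importable
pull.help()  # print the function list shown above
boe_rates = pull.pull_boe_data("mon")  # weekly Bank of England rate, weeks commencing Monday
dummies = pull.pull_seasonality("mon", "2020-01-01", ["GB", "US"])  # seasonality and holiday dummies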
86
- ############################################################### MACRO ##########################################################################
87
-
88
- def pull_fred_data(
89
- self,
90
- week_commencing: str = "mon",
91
- series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"],
92
- ) -> pd.DataFrame:
93
- """
94
- Parameters
95
- ----------
96
- week_commencing : str
97
- specify the day the week commences on; the default is 'mon' (options: 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
98
-
99
- series_id_list : list[str]
100
- provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
101
- ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
102
-
103
- Returns
104
- -------
105
- pd.DataFrame
106
- Return a data frame with FRED data according to the series IDs provided
107
-
108
- """
109
- # Fred API
110
- fred = Fred(api_key="76f5f8156145fdb8fbaf66f1eb944f8a")
111
-
112
- # Fetch the metadata for each series to get the full names
113
- series_names = {
114
- series_id: fred.get_series_info(series_id).title
115
- for series_id in series_id_list
116
- }
117
-
118
- # Download data from series id list
119
- fred_series = {
120
- series_id: fred.get_series(series_id) for series_id in series_id_list
121
- }
122
-
123
- # Data processing
124
- date_range = {
125
- "OBS": pd.date_range(
126
- "1950-01-01",
127
- datetime.today().strftime("%Y-%m-%d"),
128
- freq="d",
129
- ),
130
- }
131
- fred_series_df = pd.DataFrame(date_range)
132
-
133
- for series_id, series_data in fred_series.items():
134
- series_data = series_data.reset_index()
135
- series_data.columns = [
136
- "OBS",
137
- series_names[series_id],
138
- ] # Use the series name as the column header
139
- fred_series_df = pd.merge_asof(
140
- fred_series_df,
141
- series_data,
142
- on="OBS",
143
- direction="backward",
144
- )
145
-
146
- # Handle duplicate columns
147
- for col in fred_series_df.columns:
148
- if "_x" in col:
149
- base_col = col.replace("_x", "")
150
- fred_series_df[base_col] = fred_series_df[col].combine_first(
151
- fred_series_df[base_col + "_y"],
152
- )
153
- fred_series_df.drop([col, base_col + "_y"], axis=1, inplace=True)
154
-
155
- # Ensure sum_columns are present in the DataFrame
156
- sum_columns = [
157
- series_names[series_id]
158
- for series_id in series_id_list
159
- if series_names[series_id] in fred_series_df.columns
160
- ]
161
-
162
- # Aggregate results by week
163
- fred_df_final = ims_proc.aggregate_daily_to_wc_wide(
164
- df=fred_series_df,
165
- date_column="OBS",
166
- group_columns=[],
167
- sum_columns=sum_columns,
168
- wc=week_commencing,
169
- aggregation="average",
170
- )
171
-
172
- # Drop anything after the first ':' in each column name, then lower-case and prefix with 'macro_' (leaving 'OBS' untouched)
173
- fred_df_final.columns = [
174
- "OBS"
175
- if col == "OBS"
176
- else "macro_" + col.lower().split(":")[0].replace(" ", "_")
177
- for col in fred_df_final.columns
178
- ]
179
-
180
- return fred_df_final
181
-
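A short, hedged sketch of calling pull_fred_data as documented above. The series IDs are standard FRED identifiers; output columns other than 'OBS' are derived from each series title, lower-cased and prefixed with 'macro_':

pull = datapull()  # assumes the class above is in scope; requires network access to FRED
fred_weekly = pull.pull_fred_data(week_commencing="mon", series_id_list=["GPDIC1", "GCEC1"])
print(fred_weekly.columns)  # 'OBS' plus one 'macro_...' column per requested series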
182
- def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
183
- """
184
- Fetch and process Bank of England interest rate data.
185
-
186
- Args:
187
- week_commencing (str): The starting day of the week for aggregation.
188
- Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
189
- Default is "mon".
190
- max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
191
- delay (int): Delay in seconds between retry attempts. Default is 5.
192
-
193
- Returns:
194
- pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
195
- The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
196
- and 'macro_boe_intr_rate' contains the average interest rate for the week.
197
-
198
- """
199
- # Week commencing dictionary
200
- day_dict = {
201
- "mon": 0,
202
- "tue": 1,
203
- "wed": 2,
204
- "thu": 3,
205
- "fri": 4,
206
- "sat": 5,
207
- "sun": 6,
208
- }
209
-
210
- # URL of the Bank of England data page
211
- url = "https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp"
212
-
213
- # Retry logic for HTTP request
214
- for attempt in range(max_retries):
215
- try:
216
- # Set up headers to mimic a browser request
217
- headers = {
218
- "User-Agent": (
219
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
220
- "Chrome/91.0.4472.124 Safari/537.36"
221
- ),
222
- }
223
- response = requests.get(url, headers=headers)
224
- response.raise_for_status() # Raise an exception for HTTP errors
225
- break
226
- except requests.exceptions.RequestException as e:
227
- print(f"Attempt {attempt + 1} failed: {e}")
228
- if attempt < max_retries - 1:
229
- time.sleep(delay)
230
- else:
231
- raise
232
-
233
- # Parse the HTML page
234
- soup = BeautifulSoup(response.content, "html.parser")
235
-
236
- # Find the table on the page
237
- table = soup.find("table") # Locate the first table
238
- table_html = str(table) # Convert table to string
239
- df = pd.read_html(StringIO(table_html))[
240
- 0
241
- ] # Use StringIO to wrap the table HTML
242
-
243
- # Rename and clean up columns
244
- df.rename(
245
- columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"},
246
- inplace=True,
247
- )
248
- df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
249
- df.sort_values("OBS", inplace=True)
250
-
251
- # Create a daily date range
252
- date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
253
- df_daily = pd.DataFrame(date_range, columns=["OBS"])
254
-
255
- # Adjust each date to the specified week commencing day
256
- df_daily["Week_Commencing"] = df_daily["OBS"].apply(
257
- lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7),
258
- )
259
-
260
- # Merge and forward-fill missing rates
261
- df_daily = df_daily.merge(df, on="OBS", how="left")
262
- df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
263
-
264
- # Group by week commencing and calculate the average rate
265
- df_final = (
266
- df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"]
267
- .mean()
268
- .reset_index()
269
- )
270
- df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime(
271
- "%d/%m/%Y",
272
- )
273
- df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
274
-
275
- return df_final
276
-
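pull_boe_data snaps every calendar day back to the chosen week-commencing weekday before averaging. A standalone sketch of that alignment arithmetic (the same (weekday difference) % 7 step reused later in pull_seasonality):

from datetime import date, timedelta

day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}

def week_commencing(d: date, wc: str = "mon") -> date:
    # Step back 0-6 days to the most recent occurrence of the chosen weekday.
    return d - timedelta(days=(d.weekday() - day_dict[wc]) % 7)

print(week_commencing(date(2024, 5, 15), "mon"))  # 2024-05-13, the Monday of that week
print(week_commencing(date(2024, 5, 15), "sun"))  # 2024-05-12, the preceding Sunday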
277
- def pull_oecd(
278
- self,
279
- country: str = "GBR",
280
- week_commencing: str = "mon",
281
- start_date: str = "2020-01-01",
282
- ) -> pd.DataFrame:
283
- """
284
- Fetch and process time series data from the OECD API.
285
-
286
- Args:
287
- country (str): A 3-letter code for the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
288
- week_commencing (str): The starting day of the week for aggregation.
289
- Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
290
- start_date (str): Dataset start date in the format "YYYY-MM-DD"
291
-
292
- Returns:
293
- pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
294
- commencing dates, and other columns contain the aggregated time series values.
295
-
296
- """
297
-
298
- def parse_quarter(date_str):
299
- """Parses a string in 'YYYY-Q#' format into a datetime object."""
300
- year, quarter = date_str.split("-")
301
- quarter_number = int(quarter[1])
302
- month = (quarter_number - 1) * 3 + 1
303
- return pd.Timestamp(f"{year}-{month:02d}-01")
304
-
305
- # Generate a daily date range from start_date to today
306
- date_range = pd.date_range(start=start_date, end=datetime.today(), freq="D")
307
-
308
- url_details = [
309
- [
310
- "BCICP",
311
- "SDD.STES,DSD_STES@DF_CLI,",
312
- ".....",
313
- "macro_business_confidence_index",
314
- ],
315
- [
316
- "CCICP",
317
- "SDD.STES,DSD_STES@DF_CLI,",
318
- ".....",
319
- "macro_consumer_confidence_index",
320
- ],
321
- [
322
- "N.CPI",
323
- "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
324
- "PA._T.N.GY",
325
- "macro_cpi_total",
326
- ],
327
- [
328
- "N.CPI",
329
- "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
330
- "PA.CP041T043.N.GY",
331
- "macro_cpi_housing",
332
- ],
333
- [
334
- "N.CPI",
335
- "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
336
- "PA.CP01.N.GY",
337
- "macro_cpi_food",
338
- ],
339
- [
340
- "N.CPI",
341
- "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
342
- "PA.CP045_0722.N.GY",
343
- "macro_cpi_energy",
344
- ],
345
- [
346
- "UNE_LF_M",
347
- "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,",
348
- "._Z.Y._T.Y_GE15.",
349
- "macro_unemployment_rate",
350
- ],
351
- [
352
- "EAR",
353
- "SDD.TPS,DSD_EAR@DF_HOU_EAR,",
354
- ".Y..S1D",
355
- "macro_private_hourly_earnings",
356
- ],
357
- [
358
- "RHP",
359
- "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0",
360
- "",
361
- "macro_real_house_prices",
362
- ],
363
- [
364
- "PRVM",
365
- "SDD.STES,DSD_KEI@DF_KEI,4.0",
366
- "IX.C..",
367
- "macro_manufacturing_production_volume",
368
- ],
369
- [
370
- "TOVM",
371
- "SDD.STES,DSD_KEI@DF_KEI,4.0",
372
- "IX...",
373
- "macro_retail_trade_volume",
374
- ],
375
- ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
376
- [
377
- "IRLT",
378
- "SDD.STES,DSD_KEI@DF_KEI,4.0",
379
- "PA...",
380
- "macro_long_term_interest_rate",
381
- ],
382
- [
383
- "B1GQ",
384
- "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1",
385
- "._Z....GY.T0102",
386
- "macro_gdp_growth_yoy",
387
- ],
388
- ]
389
-
390
- # Create empty final dataframe
391
- oecd_df_final = pd.DataFrame()
392
-
393
- daily_df = pd.DataFrame({"OBS": date_range})
394
- value_columns = []
395
-
396
- # Iterate for each variable of interest
397
- for series_details in url_details:
398
- series = series_details[0]
399
- dataset_id = series_details[1]
400
- filter = series_details[2]
401
- col_name = series_details[3]
402
-
403
- # check if request was successful and determine the most granular data available
404
- for freq in ["M", "Q", "A"]:
405
- if series in ["UNE_LF_M", "EAR"]:
406
- data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
407
- elif series in ["B1GQ"]:
408
- data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
409
- else:
410
- data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
411
-
412
- # Make the request to the OECD API for data
413
- data_response = requests.get(data_url)
414
-
415
- # Check if the request was successful
416
- if data_response.status_code != 200:
417
- print(
418
- f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}",
419
- )
420
- url_test = False
421
- continue
422
- url_test = True
423
- break
424
-
425
- # Skip to the next variable if no frequency returned data
426
- if url_test is False:
427
- continue
428
-
429
- root = ET.fromstring(data_response.content)
430
-
431
- # Define namespaces if necessary (the namespace is included in the tags)
432
- namespaces = {
433
- "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
434
- }
435
-
436
- # Lists to store the data
437
- dates = []
438
- values = []
439
-
440
- # Iterate over all <Obs> elements and extract date and value
441
- for obs in root.findall(".//generic:Obs", namespaces):
442
- # Extracting the time period (date)
443
- time_period = obs.find(".//generic:ObsDimension", namespaces).get(
444
- "value",
445
- )
446
-
447
- # Extracting the observation value
448
- value = obs.find(".//generic:ObsValue", namespaces).get("value")
449
-
450
- # Storing the data
451
- if time_period and value:
452
- dates.append(time_period)
453
- values.append(float(value)) # Convert value to float
454
-
455
- # Add variable names that were found to a list
456
- value_columns.append(col_name)
457
-
458
- # Creating a DataFrame
459
- data = pd.DataFrame({"OBS": dates, col_name: values})
460
-
461
- # Convert date strings into datetime format
462
- if freq == "Q":
463
- data["OBS"] = data["OBS"].apply(parse_quarter)
464
- else:
465
- # Parse 'YYYY-MM' period strings into datetimes
466
- data["OBS"] = data["OBS"].apply(lambda x: datetime.strptime(x, "%Y-%m"))
467
-
468
- # Sort data by chronological order
469
- data.sort_values(by="OBS", inplace=True)
470
-
471
- # Merge the data based on the observation date
472
- daily_df = pd.merge_asof(
473
- daily_df,
474
- data[["OBS", col_name]],
475
- on="OBS",
476
- direction="backward",
477
- )
478
-
479
- # Ensure columns are numeric
480
- for col in value_columns:
481
- if col in daily_df.columns:
482
- daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
483
- else:
484
- print(f"Column {col} not found in daily_df")
485
-
486
- # Aggregate results by week
487
- country_df = ims_proc.aggregate_daily_to_wc_wide(
488
- df=daily_df,
489
- date_column="OBS",
490
- group_columns=[],
491
- sum_columns=value_columns,
492
- wc=week_commencing,
493
- aggregation="average",
494
- )
495
-
496
- oecd_df_final = pd.concat(
497
- [oecd_df_final, country_df],
498
- axis=0,
499
- ignore_index=True,
500
- )
501
-
502
- return oecd_df_final
503
-
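A hedged usage sketch for pull_oecd as documented above. For each series the code tries monthly, then quarterly, then annual frequency and keeps the first one the OECD SDMX API can serve; series that fail at every frequency are skipped with a printed warning:

pull = datapull()  # assumes the class above is in scope; hits the public OECD SDMX API
uk_macro = pull.pull_oecd(country="GBR", week_commencing="mon", start_date="2015-01-01")
print([c for c in uk_macro.columns if c.startswith("macro_")])  # the indicators that were found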
504
- def get_google_mobility_data(
505
- self,
506
- country="United Kingdom",
507
- wc="mon",
508
- ) -> pd.DataFrame:
509
- """
510
- Fetch Google Mobility data for the specified country.
511
-
512
- Parameters
513
- ----------
514
- - country (str): The name of the country for which to fetch data.
- - wc (str): The week-commencing day used for weekly aggregation (e.g. 'mon').
515
-
516
- Returns
517
- -------
518
- - pd.DataFrame: A DataFrame containing the Google Mobility data.
519
-
520
- """
521
- # URL of the Google Mobility Reports CSV file
522
- url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
523
-
524
- # Fetch the CSV file
525
- response = requests.get(url)
526
- if response.status_code != 200:
527
- raise Exception(f"Failed to fetch data: {response.status_code}")
528
-
529
- # Load the CSV file into a pandas DataFrame
530
- csv_data = StringIO(response.text)
531
- df = pd.read_csv(csv_data, low_memory=False)
532
-
533
- # Filter the DataFrame for the specified country
534
- country_df = df[df["country_region"] == country]
535
-
536
- final_covid = ims_proc.aggregate_daily_to_wc_wide(
537
- country_df,
538
- "date",
539
- [],
540
- [
541
- "retail_and_recreation_percent_change_from_baseline",
542
- "grocery_and_pharmacy_percent_change_from_baseline",
543
- "parks_percent_change_from_baseline",
544
- "transit_stations_percent_change_from_baseline",
545
- "workplaces_percent_change_from_baseline",
546
- "residential_percent_change_from_baseline",
547
- ],
548
- wc,
549
- "average",
550
- )
551
-
552
- final_covid1 = ims_proc.rename_cols(final_covid, "covid_")
553
- return final_covid1
554
-
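A brief usage sketch for get_google_mobility_data. The Global_Mobility_Report.csv download is large, and the returned columns are renamed via rename_cols with a 'covid_' prefix:

pull = datapull()  # assumes the class above is in scope
uk_mobility = pull.get_google_mobility_data(country="United Kingdom", wc="mon")
print(uk_mobility.head())  # weekly mobility change series aggregated to the chosen week-commencing day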
555
- ############################################################### Seasonality ##########################################################################
556
-
557
- def pull_seasonality(self, week_commencing, start_date, countries):
558
- """
559
- Generates a DataFrame with weekly seasonality features.
560
-
561
- Args:
562
- week_commencing (str): The starting day of the week ('mon', 'tue', ..., 'sun').
563
- start_date (str): The start date in 'YYYY-MM-DD' format.
564
- countries (list): A list of country codes (e.g., ['GB', 'US']) for holidays.
565
-
566
- Returns:
567
- pd.DataFrame: A DataFrame indexed by week start date, containing various
568
- seasonal dummy variables, holidays, trend, and constant.
569
- The date column is named 'OBS'.
570
-
571
- """
572
- # ---------------------------------------------------------------------
573
- # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
574
- # ---------------------------------------------------------------------
575
- day_dict = {
576
- "mon": 0,
577
- "tue": 1,
578
- "wed": 2,
579
- "thu": 3,
580
- "fri": 4,
581
- "sat": 5,
582
- "sun": 6,
583
- }
584
- if week_commencing not in day_dict:
585
- raise ValueError(
586
- f"Invalid week_commencing value: {week_commencing}. Use one of {list(day_dict.keys())}",
587
- )
588
-
589
- # ---------------------------------------------------------------------
590
- # 0.2 Setup: dictionary continents and countries
591
- # ---------------------------------------------------------------------
592
- COUNTRY_TO_CONTINENT = {
593
- # Europe
594
- "Austria": "europe",
595
- "Belarus": "europe",
596
- "Belgium": "europe",
597
- "Bulgaria": "europe",
598
- "Croatia": "europe",
599
- "Cyprus": "europe",
600
- "Czechia": "europe",
601
- "CzechRepublic": "europe",
602
- "Denmark": "europe",
603
- "Estonia": "europe",
604
- "EuropeanCentralBank": "europe",
605
- "Finland": "europe",
606
- "France": "europe",
607
- "FranceAlsaceMoselle": "europe",
608
- "Germany": "europe",
609
- "GermanyBaden": "europe",
610
- "GermanyBavaria": "europe",
611
- "GermanyBerlin": "europe",
612
- "GermanyBrandenburg": "europe",
613
- "GermanyBremen": "europe",
614
- "GermanyHamburg": "europe",
615
- "GermanyHesse": "europe",
616
- "GermanyLowerSaxony": "europe",
617
- "GermanyMecklenburgVorpommern": "europe",
618
- "GermanyNorthRhineWestphalia": "europe",
619
- "GermanyRhinelandPalatinate": "europe",
620
- "GermanySaarland": "europe",
621
- "GermanySaxony": "europe",
622
- "GermanySaxonyAnhalt": "europe",
623
- "GermanySchleswigHolstein": "europe",
624
- "GermanyThuringia": "europe",
625
- "Greece": "europe",
626
- "Hungary": "europe",
627
- "Iceland": "europe",
628
- "Ireland": "europe",
629
- "Italy": "europe",
630
- "Latvia": "europe",
631
- "Lithuania": "europe",
632
- "Luxembourg": "europe",
633
- "Malta": "europe",
634
- "Monaco": "europe",
635
- "Netherlands": "europe",
636
- "Norway": "europe",
637
- "Poland": "europe",
638
- "Portugal": "europe",
639
- "Romania": "europe",
640
- "Russia": "europe",
641
- "Serbia": "europe",
642
- "Slovakia": "europe",
643
- "Slovenia": "europe",
644
- "Spain": "europe",
645
- "SpainAndalusia": "europe",
646
- "SpainAragon": "europe",
647
- "SpainAsturias": "europe",
648
- "SpainBalearicIslands": "europe",
649
- "SpainBasqueCountry": "europe",
650
- "SpainCanaryIslands": "europe",
651
- "SpainCantabria": "europe",
652
- "SpainCastileAndLeon": "europe",
653
- "SpainCastillaLaMancha": "europe",
654
- "SpainCatalonia": "europe",
655
- "SpainExtremadura": "europe",
656
- "SpainGalicia": "europe",
657
- "SpainLaRioja": "europe",
658
- "SpainMadrid": "europe",
659
- "SpainMurcia": "europe",
660
- "SpainNavarre": "europe",
661
- "SpainValencia": "europe",
662
- "Sweden": "europe",
663
- "Switzerland": "europe",
664
- "Ukraine": "europe",
665
- "UnitedKingdom": "europe",
666
- # Americas
667
- "Argentina": "america",
668
- "Barbados": "america",
669
- "Brazil": "america",
670
- "Canada": "america",
671
- "Chile": "america",
672
- "Colombia": "america",
673
- "Mexico": "america",
674
- "Panama": "america",
675
- "Paraguay": "america",
676
- "Peru": "america",
677
- "UnitedStates": "usa",
678
- # US States
679
- "Alabama": "usa.states",
680
- "Alaska": "usa.states",
681
- "Arizona": "usa.states",
682
- "Arkansas": "usa.states",
683
- "California": "usa.states",
684
- "Colorado": "usa.states",
685
- "Connecticut": "usa.states",
686
- "Delaware": "usa.states",
687
- "DistrictOfColumbia": "usa.states",
688
- "Florida": "usa.states",
689
- "Georgia": "usa.states",
690
- "Hawaii": "usa.states",
691
- "Idaho": "usa.states",
692
- "Illinois": "usa.states",
693
- "Indiana": "usa.states",
694
- "Iowa": "usa.states",
695
- "Kansas": "usa.states",
696
- "Kentucky": "usa.states",
697
- "Louisiana": "usa.states",
698
- "Maine": "usa.states",
699
- "Maryland": "usa.states",
700
- "Massachusetts": "usa.states",
701
- "Michigan": "usa.states",
702
- "Minnesota": "usa.states",
703
- "Mississippi": "usa.states",
704
- "Missouri": "usa.states",
705
- "Montana": "usa.states",
706
- "Nebraska": "usa.states",
707
- "Nevada": "usa.states",
708
- "NewHampshire": "usa.states",
709
- "NewJersey": "usa.states",
710
- "NewMexico": "usa.states",
711
- "NewYork": "usa.states",
712
- "NorthCarolina": "usa.states",
713
- "NorthDakota": "usa.states",
714
- "Ohio": "usa.states",
715
- "Oklahoma": "usa.states",
716
- "Oregon": "usa.states",
717
- "Pennsylvania": "usa.states",
718
- "RhodeIsland": "usa.states",
719
- "SouthCarolina": "usa.states",
720
- "SouthDakota": "usa.states",
721
- "Tennessee": "usa.states",
722
- "Texas": "usa.states",
723
- "Utah": "usa.states",
724
- "Vermont": "usa.states",
725
- "Virginia": "usa.states",
726
- "Washington": "usa.states",
727
- "WestVirginia": "usa.states",
728
- "Wisconsin": "usa.states",
729
- "Wyoming": "usa.states",
730
- # Oceania
731
- "Australia": "oceania",
732
- "AustraliaCapitalTerritory": "oceania",
733
- "AustraliaNewSouthWales": "oceania",
734
- "AustraliaNorthernTerritory": "oceania",
735
- "AustraliaQueensland": "oceania",
736
- "AustraliaSouthAustralia": "oceania",
737
- "AustraliaTasmania": "oceania",
738
- "AustraliaVictoria": "oceania",
739
- "AustraliaWesternAustralia": "oceania",
740
- "MarshallIslands": "oceania",
741
- "NewZealand": "oceania",
742
- # Asia
743
- "China": "asia",
744
- "HongKong": "asia",
745
- "India": "asia",
746
- "Israel": "asia",
747
- "Japan": "asia",
748
- "Kazakhstan": "asia",
749
- "Malaysia": "asia",
750
- "Qatar": "asia",
751
- "Singapore": "asia",
752
- "SouthKorea": "asia",
753
- "Taiwan": "asia",
754
- "Turkey": "asia",
755
- "Vietnam": "asia",
756
- # Africa
757
- "Algeria": "africa",
758
- "Angola": "africa",
759
- "Benin": "africa",
760
- "IvoryCoast": "africa",
761
- "Kenya": "africa",
762
- "Madagascar": "africa",
763
- "Nigeria": "africa",
764
- "SaoTomeAndPrincipe": "africa",
765
- "SouthAfrica": "africa",
766
- }
767
-
768
- # Dictionary mapping ISO country codes to their corresponding workalendar country names
769
- holiday_country = {
770
- # Major countries with required formats
771
- "GB": "UnitedKingdom",
772
- "US": "UnitedStates",
773
- "USA": "UnitedStates", # Alternative code for US
774
- "CA": "Canada",
775
- "ZA": "SouthAfrica",
776
- "FR": "France",
777
- "DE": "Germany",
778
- "AU": "Australia",
779
- "AUS": "Australia", # Alternative code for Australia
780
- # European countries
781
- "AT": "Austria",
782
- "BY": "Belarus",
783
- "BE": "Belgium",
784
- "BG": "Bulgaria",
785
- "HR": "Croatia",
786
- "CY": "Cyprus",
787
- "CZ": "Czechia",
788
- "DK": "Denmark",
789
- "EE": "Estonia",
790
- "FI": "Finland",
791
- "GR": "Greece",
792
- "HU": "Hungary",
793
- "IS": "Iceland",
794
- "IE": "Ireland",
795
- "IT": "Italy",
796
- "LV": "Latvia",
797
- "LT": "Lithuania",
798
- "LU": "Luxembourg",
799
- "MT": "Malta",
800
- "MC": "Monaco",
801
- "NL": "Netherlands",
802
- "NO": "Norway",
803
- "PL": "Poland",
804
- "PT": "Portugal",
805
- "RO": "Romania",
806
- "RU": "Russia",
807
- "RS": "Serbia",
808
- "SK": "Slovakia",
809
- "SI": "Slovenia",
810
- "ES": "Spain",
811
- "SE": "Sweden",
812
- "CH": "Switzerland",
813
- "UA": "Ukraine",
814
- # Americas
815
- "AR": "Argentina",
816
- "BB": "Barbados",
817
- "BR": "Brazil",
818
- "CL": "Chile",
819
- "CO": "Colombia",
820
- "MX": "Mexico",
821
- "PA": "Panama",
822
- "PY": "Paraguay",
823
- "PE": "Peru",
824
- # USA States (using common abbreviations)
825
- "AL": "Alabama",
826
- "AK": "Alaska",
827
- "AZ": "Arizona",
828
- "AR": "Arkansas",
829
- "CA_US": "California",
830
- "CO_US": "Colorado",
831
- "CT": "Connecticut",
832
- "DE_US": "Delaware",
833
- "DC": "DistrictOfColumbia",
834
- "FL": "Florida",
835
- "GA": "Georgia",
836
- "HI": "Hawaii",
837
- "ID": "Idaho",
838
- "IL": "Illinois",
839
- "IN": "Indiana",
840
- "IA": "Iowa",
841
- "KS": "Kansas",
842
- "KY": "Kentucky",
843
- "LA": "Louisiana",
844
- "ME": "Maine",
845
- "MD": "Maryland",
846
- "MA": "Massachusetts",
847
- "MI": "Michigan",
848
- "MN": "Minnesota",
849
- "MS": "Mississippi",
850
- "MO": "Missouri",
851
- "MT": "Montana",
852
- "NE": "Nebraska",
853
- "NV": "Nevada",
854
- "NH": "NewHampshire",
855
- "NJ": "NewJersey",
856
- "NM": "NewMexico",
857
- "NY": "NewYork",
858
- "NC": "NorthCarolina",
859
- "ND": "NorthDakota",
860
- "OH": "Ohio",
861
- "OK": "Oklahoma",
862
- "OR": "Oregon",
863
- "PA_US": "Pennsylvania",
864
- "RI": "RhodeIsland",
865
- "SC": "SouthCarolina",
866
- "SD": "SouthDakota",
867
- "TN": "Tennessee",
868
- "TX": "Texas",
869
- "UT": "Utah",
870
- "VT": "Vermont",
871
- "VA": "Virginia",
872
- "WA": "Washington",
873
- "WV": "WestVirginia",
874
- "WI": "Wisconsin",
875
- "WY": "Wyoming",
876
- # Australia territories
877
- "ACT": "AustraliaCapitalTerritory",
878
- "NSW": "AustraliaNewSouthWales",
879
- "NT": "AustraliaNorthernTerritory",
880
- "QLD": "AustraliaQueensland",
881
- "SA": "AustraliaSouthAustralia",
882
- "TAS": "AustraliaTasmania",
883
- "VIC": "AustraliaVictoria",
884
- "WA_AU": "AustraliaWesternAustralia",
885
- # Asian countries
886
- "CN": "China",
887
- "HK": "HongKong",
888
- "IN": "India",
889
- "IL": "Israel",
890
- "JP": "Japan",
891
- "KZ": "Kazakhstan",
892
- "MY": "Malaysia",
893
- "QA": "Qatar",
894
- "SG": "Singapore",
895
- "KR": "SouthKorea",
896
- "TW": "Taiwan",
897
- "TR": "Turkey",
898
- "VN": "Vietnam",
899
- # Other Oceania countries
900
- "MH": "MarshallIslands",
901
- "NZ": "NewZealand",
902
- # African countries
903
- "DZ": "Algeria",
904
- "AO": "Angola",
905
- "BJ": "Benin",
906
- "CI": "IvoryCoast",
907
- "KE": "Kenya",
908
- "MG": "Madagascar",
909
- "NG": "Nigeria",
910
- "ST": "SaoTomeAndPrincipe",
911
- }
912
-
913
- # ---------------------------------------------------------------------
914
- # 1. Create daily date range from start_date to today
915
- # ---------------------------------------------------------------------
916
- try:
917
- start_dt = pd.to_datetime(start_date)
918
- except ValueError:
919
- raise ValueError(
920
- f"Invalid start_date format: {start_date}. Use 'YYYY-MM-DD'",
921
- )
922
-
923
- end_dt = datetime.today()
924
- # Ensure end date is not before start date
925
- if end_dt < start_dt:
926
- end_dt = start_dt + timedelta(days=1) # Or handle as error if preferred
927
-
928
- date_range = pd.date_range(start=start_dt, end=end_dt, freq="D")
929
- df_daily = pd.DataFrame(date_range, columns=["Date"])
930
-
931
- # ---------------------------------------------------------------------
932
- # 1.1 Identify "week_start" for each daily row, based on week_commencing
933
- # ---------------------------------------------------------------------
934
- start_day_int = day_dict[week_commencing]
935
- df_daily["week_start"] = df_daily["Date"].apply(
936
- lambda x: x - pd.Timedelta(days=(x.weekday() - start_day_int) % 7),
937
- )
938
-
939
- # ---------------------------------------------------------------------
940
- # 1.2 Calculate ISO week number for each DAY (for later aggregation)
941
- # Also calculate Year for each DAY to handle year transitions correctly
942
- # ---------------------------------------------------------------------
943
- df_daily["iso_week_daily"] = df_daily["Date"].dt.isocalendar().week.astype(int)
944
- df_daily["iso_year_daily"] = df_daily["Date"].dt.isocalendar().year.astype(int)
945
-
946
- # ---------------------------------------------------------------------
947
- # 2. Build a weekly index (df_weekly_start) based on unique week_start dates
948
- # ---------------------------------------------------------------------
949
- df_weekly_start = (
950
- df_daily[["week_start"]]
951
- .drop_duplicates()
952
- .sort_values("week_start")
953
- .reset_index(drop=True)
954
- )
955
- df_weekly_start.rename(columns={"week_start": "Date"}, inplace=True)
956
- df_weekly_start.set_index("Date", inplace=True)
957
-
958
- # Create individual weekly dummies (optional, uncomment if needed)
959
- dummy_columns = {}
960
- for i, date_index in enumerate(df_weekly_start.index):
961
- col_name = f"dum_{date_index.strftime('%Y_%m_%d')}"
962
- dummy_columns[col_name] = [0] * len(df_weekly_start)
963
- dummy_columns[col_name][i] = 1
964
- df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
965
- df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
966
-
967
- # ---------------------------------------------------------------------
968
- # 3. Public holidays (daily) using WorkCalendar
969
- # ---------------------------------------------------------------------
970
- start_year = start_dt.year
971
- end_year = end_dt.year
972
- years_range = range(start_year, end_year + 1)
973
-
974
- # Dictionary to store holiday dummies for each country
975
- country_holiday_dummies = {}
976
-
977
- for country_code in countries:
978
- # Skip if country code not found in holiday_country dictionary
979
- if country_code not in holiday_country:
980
- print(
981
- f"Warning: Country code '{country_code}' not found in country code dictionary. Skipping.",
982
- )
983
- continue
984
-
985
- country = holiday_country[country_code]
986
-
987
- # Skip if country not found in continent lookup dictionary
988
- if country not in COUNTRY_TO_CONTINENT:
989
- print(
990
- f"Warning: Country '{country}' not found in continent lookup dictionary. Skipping.",
991
- )
992
- continue
993
-
994
- continent = COUNTRY_TO_CONTINENT[country]
995
- module_path = f"workalendar.{continent}"
996
- try:
997
- module = importlib.import_module(module_path)
998
- calendar_class = getattr(module, country)
999
- cal = calendar_class()
1000
- except (ImportError, AttributeError) as e:
1001
- print(f"Error importing calendar for {country}: {e}. Skipping.")
1002
- continue
1003
-
1004
- # Collect holidays
1005
- holidays_list = []
1006
- for year in years_range:
1007
- holidays_list.extend(cal.holidays(year))
1008
-
1009
- holidays_df = pd.DataFrame(holidays_list, columns=["Date", "Holiday"])
1010
- holidays_df["Date"] = pd.to_datetime(holidays_df["Date"])
1011
-
1012
- # Filter out any holidays with "shift" or "substitute" in their name
1013
- holidays_df = holidays_df[
1014
- ~(
1015
- holidays_df["Holiday"].str.lower().str.contains("shift")
1016
- | holidays_df["Holiday"].str.lower().str.contains("substitute")
1017
- )
1018
- ]
1019
-
1020
- # Filter by date range
1021
- holidays_df = holidays_df[
1022
- (holidays_df["Date"] >= start_dt) & (holidays_df["Date"] <= end_dt)
1023
- ]
1024
- # ---------------------------------------------------------------------
1025
- # 3.1 Additional Public Holidays for Canada due to poor API data
1026
- # ---------------------------------------------------------------------
1027
- if country_code == "CA":
1028
- # Add Canada Day (July 1st) if not already in the list
1029
- for year in years_range:
1030
- canada_day = pd.Timestamp(f"{year}-07-01")
1031
- if canada_day >= start_dt and canada_day <= end_dt:
1032
- if not (
1033
- (holidays_df["Date"] == canada_day)
1034
- & (
1035
- holidays_df["Holiday"]
1036
- .str.lower()
1037
- .str.contains("canada day")
1038
- )
1039
- ).any():
1040
- holidays_df = pd.concat(
1041
- [
1042
- holidays_df,
1043
- pd.DataFrame(
1044
- {
1045
- "Date": [canada_day],
1046
- "Holiday": ["Canada Day"],
1047
- },
1048
- ),
1049
- ],
1050
- ignore_index=True,
1051
- )
1052
-
1053
- # Add Labour Day (first Monday in September)
1054
- for year in years_range:
1055
- # Get first day of September
1056
- first_day = pd.Timestamp(f"{year}-09-01")
1057
- # Calculate days until first Monday (Monday is weekday 0)
1058
- days_until_monday = (7 - first_day.weekday()) % 7
1059
- if days_until_monday == 0: # If first day is already Monday
1060
- labour_day = first_day
1061
- else:
1062
- labour_day = first_day + pd.Timedelta(days=days_until_monday)
1063
-
1064
- if labour_day >= start_dt and labour_day <= end_dt:
1065
- if not (
1066
- (holidays_df["Date"] == labour_day)
1067
- & (
1068
- holidays_df["Holiday"]
1069
- .str.lower()
1070
- .str.contains("labour day")
1071
- )
1072
- ).any():
1073
- holidays_df = pd.concat(
1074
- [
1075
- holidays_df,
1076
- pd.DataFrame(
1077
- {
1078
- "Date": [labour_day],
1079
- "Holiday": ["Labour Day"],
1080
- },
1081
- ),
1082
- ],
1083
- ignore_index=True,
1084
- )
1085
-
1086
- # Add Thanksgiving (second Monday in October)
1087
- for year in years_range:
1088
- # Get first day of October
1089
- first_day = pd.Timestamp(f"{year}-10-01")
1090
- # Calculate days until first Monday
1091
- days_until_monday = (7 - first_day.weekday()) % 7
1092
- if days_until_monday == 0: # If first day is already Monday
1093
- first_monday = first_day
1094
- else:
1095
- first_monday = first_day + pd.Timedelta(days=days_until_monday)
1096
-
1097
- # Second Monday is 7 days after first Monday
1098
- thanksgiving = first_monday + pd.Timedelta(days=7)
1099
-
1100
- if thanksgiving >= start_dt and thanksgiving <= end_dt:
1101
- if not (
1102
- (holidays_df["Date"] == thanksgiving)
1103
- & (
1104
- holidays_df["Holiday"]
1105
- .str.lower()
1106
- .str.contains("thanksgiving")
1107
- )
1108
- ).any():
1109
- holidays_df = pd.concat(
1110
- [
1111
- holidays_df,
1112
- pd.DataFrame(
1113
- {
1114
- "Date": [thanksgiving],
1115
- "Holiday": ["Thanksgiving"],
1116
- },
1117
- ),
1118
- ],
1119
- ignore_index=True,
1120
- )
1121
-
1122
- # Now process the collected holidays and add to df_daily
1123
- for _, row in holidays_df.iterrows():
1124
- holiday_date = row["Date"]
1125
- # Create column name without modifying original holiday names
1126
- holiday_name = row["Holiday"].lower().replace(" ", "_")
1127
-
1128
- # Remove "_shift" or "_substitute" if they appear as standalone suffixes
1129
- if holiday_name.endswith("_shift"):
1130
- holiday_name = holiday_name[:-6]
1131
- elif holiday_name.endswith("_substitute"):
1132
- holiday_name = holiday_name[:-11]
1133
-
1134
- column_name = f"seas_{holiday_name}_{country_code.lower()}"
1135
-
1136
- if column_name not in df_daily.columns:
1137
- df_daily[column_name] = 0
1138
-
1139
- # Mark the specific holiday date
1140
- df_daily.loc[df_daily["Date"] == holiday_date, column_name] = 1
1141
-
1142
- # Also mark a general holiday indicator for each country
1143
- holiday_indicator = f"seas_holiday_{country_code.lower()}"
1144
- if holiday_indicator not in df_daily.columns:
1145
- df_daily[holiday_indicator] = 0
1146
- df_daily.loc[df_daily["Date"] == holiday_date, holiday_indicator] = 1
1147
-
1148
- # ---------------------------------------------------------------------
1149
- # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
1150
- # ---------------------------------------------------------------------
1151
- extra_cols = [
1152
- "seas_valentines_day",
1153
- "seas_halloween",
1154
- "seas_fathers_day_us_uk", # Note: UK/US is 3rd Sun Jun, others vary
1155
- "seas_mothers_day_us", # Note: US is 2nd Sun May
1156
- "seas_mothers_day_uk", # Note: UK Mothering Sunday varies with Easter
1157
- "seas_good_friday",
1158
- "seas_easter_monday",
1159
- "seas_black_friday", # US-centric, but globally adopted
1160
- "seas_cyber_monday", # US-centric, but globally adopted
1161
- ]
1162
- for c in extra_cols:
1163
- if (
1164
- c not in df_daily.columns
1165
- ): # Avoid overwriting if already created by holidays pkg
1166
- df_daily[c] = 0
1167
-
1168
- # Helper: nth_weekday_of_month(year, month, weekday, nth)
1169
- def nth_weekday_of_month(year, month, weekday, nth):
1170
- d = datetime(year, month, 1)
1171
- w = d.weekday()
1172
- delta = (weekday - w + 7) % 7 # Ensure positive delta
1173
- first_weekday = d + timedelta(days=delta)
1174
- target_date = first_weekday + timedelta(days=7 * (nth - 1))
1175
- # Check if the calculated date is still in the same month
1176
- if target_date.month == month:
1177
- return target_date
1178
- # This can happen if nth is too large (e.g., 5th Friday)
1179
- # Return the last occurrence of that weekday in the month instead
1180
- return target_date - timedelta(days=7)
1181
-
1182
- def get_good_friday(year):
1183
- return easter(year) - timedelta(days=2)
1184
-
1185
- def get_easter_monday(year):
1186
- return easter(year) + timedelta(days=1)
1187
-
1188
- def get_black_friday(year):
1189
- # US Thanksgiving is 4th Thursday in November (weekday=3)
1190
- thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
1191
- return thanksgiving + timedelta(days=1)
1192
-
1193
- def get_cyber_monday(year):
1194
- # Monday after US Thanksgiving
1195
- thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
1196
- return thanksgiving + timedelta(days=4)
1197
-
1198
- def get_mothering_sunday_uk(year):
1199
- # Fourth Sunday in Lent (3 weeks before Easter Sunday)
1200
- # Lent starts on Ash Wednesday, 46 days before Easter.
1201
- # Easter Sunday is day 0. Sunday before is -7, etc.
1202
- # 4th Sunday in Lent is 3 weeks before Easter.
1203
- return easter(year) - timedelta(days=21)
1204
-
1205
- # Loop over each year in range
1206
- for yr in range(start_year, end_year + 1):
1207
- try: # Wrap calculations in try-except for robustness
1208
- # Valentines = Feb 14
1209
- valentines_day = datetime(yr, 2, 14)
1210
- # Halloween = Oct 31
1211
- halloween_day = datetime(yr, 10, 31)
1212
- # Father's Day (US & UK) = 3rd Sunday (6) in June
1213
- fathers_day = nth_weekday_of_month(yr, 6, 6, 3)
1214
- # Mother's Day US = 2nd Sunday (6) in May
1215
- mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
1216
- # Mother's Day UK (Mothering Sunday)
1217
- mothering_sunday = get_mothering_sunday_uk(yr)
1218
-
1219
- # Good Friday, Easter Monday
1220
- gf = get_good_friday(yr)
1221
- em = get_easter_monday(yr)
1222
-
1223
- # Black Friday, Cyber Monday
1224
- bf = get_black_friday(yr)
1225
- cm = get_cyber_monday(yr)
1226
-
1227
- # Mark them in df_daily if in range
1228
- special_days_map = [
1229
- (valentines_day, "seas_valentines_day"),
1230
- (halloween_day, "seas_halloween"),
1231
- (fathers_day, "seas_fathers_day_us_uk"),
1232
- (mothers_day_us, "seas_mothers_day_us"),
1233
- (mothering_sunday, "seas_mothers_day_uk"),
1234
- (gf, "seas_good_friday"),
1235
- (em, "seas_easter_monday"),
1236
- (bf, "seas_black_friday"),
1237
- (cm, "seas_cyber_monday"),
1238
- ]
1239
-
1240
- for special_date, col in special_days_map:
1241
- if (
1242
- special_date is not None
1243
- ): # nth_weekday_of_month can return None edge cases
1244
- special_ts = pd.Timestamp(special_date)
1245
- # Only set if it's within the daily range AND column exists
1246
- if (
1247
- (special_ts >= df_daily["Date"].min())
1248
- and (special_ts <= df_daily["Date"].max())
1249
- and (col in df_daily.columns)
1250
- ):
1251
- df_daily.loc[df_daily["Date"] == special_ts, col] = 1
1252
- except Exception as e:
1253
- print(f"Warning: Could not calculate special days for year {yr}: {e}")
1254
-
1255
- # ---------------------------------------------------------------------
1256
- # 4. Add daily indicators for last day & last Friday of month & payday
1257
- # ---------------------------------------------------------------------
1258
- df_daily["is_last_day_of_month"] = df_daily["Date"].dt.is_month_end
1259
-
1260
- def is_last_friday(date):
1261
- # Check if it's a Friday first
1262
- if date.weekday() != 4: # Friday is 4
1263
- return 0
1264
- # Check if next Friday is in the next month
1265
- next_friday = date + timedelta(days=7)
1266
- return 1 if next_friday.month != date.month else 0
1267
-
1268
- def is_payday(date):
1269
- return 1 if date.day >= 25 else 0
1270
-
1271
- df_daily["is_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
1272
-
1273
- df_daily["is_payday"] = df_daily["Date"].apply(is_payday)
1274
-
1275
- # Rename for clarity prefix
1276
- df_daily.rename(
1277
- columns={
1278
- "is_last_day_of_month": "seas_last_day_of_month",
1279
- "is_last_friday_of_month": "seas_last_friday_of_month",
1280
- "is_payday": "seas_payday",
1281
- },
1282
- inplace=True,
1283
- )
1284
-
1285
- # ---------------------------------------------------------------------
1286
- # 5. Weekly aggregation
1287
- # ---------------------------------------------------------------------
1288
-
1289
- # Select only columns that are indicators/flags (intended for max aggregation)
1290
- flag_cols = [
1291
- col
1292
- for col in df_daily.columns
1293
- if (col.startswith("seas_") or col.startswith("is_"))
1294
- and col != "seas_payday"
1295
- ]
1296
- # Ensure 'week_start' is present for grouping
1297
- df_to_agg = df_daily[["week_start"] + flag_cols]
1298
-
1299
- df_weekly_flags = (
1300
- df_to_agg.groupby("week_start")
1301
- .max() # if any day=1 in that week, entire week=1
1302
- .reset_index()
1303
- .rename(columns={"week_start": "Date"})
1304
- .set_index("Date")
1305
- )
1306
-
1307
- # Do specific aggregation for payday
1308
- # Make sure 'date' column exists in df_daily
1309
- df_daily["month"] = df_daily["Date"].dt.month
1310
- df_daily["year"] = df_daily["Date"].dt.year
1311
-
1312
- # Sum of seas_payday flags per week
1313
- week_payday_sum = df_daily.groupby("week_start")["seas_payday"].sum()
1314
-
1315
- # Divide the number of payday flags by number of paydays per month
1316
- payday_days_in_month = df_daily.groupby(["year", "month"])["seas_payday"].sum()
1317
- week_month = df_daily.groupby("week_start").first()[["month", "year"]]
1318
- week_days_in_month = week_month.apply(
1319
- lambda row: payday_days_in_month.loc[(row["year"], row["month"])],
1320
- axis=1,
1321
- )
1322
- df_weekly_flags["seas_payday"] = (
1323
- (week_payday_sum / week_days_in_month).fillna(0).values
1324
- )
1325
-
1326
- # # Drop intermediate columns
1327
- # df_weekly_flags = df_weekly_flags.drop(columns=["month", "year"])
1328
-
1329
- # --- Aggregate Week Number using MODE ---
1330
- # Define aggregation function for mode (handling potential multi-modal cases by taking the first)
1331
- def get_mode(x):
1332
- modes = pd.Series.mode(x)
1333
- return modes[0] if not modes.empty else np.nan # Return first mode or NaN
1334
-
1335
- df_weekly_iso_week_year = (
1336
- df_daily[["week_start", "iso_week_daily", "iso_year_daily"]]
1337
- .groupby("week_start")
1338
- .agg(
1339
- # Find the most frequent week number and year within the group
1340
- Week=("iso_week_daily", get_mode),
1341
- Year=("iso_year_daily", get_mode),
1342
- )
1343
- .reset_index()
1344
- .rename(columns={"week_start": "Date"})
1345
- .set_index("Date")
1346
- )
1347
- # Convert Week/Year back to integer type after aggregation
1348
- df_weekly_iso_week_year["Week"] = df_weekly_iso_week_year["Week"].astype(int)
1349
- df_weekly_iso_week_year["Year"] = df_weekly_iso_week_year["Year"].astype(int)
1350
-
1351
- # --- Monthly dummies (spread evenly across week) ---
1352
- df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
1353
- df_monthly_dummies_daily = pd.get_dummies(
1354
- df_daily[["week_start", "Month"]], # Only need these columns
1355
- prefix="seas_month",
1356
- columns=["Month"],
1357
- dtype=float, # Use float for division
1358
- )
1359
- # Sum daily dummies within the week
1360
- df_monthly_dummies_summed = df_monthly_dummies_daily.groupby("week_start").sum()
1361
- # Divide by number of days in that specific week group (usually 7, except potentially start/end)
1362
- days_in_week = df_daily.groupby("week_start").size()
1363
- df_weekly_monthly_dummies = df_monthly_dummies_summed.div(days_in_week, axis=0)
1364
-
1365
- # Reset index to merge
1366
- df_weekly_monthly_dummies.reset_index(inplace=True)
1367
- df_weekly_monthly_dummies.rename(columns={"week_start": "Date"}, inplace=True)
1368
- df_weekly_monthly_dummies.set_index("Date", inplace=True)
1369
-
1370
- # ---------------------------------------------------------------------
1371
- # 6. Combine all weekly components
1372
- # ---------------------------------------------------------------------
1373
- # Start with the basic weekly index
1374
- df_combined = df_weekly_start.copy()
1375
-
1376
- # Join the other aggregated DataFrames
1377
- df_combined = df_combined.join(df_weekly_flags, how="left")
1378
- df_combined = df_combined.join(df_weekly_iso_week_year, how="left")
1379
- df_combined = df_combined.join(df_weekly_monthly_dummies, how="left")
1380
-
1381
- # Fill potential NaNs created by joins (e.g., if a flag column didn't exist) with 0
1382
- # Exclude 'Week' and 'Year' which should always be present
1383
- cols_to_fill = df_combined.columns.difference(["Week", "Year"])
1384
- df_combined[cols_to_fill] = df_combined[cols_to_fill].fillna(0)
1385
-
1386
- # Ensure correct types for flag columns (int)
1387
- for col in df_weekly_flags.columns:
1388
- if col != "seas_payday":
1389
- if col in df_combined.columns:
1390
- df_combined[col] = df_combined[col].astype(int)
1391
-
1392
- # Ensure correct types for month columns (float)
1393
- for col in df_weekly_monthly_dummies.columns:
1394
- if col in df_combined.columns:
1395
- df_combined[col] = df_combined[col].astype(float)
1396
-
1397
- # ---------------------------------------------------------------------
1398
- # 7. Create weekly dummies for Week of Year & yearly dummies from aggregated cols
1399
- # ---------------------------------------------------------------------
1400
- df_combined.reset_index(inplace=True) # 'Date', 'Week', 'Year' become columns
1401
-
1402
- # Create dummies from the aggregated 'Week' column
1403
- df_combined = pd.get_dummies(
1404
- df_combined,
1405
- prefix="seas",
1406
- columns=["Week"],
1407
- dtype=int,
1408
- prefix_sep="_",
1409
- )
1410
-
1411
- # Create dummies from the aggregated 'Year' column
1412
- df_combined = pd.get_dummies(
1413
- df_combined,
1414
- prefix="seas",
1415
- columns=["Year"],
1416
- dtype=int,
1417
- prefix_sep="_",
1418
- )
1419
-
1420
- # ---------------------------------------------------------------------
1421
- # 8. Add constant & trend
1422
- # ---------------------------------------------------------------------
1423
- df_combined["Constant"] = 1
1424
- df_combined.reset_index(
1425
- drop=True,
1426
- inplace=True,
1427
- ) # Ensure index is 0, 1, 2... for trend
1428
- df_combined["Trend"] = df_combined.index + 1
1429
-
1430
- # ---------------------------------------------------------------------
1431
- # 9. Rename Date -> OBS and select final columns
1432
- # ---------------------------------------------------------------------
1433
- df_combined.rename(columns={"Date": "OBS"}, inplace=True)
1434
-
1435
- # Reorder columns - OBS first, then Constant, Trend, then seasonal features
1436
- cols_order = (
1437
- ["OBS", "Constant", "Trend"]
1438
- + sorted([col for col in df_combined.columns if col.startswith("seas_")])
1439
- + sorted([col for col in df_combined.columns if col.startswith("dum_")])
1440
- ) # If individual week dummies were enabled
1441
-
1442
- # Filter out columns not in the desired order list (handles case where dum_ cols are off)
1443
- final_cols = [col for col in cols_order if col in df_combined.columns]
1444
- df_combined = df_combined[final_cols]
1445
-
1446
- return df_combined
1447
-
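A hedged usage sketch for pull_seasonality. Country codes follow the holiday_country mapping above (e.g. "GB", "US", "CA"); the output contains 'OBS', 'Constant', 'Trend' and the seas_* columns built above:

pull = datapull()  # assumes the class above is in scope
season = pull.pull_seasonality("mon", "2021-01-01", ["GB", "US"])
# seas_payday is a share, not a dummy: paydays (day of month >= 25) falling inside the week,
# divided by the number of such days in the month of the week's first day, so it lies in [0, 1].
print(season[["OBS", "Constant", "Trend", "seas_payday"]].head())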
1448
- def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
1449
- """
1450
- Pull weather data for a given week-commencing day and one or more country codes.
1452
- LOGIC:
1453
- 1) For non-US countries (AU, GB, DE, CA, ZA):
1454
- - Mesonet => max_temp_f, min_temp_f -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', etc.
1455
- - Open-Meteo => precipitation_sum => 'avg_rain_sum', snowfall_sum => 'avg_snow_sum'.
1456
- - Merge, then rename columns with prefix 'seas_{country}_'.
1457
-
1458
- 2) For the US:
1459
- - We have multiple <STATE>_ASOS networks (e.g. CA_ASOS, TX_ASOS).
1460
- - For each state, fetch from Mesonet => max_temp_f, min_temp_f, precip_in, snow_in -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', 'avg_rain_sum', 'avg_snow_sum', etc.
1461
- - Rename columns for each state with prefix 'seas_us_{state}_'.
1462
- - Merge all states (and countries) into a single DataFrame.
1463
-
1464
- :param week_commencing: A string in {"mon","tue","wed","thu","fri","sat","sun"}.
- :param start_date: Dataset start date in the format "YYYY-MM-DD".
1465
- :param country_codes: A list of 2-letter country codes or a single string, e.g. ["GB","US"].
1466
- :return: A single Pandas DataFrame with weekly-aggregated data for all requested countries.
1467
- """
1468
- # ------------------------------------------------------------------ #
1469
- # 0) Handle either a single code or list of codes
1470
- # ------------------------------------------------------------------ #
1471
- if isinstance(country_codes, str):
1472
- country_codes = [country_codes]
1473
- elif not isinstance(country_codes, (list, tuple)):
1474
- raise ValueError("country_codes must be a list/tuple or a single string.")
1475
-
1476
- # --- Setup / Constants --- #
1477
- day_dict = {
1478
- "mon": 0,
1479
- "tue": 1,
1480
- "wed": 2,
1481
- "thu": 3,
1482
- "fri": 4,
1483
- "sat": 5,
1484
- "sun": 6,
1485
- }
1486
- # Map each 2-letter code to a key
1487
- country_dict = {
1488
- "US": "US_STATES",
1489
- "CA": "Canada",
1490
- "AU": "AU__ASOS",
1491
- "GB": "GB__ASOS",
1492
- "DE": "DE__ASOS",
1493
- "ZA": "ZA__ASOS",
1494
- }
1495
-
1496
- # Station-based countries for Mesonet
1497
- station_map = {
1498
- "GB__ASOS": [
1499
- "&stations=EGCC",
1500
- "&stations=EGNM",
1501
- "&stations=EGBB",
1502
- "&stations=EGSH",
1503
- "&stations=EGFF",
1504
- "&stations=EGHI",
1505
- "&stations=EGLC",
1506
- "&stations=EGHQ",
1507
- "&stations=EGAC",
1508
- "&stations=EGPF",
1509
- "&stations=EGGD",
1510
- "&stations=EGPE",
1511
- "&stations=EGNT",
1512
- ],
1513
- "AU__ASOS": [
1514
- "&stations=YPDN",
1515
- "&stations=YBCS",
1516
- "&stations=YBBN",
1517
- "&stations=YSSY",
1518
- "&stations=YSSY",
1519
- "&stations=YMEN",
1520
- "&stations=YPAD",
1521
- "&stations=YPPH",
1522
- ],
1523
- "DE__ASOS": [
1524
- "&stations=EDDL",
1525
- "&stations=EDDH",
1526
- "&stations=EDDB",
1527
- "&stations=EDDN",
1528
- "&stations=EDDF",
1529
- "&stations=EDDK",
1530
- "&stations=EDLW",
1531
- "&stations=EDDM",
1532
- ],
1533
- # Example: if ZA is also station-based, add it here.
1534
- "ZA__ASOS": [
1535
- # If you know the station codes, add them here:
1536
- # e.g. "&stations=FACT", "&stations=FAJS", ...
1537
- ],
1538
- # "FR__ASOS" if you need France, etc.
1539
- }
1540
-
1541
- # Non-US countries that also fetch RAIN & SNOW from Open-Meteo
1542
- rainfall_city_map = {
1543
- "GB__ASOS": [
1544
- "Manchester",
1545
- "Leeds",
1546
- "Birmingham",
1547
- "London",
1548
- "Glasgow",
1549
- ],
1550
- "AU__ASOS": [
1551
- "Darwin",
1552
- "Cairns",
1553
- "Brisbane",
1554
- "Sydney",
1555
- "Melbourne",
1556
- "Adelaide",
1557
- "Perth",
1558
- ],
1559
- "DE__ASOS": [
1560
- "Dortmund",
1561
- "Düsseldorf",
1562
- "Frankfurt",
1563
- "Munich",
1564
- "Cologne",
1565
- "Berlin",
1566
- "Hamburg",
1567
- "Nuernberg",
1568
- ],
1569
- "ZA__ASOS": ["Johannesburg", "Cape Town", "Durban", "Pretoria"],
1570
- }
1571
-
1572
- # Canada sub-networks
1573
- institute_vector = [
1574
- "CA_NB_ASOS",
1575
- "CA_NF_ASOS",
1576
- "CA_NT_ASOS",
1577
- "CA_NS_ASOS",
1578
- "CA_NU_ASOS",
1579
- ]
1580
- stations_list_canada = [
1581
- [
1582
- "&stations=CYQM",
1583
- "&stations=CERM",
1584
- "&stations=CZCR",
1585
- "&stations=CZBF",
1586
- "&stations=CYFC",
1587
- "&stations=CYCX",
1588
- ],
1589
- [
1590
- "&stations=CWZZ",
1591
- "&stations=CYDP",
1592
- "&stations=CYMH",
1593
- "&stations=CYAY",
1594
- "&stations=CWDO",
1595
- "&stations=CXTP",
1596
- "&stations=CYJT",
1597
- "&stations=CYYR",
1598
- "&stations=CZUM",
1599
- "&stations=CYWK",
1600
- "&stations=CYWK",
1601
- ],
1602
- [
1603
- "&stations=CYHI",
1604
- "&stations=CZCP",
1605
- "&stations=CWLI",
1606
- "&stations=CWND",
1607
- "&stations=CXTV",
1608
- "&stations=CYVL",
1609
- "&stations=CYCO",
1610
- "&stations=CXDE",
1611
- "&stations=CYWE",
1612
- "&stations=CYLK",
1613
- "&stations=CWID",
1614
- "&stations=CYRF",
1615
- "&stations=CXYH",
1616
- "&stations=CYWY",
1617
- "&stations=CWMT",
1618
- ],
1619
- [
1620
- "&stations=CWEF",
1621
- "&stations=CXIB",
1622
- "&stations=CYQY",
1623
- "&stations=CYPD",
1624
- "&stations=CXNP",
1625
- "&stations=CXMY",
1626
- "&stations=CYAW",
1627
- "&stations=CWKG",
1628
- "&stations=CWVU",
1629
- "&stations=CXLB",
1630
- "&stations=CWSA",
1631
- "&stations=CWRN",
1632
- ],
1633
- [
1634
- "&stations=CYLT",
1635
- "&stations=CWEU",
1636
- "&stations=CWGZ",
1637
- "&stations=CYIO",
1638
- "&stations=CXSE",
1639
- "&stations=CYCB",
1640
- "&stations=CWIL",
1641
- "&stations=CXWB",
1642
- "&stations=CYZS",
1643
- "&stations=CWJC",
1644
- "&stations=CYFB",
1645
- "&stations=CWUW",
1646
- ],
1647
- ]
1648
-
1649
- # US states and stations - each sub-network
1650
- us_state_networks = {
1651
- state: f"{state}_ASOS"
1652
- for state in [
1653
- "AL",
1654
- "AR",
1655
- "AZ",
1656
- "CA",
1657
- "CO",
1658
- "CT",
1659
- "DE",
1660
- "FL",
1661
- "GA",
1662
- "IA",
1663
- "ID",
1664
- "IL",
1665
- "IN",
1666
- "KS",
1667
- "KY",
1668
- "LA",
1669
- "MA",
1670
- "MD",
1671
- "ME",
1672
- "MI",
1673
- "MN",
1674
- "MO",
1675
- "MS",
1676
- "MT",
1677
- "NC",
1678
- "ND",
1679
- "NE",
1680
- "NH",
1681
- "NJ",
1682
- "NM",
1683
- "NV",
1684
- "NY",
1685
- "OH",
1686
- "OK",
1687
- "OR",
1688
- "PA",
1689
- "RI",
1690
- "SC",
1691
- "SD",
1692
- "TN",
1693
- "TX",
1694
- "UT",
1695
- "VA",
1696
- "VT",
1697
- "WA",
1698
- "WI",
1699
- "WV",
1700
- "WY",
1701
- ]
1702
- }
1703
-
1704
- us_stations_map = {
1705
- "AL_ASOS": [
1706
- "&stations=BHM",
1707
- "&stations=HSV",
1708
- "&stations=MGM",
1709
- "&stations=MOB",
1710
- "&stations=TCL",
1711
- ],
1712
- "AR_ASOS": [
1713
- "&stations=LIT",
1714
- "&stations=FSM",
1715
- "&stations=TXK",
1716
- "&stations=HOT",
1717
- "&stations=FYV",
1718
- ],
1719
- "AZ_ASOS": [
1720
- "&stations=PHX",
1721
- "&stations=TUS",
1722
- "&stations=FLG",
1723
- "&stations=YUM",
1724
- "&stations=PRC",
1725
- ],
1726
- "CA_ASOS": [
1727
- "&stations=LAX",
1728
- "&stations=SAN",
1729
- "&stations=SJC",
1730
- "&stations=SFO",
1731
- "&stations=FAT",
1732
- ],
1733
- "CO_ASOS": [
1734
- "&stations=DEN",
1735
- "&stations=COS",
1736
- "&stations=GJT",
1737
- "&stations=PUB",
1738
- "&stations=ASE",
1739
- ],
1740
- "CT_ASOS": [
1741
- "&stations=BDL",
1742
- "&stations=HVN",
1743
- "&stations=BDR",
1744
- "&stations=GON",
1745
- "&stations=HFD",
1746
- ],
1747
- "DE_ASOS": ["&stations=ILG", "&stations=GED", "&stations=DOV"],
1748
- "FL_ASOS": [
1749
- "&stations=MIA",
1750
- "&stations=TPA",
1751
- "&stations=ORL",
1752
- "&stations=JAX",
1753
- "&stations=TLH",
1754
- ],
1755
- "GA_ASOS": [
1756
- "&stations=ATL",
1757
- "&stations=SAV",
1758
- "&stations=CSG",
1759
- "&stations=MCN",
1760
- "&stations=AGS",
1761
- ],
1762
- "IA_ASOS": [
1763
- "&stations=DSM",
1764
- "&stations=CID",
1765
- "&stations=DBQ",
1766
- "&stations=ALO",
1767
- "&stations=SUX",
1768
- ],
1769
- "ID_ASOS": [
1770
- "&stations=BOI",
1771
- "&stations=IDA",
1772
- "&stations=PIH",
1773
- "&stations=SUN",
1774
- "&stations=COE",
1775
- ],
1776
- "IL_ASOS": [
1777
- "&stations=ORD",
1778
- "&stations=MDW",
1779
- "&stations=PIA",
1780
- "&stations=SPI",
1781
- "&stations=MLI",
1782
- ],
1783
- "IN_ASOS": [
1784
- "&stations=IND",
1785
- "&stations=FWA",
1786
- "&stations=SBN",
1787
- "&stations=EVV",
1788
- "&stations=HUF",
1789
- ],
1790
- "KS_ASOS": [
1791
- "&stations=ICT",
1792
- "&stations=FOE",
1793
- "&stations=GCK",
1794
- "&stations=HYS",
1795
- "&stations=SLN",
1796
- ],
1797
- "KY_ASOS": [
1798
- "&stations=SDF",
1799
- "&stations=LEX",
1800
- "&stations=CVG",
1801
- "&stations=PAH",
1802
- "&stations=BWG",
1803
- ],
1804
- "LA_ASOS": [
1805
- "&stations=MSY",
1806
- "&stations=SHV",
1807
- "&stations=LFT",
1808
- "&stations=BTR",
1809
- "&stations=MLU",
1810
- ],
1811
- "MA_ASOS": [
1812
- "&stations=BOS",
1813
- "&stations=ORH",
1814
- "&stations=HYA",
1815
- "&stations=ACK",
1816
- "&stations=BED",
1817
- ],
1818
- "MD_ASOS": [
1819
- "&stations=BWI",
1820
- "&stations=MTN",
1821
- "&stations=SBY",
1822
- "&stations=HGR",
1823
- "&stations=ADW",
1824
- ],
1825
- "ME_ASOS": [
1826
- "&stations=PWM",
1827
- "&stations=BGR",
1828
- "&stations=CAR",
1829
- "&stations=PQI",
1830
- "&stations=RKD",
1831
- ],
1832
- "MI_ASOS": [
1833
- "&stations=DTW",
1834
- "&stations=GRR",
1835
- "&stations=FNT",
1836
- "&stations=LAN",
1837
- "&stations=MKG",
1838
- ],
1839
- "MN_ASOS": [
1840
- "&stations=MSP",
1841
- "&stations=DLH",
1842
- "&stations=RST",
1843
- "&stations=STC",
1844
- "&stations=INL",
1845
- ],
1846
- "MO_ASOS": [
1847
- "&stations=STL",
1848
- "&stations=MCI",
1849
- "&stations=SGF",
1850
- "&stations=COU",
1851
- "&stations=JLN",
1852
- ],
1853
- "MS_ASOS": [
1854
- "&stations=JAN",
1855
- "&stations=GPT",
1856
- "&stations=MEI",
1857
- "&stations=PIB",
1858
- "&stations=GLH",
1859
- ],
1860
- "MT_ASOS": [
1861
- "&stations=BIL",
1862
- "&stations=MSO",
1863
- "&stations=GTF",
1864
- "&stations=HLN",
1865
- "&stations=BZN",
1866
- ],
1867
- "NC_ASOS": [
1868
- "&stations=CLT",
1869
- "&stations=RDU",
1870
- "&stations=GSO",
1871
- "&stations=ILM",
1872
- "&stations=AVL",
1873
- ],
1874
- "ND_ASOS": [
1875
- "&stations=BIS",
1876
- "&stations=FAR",
1877
- "&stations=GFK",
1878
- "&stations=ISN",
1879
- "&stations=JMS",
1880
- ],
1881
- "NE_ASOS": ["&stations=OMA"],
1882
- "NH_ASOS": [
1883
- "&stations=MHT",
1884
- "&stations=PSM",
1885
- "&stations=CON",
1886
- "&stations=LEB",
1887
- "&stations=ASH",
1888
- ],
1889
- "NJ_ASOS": [
1890
- "&stations=EWR",
1891
- "&stations=ACY",
1892
- "&stations=TTN",
1893
- "&stations=MMU",
1894
- "&stations=TEB",
1895
- ],
1896
- "NM_ASOS": [
1897
- "&stations=ABQ",
1898
- "&stations=SAF",
1899
- "&stations=ROW",
1900
- "&stations=HOB",
1901
- "&stations=FMN",
1902
- ],
1903
- "NV_ASOS": ["&stations=LAS"],
1904
- "NY_ASOS": [
1905
- "&stations=JFK",
1906
- "&stations=LGA",
1907
- "&stations=BUF",
1908
- "&stations=ALB",
1909
- "&stations=SYR",
1910
- ],
1911
- "OH_ASOS": ["&stations=CMH"],
1912
- "OK_ASOS": [
1913
- "&stations=OKC",
1914
- "&stations=TUL",
1915
- "&stations=LAW",
1916
- "&stations=SWO",
1917
- "&stations=PNC",
1918
- ],
1919
- "OR_ASOS": ["&stations=PDX"],
1920
- "PA_ASOS": [
1921
- "&stations=PHL",
1922
- "&stations=PIT",
1923
- "&stations=ERI",
1924
- "&stations=MDT",
1925
- "&stations=AVP",
1926
- ],
1927
- "RI_ASOS": ["&stations=PVD", "&stations=WST", "&stations=UUU"],
1928
- "SC_ASOS": [
1929
- "&stations=CHS",
1930
- "&stations=CAE",
1931
- "&stations=GSP",
1932
- "&stations=MYR",
1933
- "&stations=FLO",
1934
- ],
1935
- "SD_ASOS": [
1936
- "&stations=FSD",
1937
- "&stations=RAP",
1938
- "&stations=PIR",
1939
- "&stations=ABR",
1940
- "&stations=YKN",
1941
- ],
1942
- "TN_ASOS": [
1943
- "&stations=BNA",
1944
- "&stations=MEM",
1945
- "&stations=TYS",
1946
- "&stations=CHA",
1947
- "&stations=TRI",
1948
- ],
1949
- "TX_ASOS": [
1950
- "&stations=DFW",
1951
- "&stations=IAH",
1952
- "&stations=AUS",
1953
- "&stations=SAT",
1954
- "&stations=ELP",
1955
- ],
1956
- "UT_ASOS": [
1957
- "&stations=SLC",
1958
- "&stations=OGD",
1959
- "&stations=PVU",
1960
- "&stations=SGU",
1961
- "&stations=CNY",
1962
- ],
1963
- "VA_ASOS": [
1964
- "&stations=DCA",
1965
- "&stations=RIC",
1966
- "&stations=ROA",
1967
- "&stations=ORF",
1968
- "&stations=SHD",
1969
- ],
1970
- "VT_ASOS": [
1971
- "&stations=BTV",
1972
- "&stations=MPV",
1973
- "&stations=RUT",
1974
- "&stations=VSF",
1975
- "&stations=MVL",
1976
- ],
1977
- "WA_ASOS": [
1978
- "&stations=SEA",
1979
- "&stations=GEG",
1980
- "&stations=TIW",
1981
- "&stations=VUO",
1982
- "&stations=BFI",
1983
- ],
1984
- "WI_ASOS": [
1985
- "&stations=MKE",
1986
- "&stations=MSN",
1987
- "&stations=GRB",
1988
- "&stations=EAU",
1989
- "&stations=LSE",
1990
- ],
1991
- "WV_ASOS": [
1992
- "&stations=CRW",
1993
- "&stations=CKB",
1994
- "&stations=HTS",
1995
- "&stations=MGW",
1996
- "&stations=BKW",
1997
- ],
1998
- "WY_ASOS": [
1999
- "&stations=CPR",
2000
- "&stations=JAC",
2001
- "&stations=SHR",
2002
- "&stations=COD",
2003
- "&stations=RKS",
2004
- ],
2005
- }
2006
- # --- Date setup --- #
2007
- date_object = datetime.strptime(start_date, "%Y-%m-%d")
2008
- start_day = date_object.day
2009
- start_month = date_object.month
2010
- start_year = date_object.year
2011
- formatted_date = f"{start_year:04d}-01-01" # e.g. "2020-01-01"; the Open-Meteo pull always starts on 1 Jan of the start year
2012
- today = datetime.now()
2013
- end_day, end_month, end_year = today.day, today.month, today.year
2014
-
2015
- # ------------------------------------------------------------------ #
2016
- # Utility functions
2017
- # ------------------------------------------------------------------ #
2018
- def convert_f_to_c(series_f: pd.Series) -> pd.Series:
2019
- """Convert Fahrenheit to Celsius."""
2020
- return (series_f - 32) * 5.0 / 9.0
2021
-
2022
- def fetch_mesonet_data(network: str, stations: list) -> pd.DataFrame:
2023
- """Fetch station-based data (daily) from Iowa Mesonet."""
2024
- import csv
2025
-
2026
- station_query = "".join(stations)
2027
- url = (
2028
- "https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
2029
- f"network={network}{station_query}"
2030
- f"&year1={start_year}&month1={start_month}&day1={start_day}"
2031
- f"&year2={end_year}&month2={end_month}&day2={end_day}"
2032
- )
2033
- with urllib.request.urlopen(url) as f:
2034
- df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
2035
- return df
2036
-
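For reference, the request this helper builds is a plain GET against the Mesonet daily.py endpoint. A minimal sketch of the URL for one hypothetical GB station and date range (the station code and dates below are illustrative, not taken from the maps above):

    network = "GB__ASOS"
    stations = ["&stations=EGCC"]  # illustrative station code
    url = (
        "https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
        f"network={network}{''.join(stations)}"
        "&year1=2020&month1=1&day1=1"
        "&year2=2024&month2=6&day2=30"
    )
    print(url)  # the response comes back as one CSV row per station per day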
2037
- def fetch_canada_data() -> pd.DataFrame:
2038
- """Canada uses multiple sub-networks. Combine them all."""
2039
- import csv
2040
-
2041
- final_df = pd.DataFrame()
2042
- for i, institute_temp in enumerate(institute_vector):
2043
- station_query_temp = "".join(stations_list_canada[i])
2044
- mesonet_url = (
2045
- "https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
2046
- f"network={institute_temp}{station_query_temp}"
2047
- f"&year1={start_year}&month1={start_month}&day1={start_day}"
2048
- f"&year2={end_year}&month2={end_month}&day2={end_day}"
2049
- )
2050
- with urllib.request.urlopen(mesonet_url) as f:
2051
- temp_df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
2052
-
2053
- if not temp_df.empty:
2054
- final_df = pd.concat([final_df, temp_df], ignore_index=True)
2055
- return final_df
2056
-
2057
- def fetch_openmeteo_rain_snow(cities: list) -> pd.DataFrame:
2058
- """
2059
- Fetch daily precipitation_sum (rain) and snowfall_sum (snow) from Open-Meteo.
2060
- Returns columns: ["date", "rain_sum", "snow_sum", "city"] for each day.
2061
- We'll then do a weekly aggregator that yields avg_rain_sum, avg_snow_sum.
2062
- """
2063
- weather_data_list = []
2064
- geolocator = Nominatim(user_agent="MyApp")
2065
-
2066
- for city in cities:
2067
- loc = geolocator.geocode(city)
2068
- if not loc:
2069
- print(f"Could not find location for {city}, skipping.")
2070
- continue
2071
-
2072
- url = "https://archive-api.open-meteo.com/v1/archive"
2073
- params = {
2074
- "latitude": loc.latitude,
2075
- "longitude": loc.longitude,
2076
- "start_date": formatted_date,
2077
- "end_date": today.strftime("%Y-%m-%d"),
2078
- "daily": "precipitation_sum,snowfall_sum",
2079
- "timezone": "auto",
2080
- }
2081
- resp = requests.get(url, params=params)
2082
- if resp.status_code != 200:
2083
- print(
2084
- f"[ERROR] open-meteo returned status {resp.status_code} for city={city}",
2085
- )
2086
- continue
2087
- try:
2088
- data_json = resp.json()
2089
- except ValueError:
2090
- print(f"[ERROR] invalid JSON from open-meteo for city={city}")
2091
- continue
2092
-
2093
- daily_block = data_json.get("daily", {})
2094
- if not {"time", "precipitation_sum", "snowfall_sum"}.issubset(
2095
- daily_block.keys(),
2096
- ):
2097
- print(
2098
- f"[ERROR] missing required keys in open-meteo for city={city}",
2099
- )
2100
- continue
2101
-
2102
- df_temp = pd.DataFrame(
2103
- {
2104
- "date": daily_block["time"],
2105
- "rain_sum": daily_block["precipitation_sum"],
2106
- "snow_sum": daily_block["snowfall_sum"],
2107
- },
2108
- )
2109
- df_temp["city"] = city
2110
- weather_data_list.append(df_temp)
2111
-
2112
- if weather_data_list:
2113
- return pd.concat(weather_data_list, ignore_index=True)
2114
- return pd.DataFrame()
2115
-
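A side note on the geocoding step: the public Nominatim endpoint expects roughly one request per second, and geolocator.geocode returns None for unresolvable names (which the loop above already skips). A minimal sketch, assuming geopy's RateLimiter is acceptable here, of spacing the calls out:

    from geopy.extra.rate_limiter import RateLimiter
    from geopy.geocoders import Nominatim

    geolocator = Nominatim(user_agent="MyApp")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)  # ~1 request per second
    loc = geocode("Manchester")
    if loc is not None:
        print(loc.latitude, loc.longitude)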
2116
- def weekly_aggregate_temp_mesonet(df: pd.DataFrame) -> pd.DataFrame:
2117
- """
2118
- For non-US Mesonet data, keep only max_temp_f and min_temp_f,
2119
- then compute mean_temp_f, convert each to Celsius, and take weekly averages.
2120
- """
2121
- import pandas as pd
2122
-
2123
- # Convert day col
2124
- if "day" not in df.columns:
2125
- return pd.DataFrame()
2126
-
2127
- # Only keep relevant columns
2128
- keep_cols = []
2129
- for c in ["day", "max_temp_f", "min_temp_f"]:
2130
- if c in df.columns:
2131
- keep_cols.append(c)
2132
- df = df[keep_cols].copy()
2133
-
2134
- # Convert "None" => numeric
2135
- for c in ["max_temp_f", "min_temp_f"]:
2136
- if c in df.columns:
2137
- df[c] = df[c].replace("None", pd.NA)
2138
- df[c] = pd.to_numeric(df[c], errors="coerce")
2139
-
2140
- df["day"] = pd.to_datetime(df["day"], errors="coerce")
2141
- df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
2142
- df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
2143
- df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
2144
- df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
2145
-
2146
- # Group by "week_starting"
2147
- df["week_starting"] = df["day"].apply(
2148
- lambda x: x
2149
- - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
2150
- if pd.notnull(x)
2151
- else pd.NaT,
2152
- )
2153
- numeric_cols = df.select_dtypes(include="number").columns
2154
- weekly = df.groupby("week_starting")[numeric_cols].mean()
2155
-
2156
- # Rename columns
2157
- rename_map = {
2158
- "max_temp_f": "avg_max_temp_f",
2159
- "min_temp_f": "avg_min_temp_f",
2160
- "mean_temp_f": "avg_mean_temp_f",
2161
- "max_temp_c": "avg_max_temp_c",
2162
- "min_temp_c": "avg_min_temp_c",
2163
- "mean_temp_c": "avg_mean_temp_c",
2164
- }
2165
- weekly.rename(columns=rename_map, inplace=True)
2166
-
2167
- # Return as a DataFrame w/ index = week_starting
2168
- return weekly
2169
-
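The week_starting lambda above snaps each date back to the most recent chosen week-start day via (weekday - day_dict[week_commencing]) % 7. A worked example, assuming day_dict maps 'mon'..'sun' to 0..6 like the days_map used later in this module (day_dict itself is defined outside this excerpt):

    import pandas as pd

    day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}  # assumed mapping

    def week_starting(ts: pd.Timestamp, week_commencing: str = "mon") -> pd.Timestamp:
        # Subtract the number of days since the chosen week-start day.
        return ts - pd.Timedelta(days=(ts.weekday() - day_dict[week_commencing]) % 7)

    print(week_starting(pd.Timestamp("2024-05-16")))  # Thursday -> 2024-05-13, the preceding Monday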
2170
- def weekly_aggregate_rain_snow_openmeteo(df: pd.DataFrame) -> pd.DataFrame:
2171
- """
2172
- For non-US countries, Open-Meteo provides daily columns 'date', 'rain_sum', 'snow_sum'.
2173
- Take the weekly average of each -> 'avg_rain_sum', 'avg_snow_sum'.
2174
- """
2175
- import pandas as pd
2176
-
2177
- if "date" not in df.columns:
2178
- return pd.DataFrame()
2179
-
2180
- df["date"] = pd.to_datetime(df["date"], errors="coerce")
2181
- df["week_starting"] = df["date"].apply(
2182
- lambda x: x
2183
- - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
2184
- if pd.notnull(x)
2185
- else pd.NaT,
2186
- )
2187
-
2188
- # Convert to numeric
2189
- for c in ["rain_sum", "snow_sum"]:
2190
- if c in df.columns:
2191
- df[c] = pd.to_numeric(df[c], errors="coerce")
2192
-
2193
- numeric_cols = df.select_dtypes(include="number").columns
2194
- weekly = df.groupby("week_starting")[numeric_cols].mean()
2195
-
2196
- rename_map = {"rain_sum": "avg_rain_sum", "snow_sum": "avg_snow_sum"}
2197
- weekly.rename(columns=rename_map, inplace=True)
2198
- return weekly
2199
-
2200
- def weekly_aggregate_us(df: pd.DataFrame) -> pd.DataFrame:
2201
- """
2202
- For US Mesonet data (per state), we keep max_temp_f, min_temp_f, precip_in, snow_in,
2203
- then compute mean_temp_f, convert to Celsius, and group weekly.
2204
- We'll rename:
2205
- max_temp_f -> avg_max_temp_f
2206
- min_temp_f -> avg_min_temp_f
2207
- mean_temp_f -> avg_mean_temp_f
2208
- precip_in -> avg_rain_sum
2209
- snow_in -> avg_snow_sum
2210
- """
2211
- import pandas as pd
2212
-
2213
- if "day" not in df.columns:
2214
- return pd.DataFrame()
2215
-
2216
- # Convert day
2217
- df["day"] = pd.to_datetime(df["day"], errors="coerce")
2218
-
2219
- # Convert "None" => numeric
2220
- for c in ["max_temp_f", "min_temp_f", "precip_in", "snow_in"]:
2221
- if c in df.columns:
2222
- df[c] = df[c].replace("None", pd.NA)
2223
- df[c] = pd.to_numeric(df[c], errors="coerce")
2224
-
2225
- # Compute mean_temp_f, celsius
2226
- df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
2227
- df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
2228
- df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
2229
- df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
2230
-
2231
- # Weekly grouping
2232
- df["week_starting"] = df["day"].apply(
2233
- lambda x: x
2234
- - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
2235
- if pd.notnull(x)
2236
- else pd.NaT,
2237
- )
2238
- numeric_cols = df.select_dtypes(include="number").columns
2239
- weekly = df.groupby("week_starting")[numeric_cols].mean()
2240
-
2241
- rename_map = {
2242
- "max_temp_f": "avg_max_temp_f",
2243
- "min_temp_f": "avg_min_temp_f",
2244
- "mean_temp_f": "avg_mean_temp_f",
2245
- "max_temp_c": "avg_max_temp_c",
2246
- "min_temp_c": "avg_min_temp_c",
2247
- "mean_temp_c": "avg_mean_temp_c",
2248
- "precip_in": "avg_rain_sum",
2249
- "snow_in": "avg_snow_sum",
2250
- }
2251
- weekly.rename(columns=rename_map, inplace=True)
2252
- return weekly
2253
-
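A unit note on the shared column names: here avg_rain_sum / avg_snow_sum are weekly means of Mesonet's precip_in / snow_in (inches, going by the column names), while the non-US path averages Open-Meteo's precipitation_sum / snowfall_sum, which I believe default to millimetres and centimetres. The same output column can therefore carry different units depending on the country. A tiny conversion sketch if the two ever need to be compared (the unit assumptions are mine, not stated in the code):

    IN_TO_MM = 25.4                  # inches -> millimetres (assumption: precip_in is in inches)
    avg_rain_sum_in = 0.35           # a weekly mean from the US path
    print(round(avg_rain_sum_in * IN_TO_MM, 2))  # 8.89, roughly comparable to an Open-Meteo mm value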
2254
- def rename_with_prefix(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
2255
- """Rename all columns except 'week_starting' or 'OBS' with the given prefix."""
2256
- df2 = df.copy()
2257
- new_cols = {}
2258
- for col in df2.columns:
2259
- if col not in ["week_starting", "OBS"]:
2260
- new_cols[col] = prefix + col
2261
- df2.rename(columns=new_cols, inplace=True)
2262
- return df2
2263
-
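rename_with_prefix leaves 'week_starting' and 'OBS' alone and prepends the prefix to everything else; an equivalent one-liner on a toy frame (the prefix here is hypothetical, the loop below builds it from the country or state code):

    import pandas as pd

    df = pd.DataFrame({"OBS": [pd.Timestamp("2024-01-01")], "avg_mean_temp_c": [4.2]})
    prefix = "seas_gb_"
    renamed = df.rename(
        columns={c: prefix + c for c in df.columns if c not in ("week_starting", "OBS")}
    )
    print(list(renamed.columns))  # ['OBS', 'seas_gb_avg_mean_temp_c']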
2264
- # ------------------------------------------------------------------ #
2265
- # The final combined DataFrame
2266
- # ------------------------------------------------------------------ #
2267
- combined_df = pd.DataFrame()
2268
-
2269
- # ------------------------------------------------------------------ #
2270
- # 1) Loop over each requested country
2271
- # ------------------------------------------------------------------ #
2272
- for country_code in country_codes:
2273
- net = country_dict.get(country_code)
2274
- if net is None:
2275
- print(f"Warning: Invalid country_code '{country_code}' – skipping.")
2276
- continue
2277
-
2278
- # =========================
2279
- # 2) Special Logic for US
2280
- # =========================
2281
- if net == "US_STATES":
2282
- for state_code, network_code in us_state_networks.items():
2283
- stations = us_stations_map.get(network_code, [])
2284
- if not stations:
2285
- print(f"[DEBUG] No stations for {network_code}, skipping.")
2286
- continue
2287
-
2288
- raw_df = fetch_mesonet_data(network_code, stations)
2289
- if raw_df.empty:
2290
- print(f"[DEBUG] DataFrame empty for {network_code}, skipping.")
2291
- continue
2292
-
2293
- weekly_state = weekly_aggregate_us(raw_df)
2294
- if weekly_state.empty:
2295
- print(
2296
- f"[DEBUG] Aggregated weekly DataFrame empty for {network_code}, skipping.",
2297
- )
2298
- continue
2299
-
2300
- weekly_state.reset_index(inplace=True)
2301
- weekly_state.rename(columns={"week_starting": "OBS"}, inplace=True)
2302
-
2303
- # Now rename columns with prefix: seas_us_{statecode}_
2304
- prefix = f"seas_us_{state_code.lower()}_"
2305
- weekly_state = rename_with_prefix(weekly_state, prefix)
2306
-
2307
- # Merge into combined
2308
- if combined_df.empty:
2309
- combined_df = weekly_state
2310
- else:
2311
- combined_df = pd.merge(
2312
- combined_df,
2313
- weekly_state,
2314
- on="OBS",
2315
- how="outer",
2316
- )
2317
-
2318
- # Done with the US. Move on to the next country in the loop
2319
- continue
2320
-
2321
- # =======================================
2322
- # 3) Logic for Non-US (AU, GB, DE, CA, ZA)
2323
- # =======================================
2324
- # A) Fetch temperature data from Mesonet
2325
- if net == "Canada":
2326
- raw_temp = fetch_canada_data()
2327
- else:
2328
- # e.g. "GB__ASOS", "AU__ASOS", "DE__ASOS", "ZA__ASOS" (if added)
2329
- stations = station_map.get(net, [])
2330
- if not stations and net != "ZA__ASOS":
2331
- # If we have no stations for net and it's not ZA,
2332
- # there's no data. (If ZA has stations, add them above.)
2333
- raw_temp = pd.DataFrame()
2334
- else:
2335
- raw_temp = fetch_mesonet_data(net, stations)
2336
-
2337
- weekly_temp = pd.DataFrame()
2338
- if not raw_temp.empty:
2339
- # For these countries, we only keep max_temp_f, min_temp_f, mean_temp_f
2340
- weekly_temp = weekly_aggregate_temp_mesonet(raw_temp)
2341
-
2342
- # B) Fetch rain+snow from Open-Meteo (only if we have an entry in rainfall_city_map)
2343
- weekly_precip = pd.DataFrame()
2344
- if net in rainfall_city_map:
2345
- city_list = rainfall_city_map[net]
2346
- df_rain_snow = fetch_openmeteo_rain_snow(city_list)
2347
- if not df_rain_snow.empty:
2348
- weekly_precip = weekly_aggregate_rain_snow_openmeteo(df_rain_snow)
2349
-
2350
- # C) Merge the temperature data + precip/snow data on the weekly index
2351
- if not weekly_temp.empty and not weekly_precip.empty:
2352
- merged_df = pd.merge(
2353
- weekly_temp,
2354
- weekly_precip,
2355
- left_index=True,
2356
- right_index=True,
2357
- how="outer",
2358
- )
2359
- elif not weekly_temp.empty:
2360
- merged_df = weekly_temp
2361
- else:
2362
- merged_df = weekly_precip
2363
-
2364
- if merged_df.empty:
2365
- print(f"No data retrieved for country: {country_code}")
2366
- continue
2367
-
2368
- # D) Convert index -> a column OBS
2369
- merged_df.reset_index(inplace=True)
2370
- merged_df.rename(columns={"week_starting": "OBS"}, inplace=True)
2371
-
2372
- # E) Rename with prefix = "seas_{country_code}_"
2373
- prefix = f"seas_{country_code.lower()}_"
2374
- merged_df = rename_with_prefix(merged_df, prefix)
2375
-
2376
- # F) Merge into combined_df
2377
- if combined_df.empty:
2378
- combined_df = merged_df
2379
- else:
2380
- combined_df = pd.merge(combined_df, merged_df, on="OBS", how="outer")
2381
-
2382
- # ------------------------------------------------------------------ #
2383
- # 4) Sort final by OBS (optional)
2384
- # ------------------------------------------------------------------ #
2385
- if not combined_df.empty:
2386
- combined_df.sort_values(by="OBS", inplace=True)
2387
-
2388
- return combined_df
2389
-
2390
- def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
2391
- """
2392
- Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
2393
- aggregates it to weekly averages, and renames variables based on specified rules.
2394
-
2395
- Parameters
2396
- ----------
2397
- cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
2398
- week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
2399
- sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
2400
- (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
2401
-
2402
- Returns
2403
- -------
2404
- pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
2405
- and all series as renamed columns (e.g., 'macro_retail_sales_uk').
2406
- Returns an empty DataFrame if no data is fetched or processed.
2407
-
2408
- """
2409
- # Define CDIDs for sectors and defaults
2410
- sector_cdids_map = {
2411
- "fast_food": ["L7TD", "L78Q", "DOAD"],
2412
- "clothing_footwear": ["D7BW", "D7GO", "CHBJ"],
2413
- "fuel": ["A9FS", "L7FP", "CHOL"],
2414
- "cars": ["D7E8", "D7E9", "D7CO"],
2415
- "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
2416
- }
2417
-
2418
- default_cdids = sector_cdids_map["default"]
2419
- sector_specific_cdids = [] # Initialize empty list for sector CDIDs
2420
-
2421
- if sector: # Check if sector is not None or empty
2422
- if isinstance(sector, str):
2423
- # If it's a single string, wrap it in a list
2424
- sector_list = [sector]
2425
- elif isinstance(sector, list):
2426
- # If it's already a list, use it directly
2427
- sector_list = sector
2428
- else:
2429
- raise TypeError(
2430
- "`sector` parameter must be a string or a list of strings.",
2431
- )
2432
-
2433
- # Iterate through the list of sectors and collect their CDIDs
2434
- for sec in sector_list:
2435
- sector_specific_cdids.extend(
2436
- sector_cdids_map.get(sec, []),
2437
- ) # Use extend to add items from the list
2438
-
2439
- # Combine standard CDIDs and any additional user-provided CDIDs
2440
- standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
2441
- if cdid_list is None:
2442
- cdid_list = []
2443
- final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
2444
-
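A worked example of the CDID assembly above (the sector and extra CDIDs are hypothetical inputs; the codes themselves are the ones listed in sector_cdids_map). Because dict.fromkeys preserves insertion order, duplicates are dropped while keeping the first occurrence:

    default_cdids = ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"]
    fast_food = ["L7TD", "L78Q", "DOAD"]   # sector='fast_food'
    extra = ["JP9Z", "UKPOP"]              # user-supplied cdid_list; UKPOP is a deliberate duplicate
    final_cdid_list = list(dict.fromkeys(default_cdids + fast_food + extra))
    print(final_cdid_list)
    # ['D7G7', 'MGSX', 'UKPOP', 'IHYQ', 'YBEZ', 'MS77', 'L7TD', 'L78Q', 'DOAD', 'JP9Z']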
2445
- base_search_url = (
2446
- "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
2447
- )
2448
- base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
2449
- combined_df = pd.DataFrame()
2450
-
2451
- # Map week start day to pandas weekday convention
2452
- days_map = {
2453
- "mon": 0,
2454
- "tue": 1,
2455
- "wed": 2,
2456
- "thu": 3,
2457
- "fri": 4,
2458
- "sat": 5,
2459
- "sun": 6,
2460
- }
2461
- if week_start_day.lower() not in days_map:
2462
- raise ValueError(
2463
- "Invalid week start day. Choose from: " + ", ".join(days_map.keys()),
2464
- )
2465
- week_start = days_map[
2466
- week_start_day.lower()
2467
- ] # Use lower() for case-insensitivity
2468
-
2469
- for cdid in final_cdid_list: # Use the final combined list
2470
- try:
2471
- # Search for the series
2472
- search_url = f"{base_search_url}{cdid}"
2473
- search_response = requests.get(search_url, timeout=30) # Add timeout
2474
- search_response.raise_for_status()
2475
- search_data = search_response.json()
2476
-
2477
- items = search_data.get("items", [])
2478
- if not items:
2479
- print(f"Warning: No data found for CDID: {cdid}")
2480
- continue
2481
-
2482
- # Extract series name and latest release URI
2483
- # Find the item with the most recent release_date
2484
- latest_item = None
2485
- latest_date = None
2486
- for item in items:
2487
- if "release_date" in item:
2488
- try:
2489
- # Ensure timezone awareness for comparison
2490
- current_date = datetime.fromisoformat(
2491
- item["release_date"].replace("Z", "+00:00"),
2492
- )
2493
- if latest_date is None or current_date > latest_date:
2494
- latest_date = current_date
2495
- latest_item = item
2496
- except ValueError:
2497
- print(
2498
- f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}",
2499
- )
2500
- continue # Skip this item if date is invalid
2501
-
2502
- if latest_item is None:
2503
- print(f"Warning: No valid release date found for CDID: {cdid}")
2504
- continue
2505
-
2506
- series_name = latest_item.get(
2507
- "title",
2508
- f"Series_{cdid}",
2509
- ) # Use title from the latest item
2510
- latest_uri = latest_item.get("uri")
2511
- if not latest_uri:
2512
- print(
2513
- f"Warning: No URI found for the latest release of CDID: {cdid}",
2514
- )
2515
- continue
2516
-
2517
- # Fetch the dataset
2518
- data_url = f"{base_data_url}{latest_uri}"
2519
- data_response = requests.get(data_url, timeout=30) # Add timeout
2520
- data_response.raise_for_status()
2521
- data_json = data_response.json()
2522
-
2523
- # Detect the frequency and process accordingly
2524
- frequency_key = None
2525
- if data_json.get("months"):
2526
- frequency_key = "months"
2527
- elif data_json.get("quarters"):
2528
- frequency_key = "quarters"
2529
- elif data_json.get("years"):
2530
- frequency_key = "years"
2531
- else:
2532
- print(
2533
- f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}",
2534
- )
2535
- continue
2536
-
2537
- # Prepare the DataFrame
2538
- if not data_json[frequency_key]: # Check if the list of values is empty
2539
- print(
2540
- f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}",
2541
- )
2542
- continue
2543
-
2544
- df = pd.DataFrame(data_json[frequency_key])
2545
-
2546
- # Check if essential columns exist
2547
- if "date" not in df.columns or "value" not in df.columns:
2548
- print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
2549
- continue
2550
-
2551
- # Parse the 'date' field based on frequency
2552
- try:
2553
- if frequency_key == "months":
2554
- # Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
2555
- df["date"] = pd.to_datetime(
2556
- df["date"],
2557
- format="%Y %b",
2558
- errors="coerce",
2559
- )
2560
- elif frequency_key == "quarters":
2561
-
2562
- def parse_quarter(quarter_str):
2563
- try:
2564
- year, qtr = quarter_str.split(" Q")
2565
- month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
2566
- return datetime(int(year), month, 1)
2567
- except (ValueError, KeyError):
2568
- return pd.NaT # Return Not a Time for parsing errors
2569
-
2570
- df["date"] = df["date"].apply(parse_quarter)
2571
- elif frequency_key == "years":
2572
- df["date"] = pd.to_datetime(
2573
- df["date"],
2574
- format="%Y",
2575
- errors="coerce",
2576
- )
2577
- except Exception as e:
2578
- print(
2579
- f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}",
2580
- )
2581
- continue # Skip this series if date parsing fails
2582
-
2583
- # Coerce value to numeric, handle potential errors
2584
- df["value"] = pd.to_numeric(df["value"], errors="coerce")
2585
-
2586
- # Drop rows where date or value parsing failed
2587
- df.dropna(subset=["date", "value"], inplace=True)
2588
-
2589
- if df.empty:
2590
- print(
2591
- f"Warning: No valid data points after processing for CDID: {cdid}",
2592
- )
2593
- continue
2594
-
2595
- df.rename(columns={"value": series_name}, inplace=True)
2596
-
2597
- # Combine data
2598
- df_subset = df.loc[:, ["date", series_name]].reset_index(
2599
- drop=True,
2600
- ) # Explicitly select columns
2601
- if combined_df.empty:
2602
- combined_df = df_subset
2603
- else:
2604
- # Use outer merge to keep all dates, sort afterwards
2605
- combined_df = pd.merge(
2606
- combined_df,
2607
- df_subset,
2608
- on="date",
2609
- how="outer",
2610
- )
2611
-
2612
- except requests.exceptions.RequestException as e:
2613
- print(f"Error fetching data for CDID {cdid}: {e}")
2614
- except (KeyError, ValueError, TypeError) as e: # Added TypeError
2615
- print(f"Error processing data for CDID {cdid}: {e}")
2616
- except Exception as e: # Catch unexpected errors
2617
- print(f"An unexpected error occurred for CDID {cdid}: {e}")
2618
-
2619
- if not combined_df.empty:
2620
- # Sort by date after merging to ensure correct forward fill
2621
- combined_df.sort_values(by="date", inplace=True)
2622
- combined_df.reset_index(drop=True, inplace=True)
2623
-
2624
- # Create a complete daily date range
2625
- min_date = combined_df["date"].min()
2626
- # Ensure max_date is timezone-naive if min_date is, or consistent otherwise
2627
- max_date = pd.Timestamp(
2628
- datetime.today().date(),
2629
- ) # Use today's date, timezone-naive
2630
-
2631
- if pd.isna(min_date):
2632
- print("Error: Minimum date is NaT, cannot create date range.")
2633
- return pd.DataFrame()
2634
-
2635
- # Make sure min_date is not NaT before creating the range
2636
- date_range = pd.date_range(start=min_date, end=max_date, freq="D")
2637
- daily_df = pd.DataFrame(date_range, columns=["date"])
2638
-
2639
- # Merge with original data and forward fill
2640
- daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
2641
- daily_df = daily_df.ffill()
2642
-
2643
- # Drop rows before the first valid data point after ffill
2644
- first_valid_index = daily_df.dropna(
2645
- subset=daily_df.columns.difference(["date"]),
2646
- ).index.min()
2647
- if pd.notna(first_valid_index):
2648
- daily_df = daily_df.loc[first_valid_index:]
2649
- else:
2650
- print("Warning: No valid data points found after forward filling.")
2651
- return pd.DataFrame() # Return empty if ffill results in no data
2652
-
2653
- # Aggregate to weekly frequency
2654
- # Ensure 'date' column is datetime type before dt accessor
2655
- daily_df["date"] = pd.to_datetime(daily_df["date"])
2656
- daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta(
2657
- (daily_df["date"].dt.weekday - week_start + 7) % 7,
2658
- unit="D",
2659
- ) # snap each date back to the chosen week-start day
2660
- # Group by week_commencing and calculate mean for numeric columns only
2661
- weekly_df = (
2662
- daily_df.groupby("week_commencing")
2663
- .mean(numeric_only=True)
2664
- .reset_index()
2665
- )
2666
-
2667
- def clean_column_name(name):
2668
- # Remove content within parentheses
2669
- name = re.sub(r"\(.*?\)", "", name)
2670
-
2671
- # Special handling for ANY CPI items (not just CPI INDEX)
2672
- if "CPI" in name.upper():
2673
- # Extract the description part after the colon for CPI items
2674
- if ":" in name:
2675
- parts = name.split(":")
2676
- if len(parts) >= 2:
2677
- # Take the description part (usually the second part)
2678
- description = parts[1].strip()
2679
- # Remove any remaining colons and everything after
2680
- description = description.split(":")[0].strip()
2681
- name = f"CPI {description}"
2682
-
2683
- # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
2684
- name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
2685
-
2686
- else:
2687
- # For non-CPI items, take only the part before the first colon
2688
- name = re.split(r":", name)[0]
2689
- # Remove all digits for non-CPI items too
2690
- name = re.sub(r"\d+", "", name)
2691
-
2692
- # Remove year references like "2015=100"
2693
- name = re.sub(r"\d{4}=\d+", "", name)
2694
-
2695
- # Remove specific words case-insensitively
2696
- name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
2697
-
2698
- # Replace "%" with the word "percent"
2699
- name = re.sub(r"%", "percent", name)
2700
-
2701
- # Remove non-alphanumeric characters (except underscore and space)
2702
- name = re.sub(r"[^\w\s]", "", name)
2703
-
2704
- # Replace spaces with underscores
2705
- name = name.strip().replace(" ", "_")
2706
-
2707
- # Replace multiple underscores with a single one
2708
- name = re.sub(r"_+", "_", name)
2709
-
2710
- # Remove leading/trailing underscores
2711
- name = name.strip("_")
2712
-
2713
- # Truncate very long names (optional)
2714
- if len(name) > 50:
2715
- words = name.split("_")
2716
- # Keep first few meaningful words
2717
- name = "_".join(words[:4])
2718
-
2719
- return f"macro_{name.lower()}_uk"
2720
-
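Two hand-traced examples of clean_column_name (the titles are hypothetical; the second matches the 'macro_retail_sales_uk' example in the docstring above):

    # "CPI INDEX 00: ALL ITEMS 2015=100"           -> "macro_cpi_all_items_uk"
    # "Retail Sales Index: All Retailing (tonnes)" -> "macro_retail_sales_uk"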
2721
- # Apply cleaning function to relevant columns
2722
- weekly_df.columns = [
2723
- clean_column_name(col) if col != "week_commencing" else col
2724
- for col in weekly_df.columns
2725
- ]
2726
- weekly_df.rename(
2727
- columns={"week_commencing": "OBS"},
2728
- inplace=True,
2729
- ) # Rename week commencing col
2730
-
2731
- # Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
2732
- # Consider if 0 is the appropriate fill value for your use case
2733
- # weekly_df = weekly_df.fillna(0)
2734
-
2735
- # Get only the data columns (excluding OBS)
2736
- data_columns = [col for col in weekly_df.columns if col != "OBS"]
2737
-
2738
- new_columns = ["OBS"]
2739
- for i, col in enumerate(data_columns):
2740
- if i < len(final_cdid_list):
2741
- new_columns.append(f"{col}_{final_cdid_list[i]}")
2742
- else:
2743
- new_columns.append(col) # Keep original if no matching CDID
2744
-
2745
- # Apply the new column names to the DataFrame
2746
- weekly_df.columns = new_columns
2747
-
2748
- return weekly_df
2749
- print("No data successfully fetched or processed.")
2750
- return pd.DataFrame()
2751
-
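One observation on the final _{CDID} suffixing in pull_macro_ons_uk: suffixes are assigned by position (data_columns[i] gets final_cdid_list[i]), so if any CDID is skipped during fetching, the suffixes can drift away from the series they describe. A minimal sketch of a name-based alternative, assuming a title-to-CDID map were recorded inside the fetch loop (cdid_by_title and cleaned_to_title do not exist in the code above; they are illustrative):

    cdid_by_title = {}     # would be filled in the fetch loop: cdid_by_title[series_name] = cdid
    cleaned_to_title = {}  # would be filled during cleaning: cleaned_to_title[clean_name] = series_name

    def suffix_with_cdid(col: str) -> str:
        # Look the CDID up by the column's original title instead of its position.
        cdid = cdid_by_title.get(cleaned_to_title.get(col, ""), "")
        return f"{col}_{cdid}" if cdid else col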
2752
- def pull_yfinance(self, tickers=None, week_start_day="mon"):
2753
- """
2754
- Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
2755
- aggregates it to weekly averages, and renames variables.
2756
-
2757
- Parameters
2758
- ----------
2759
- tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
2760
- week_start_day (str): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
2761
-
2762
- Returns
2763
- -------
2764
- pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
2765
- and aggregated stock data for the specified tickers, with NaN values filled with 0.
2766
-
2767
- """
2768
- # Define default tickers
2769
- default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
2770
-
2771
- # Combine default tickers with additional ones
2772
- if tickers is None:
2773
- tickers = []
2774
- tickers = list(set(default_tickers + tickers)) # Ensure no duplicates
2775
-
2776
- # Automatically set end_date to today
2777
- end_date = datetime.today().strftime("%Y-%m-%d")
2778
-
2779
- # Mapping week start day to pandas weekday convention
2780
- days_map = {
2781
- "mon": 0,
2782
- "tue": 1,
2783
- "wed": 2,
2784
- "thu": 3,
2785
- "fri": 4,
2786
- "sat": 5,
2787
- "sun": 6,
2788
- }
2789
- if week_start_day not in days_map:
2790
- raise ValueError(
2791
- "Invalid week start day. Choose from: " + ", ".join(days_map.keys()),
2792
- )
2793
- week_start = days_map[week_start_day]
2794
-
2795
- # Fetch data for all tickers without specifying a start date to get all available data
2796
- data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
2797
-
2798
- # Process the data
2799
- combined_df = pd.DataFrame()
2800
- for ticker in tickers:
2801
- try:
2802
- # Extract the ticker's data
2803
- ticker_data = data[ticker] if len(tickers) > 1 else data
2804
- ticker_data = ticker_data.reset_index()
2805
-
2806
- # Ensure necessary columns are present
2807
- if "Close" not in ticker_data.columns:
2808
- raise ValueError(
2809
- f"Ticker {ticker} does not have 'Close' price data.",
2810
- )
2811
-
2812
- # Keep only relevant columns
2813
- ticker_data = ticker_data[["Date", "Close"]]
2814
- ticker_data.rename(columns={"Close": ticker}, inplace=True)
2815
-
2816
- # Merge data
2817
- if combined_df.empty:
2818
- combined_df = ticker_data
2819
- else:
2820
- combined_df = pd.merge(
2821
- combined_df,
2822
- ticker_data,
2823
- on="Date",
2824
- how="outer",
2825
- )
2826
-
2827
- except KeyError:
2828
- print(f"Data for ticker {ticker} not available.")
2829
- except Exception as e:
2830
- print(f"Error processing ticker {ticker}: {e}")
2831
-
2832
- if not combined_df.empty:
2833
- # Convert to daily frequency
2834
- combined_df["Date"] = pd.to_datetime(combined_df["Date"])
2835
- combined_df.set_index("Date", inplace=True)
2836
-
2837
- # Fill missing dates
2838
- min_date = combined_df.index.min()
2839
- max_date = combined_df.index.max()
2840
- daily_index = pd.date_range(start=min_date, end=max_date, freq="D")
2841
- combined_df = combined_df.reindex(daily_index)
2842
- combined_df.index.name = "Date"
2843
- combined_df = combined_df.ffill()
2844
-
2845
- # Aggregate to weekly frequency
2846
- combined_df["OBS"] = combined_df.index - pd.to_timedelta(
2847
- (combined_df.index.weekday - week_start) % 7,
2848
- unit="D",
2849
- )
2850
- weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
2851
-
2852
- # Fill NaN values with 0
2853
- weekly_df = weekly_df.fillna(0)
2854
-
2855
- # Clean column names
2856
- def clean_column_name(name):
2857
- name = re.sub(r"[^\w\s]", "", name)
2858
- return f"macro_{name.lower()}"
2859
-
2860
- weekly_df.columns = [
2861
- clean_column_name(col) if col != "OBS" else col
2862
- for col in weekly_df.columns
2863
- ]
2864
-
2865
- return weekly_df
2866
-
2867
- print("No data available to process.")
2868
- return pd.DataFrame()
2869
-
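For orientation, the column cleaning in pull_yfinance strips non-word characters and lower-cases, so the default tickers come out roughly as below (a sketch of the same rule, not a call into the class):

    import re

    def clean_column_name(name: str) -> str:
        cleaned = re.sub(r"[^\w\s]", "", name)  # same rule as above
        return f"macro_{cleaned.lower()}"

    for ticker in ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]:
        print(ticker, "->", clean_column_name(ticker))
    # ^FTSE -> macro_ftse, GBPUSD=X -> macro_gbpusdx, GBPEUR=X -> macro_gbpeurx, ^GSPC -> macro_gspc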
2870
- def pull_sports_events(self, start_date="2020-01-01", week_commencing="mon"):
2871
- """
2872
- Combines scraping logic for:
2873
- - UEFA Champions League and NFL from TheSportsDB (website-scraping approach)
2874
- - FIFA World Cup, UEFA Euro, Rugby World Cup, Six Nations (via TheSportsDB API)
2875
-
2876
- Returns a single merged DataFrame with all event dummy variables.
2877
- """
2878
-
2879
- ############################################################
2880
- # 1) SCRAPE UEFA CHAMPIONS LEAGUE & NFL (site-scraping helper)
2881
- ############################################################
2882
- def scrape_sports_events(
2883
- start_date=start_date,
2884
- week_commencing=week_commencing,
2885
- ):
2886
- sports = {
2887
- "uefa_champions_league": {
2888
- "league_id": "4480",
2889
- "seasons_url": "https://www.thesportsdb.com/league/4480-UEFA-Champions-League?a=1#allseasons",
2890
- "season_url_template": "https://www.thesportsdb.com/season/4480-UEFA-Champions-League/{season}&all=1&view=",
2891
- "round_filters": ["quarter", "semi", "final"],
2892
- },
2893
- "nfl": {
2894
- "league_id": "4391",
2895
- "seasons_url": "https://www.thesportsdb.com/league/4391-NFL?a=1#allseasons",
2896
- "season_url_template": "https://www.thesportsdb.com/season/4391-NFL/{season}&all=1&view=",
2897
- "round_filters": ["quarter", "semi", "final"],
2898
- },
2899
- }
2900
-
2901
- headers = {"User-Agent": "Mozilla/5.0"}
2902
- start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
2903
-
2904
- # Create a full date range DataFrame
2905
- full_date_range = pd.date_range(
2906
- start=start_date,
2907
- end=pd.to_datetime("today"),
2908
- )
2909
- time_series_df = pd.DataFrame({"date": full_date_range})
2910
- time_series_df["seas_uefa_champions_league"] = 0
2911
- time_series_df["seas_nfl"] = 0
2912
-
2913
- for sport, details in sports.items():
2914
- # Get available seasons
2915
- response = requests.get(details["seasons_url"], headers=headers)
2916
- if response.status_code != 200:
2917
- continue # Skip this sport if the request fails
2918
-
2919
- soup = BeautifulSoup(response.text, "html.parser")
2920
-
2921
- # Extract season names
2922
- seasons = []
2923
- for link in soup.find_all("a", href=True):
2924
- href = link["href"]
2925
- if "season" in href and sport.replace("_", "-") in href.lower():
2926
- season_name = href.split("/")[-1] # e.g. "2023-2024"
2927
- try:
2928
- season_start_year = int(season_name.split("-")[0])
2929
- season_start_date = datetime(season_start_year, 1, 1)
2930
- if season_start_date >= start_date_dt:
2931
- seasons.append(season_name)
2932
- except ValueError:
2933
- continue
2934
-
2935
- # Scrape matches for filtered seasons
2936
- filtered_matches = []
2937
- for season in seasons:
2938
- season_url = details["season_url_template"].format(season=season)
2939
- season_response = requests.get(season_url, headers=headers)
2940
- if season_response.status_code != 200:
2941
- continue
2942
-
2943
- season_soup = BeautifulSoup(season_response.text, "html.parser")
2944
- for row in season_soup.find_all("tr"):
2945
- cols = row.find_all("td")
2946
- if len(cols) >= 5:
2947
- match_date = cols[0].text.strip()
2948
- round_name = cols[1].text.strip().lower()
2949
- try:
2950
- match_date_dt = datetime.strptime(
2951
- match_date,
2952
- "%d %b %y",
2953
- )
2954
- if match_date_dt >= start_date_dt and any(
2955
- r in round_name for r in details["round_filters"]
2956
- ):
2957
- filtered_matches.append(match_date_dt)
2958
- except ValueError:
2959
- continue
2960
-
2961
- # Convert matches into time series format
2962
- df_sport = pd.DataFrame({"date": filtered_matches})
2963
- if df_sport.empty:
2964
- continue
2965
-
2966
- col_name = (
2967
- "seas_nfl" if sport == "nfl" else "seas_uefa_champions_league"
2968
- )
2969
- time_series_df.loc[
2970
- time_series_df["date"].isin(df_sport["date"]),
2971
- col_name,
2972
- ] = 1
2973
-
2974
- # Aggregate by week commencing
2975
- day_offsets = {
2976
- "mon": "W-MON",
2977
- "tue": "W-TUE",
2978
- "wed": "W-WED",
2979
- "thu": "W-THU",
2980
- "fri": "W-FRI",
2981
- "sat": "W-SAT",
2982
- "sun": "W-SUN",
2983
- }
2984
- if week_commencing.lower() not in day_offsets:
2985
- raise ValueError(
2986
- f"Invalid week_commencing value: {week_commencing}. Must be one of {list(day_offsets.keys())}.",
2987
- )
2988
-
2989
- time_series_df = (
2990
- time_series_df.set_index("date")
2991
- .resample(day_offsets[week_commencing.lower()])
2992
- .max()
2993
- .reset_index()
2994
- )
2995
-
2996
- time_series_df.rename(columns={"date": "OBS"}, inplace=True)
2997
- time_series_df.fillna(0, inplace=True)
2998
-
2999
- return time_series_df
3000
-
3001
- ############################################################
3002
- # 2) FETCH FIFA WC, UEFA EURO, RUGBY, SIX NATIONS (API helper)
3003
- ############################################################
3004
- def fetch_events(start_date=start_date, week_commencing=week_commencing):
3005
- # Initialize date range
3006
- start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
3007
- end_date_obj = datetime.today()
3008
- date_range = pd.date_range(start=start_date_obj, end=end_date_obj)
3009
- df = pd.DataFrame({"OBS": date_range}).set_index("OBS")
3010
-
3011
- # Define columns for sports
3012
- event_columns = {
3013
- "seas_fifa_world_cup": {
3014
- "league_id": 4429,
3015
- "start_year": 1950,
3016
- "interval": 4,
3017
- },
3018
- "seas_uefa_european_championship": {
3019
- "league_id": 4502,
3020
- "start_year": 1960,
3021
- "interval": 4,
3022
- "extra_years": [2021],
3023
- },
3024
- "seas_rugby_world_cup": {
3025
- "league_id": 4574,
3026
- "start_year": 1987,
3027
- "interval": 4,
3028
- },
3029
- "seas_six_nations": {
3030
- "league_id": 4714,
3031
- "start_year": 2000,
3032
- "interval": 1,
3033
- },
3034
- }
3035
-
3036
- # Initialize columns
3037
- for col in event_columns:
3038
- df[col] = 0
3039
-
3040
- def fetch_league_events(
3041
- league_id,
3042
- column_name,
3043
- start_year,
3044
- interval,
3045
- extra_years=None,
3046
- ):
3047
- extra_years = extra_years or []
3048
- # Fetch seasons
3049
- seasons_url = f"https://www.thesportsdb.com/api/v1/json/3/search_all_seasons.php?id={league_id}"
3050
- seasons_response = requests.get(seasons_url)
3051
- if seasons_response.status_code != 200:
3052
- return # Skip on failure
3053
-
3054
- seasons_data = seasons_response.json().get("seasons", [])
3055
- for season in seasons_data:
3056
- season_name = season.get("strSeason", "")
3057
- if not season_name.isdigit():
3058
- continue
3059
-
3060
- year = int(season_name)
3061
- # Check if the year is valid for this competition
3062
- if year in extra_years or (
3063
- year >= start_year and (year - start_year) % interval == 0
3064
- ):
3065
- # Fetch events
3066
- events_url = f"https://www.thesportsdb.com/api/v1/json/3/eventsseason.php?id={league_id}&s={season_name}"
3067
- events_response = requests.get(events_url)
3068
- if events_response.status_code != 200:
3069
- continue
3070
-
3071
- events_data = events_response.json().get("events", [])
3072
- for event in events_data:
3073
- event_date_str = event.get("dateEvent")
3074
- if event_date_str:
3075
- event_date = datetime.strptime(
3076
- event_date_str,
3077
- "%Y-%m-%d",
3078
- )
3079
- if event_date in df.index:
3080
- df.loc[event_date, column_name] = 1
3081
-
3082
- # Fetch events for all defined leagues
3083
- for column_name, params in event_columns.items():
3084
- fetch_league_events(
3085
- league_id=params["league_id"],
3086
- column_name=column_name,
3087
- start_year=params["start_year"],
3088
- interval=params["interval"],
3089
- extra_years=params.get("extra_years", []),
3090
- )
3091
-
3092
- # Resample by week
3093
- day_offsets = {
3094
- "mon": "W-MON",
3095
- "tue": "W-TUE",
3096
- "wed": "W-WED",
3097
- "thu": "W-THU",
3098
- "fri": "W-FRI",
3099
- "sat": "W-SAT",
3100
- "sun": "W-SUN",
3101
- }
3102
-
3103
- if week_commencing.lower() not in day_offsets:
3104
- raise ValueError(
3105
- f"Invalid week_commencing value: {week_commencing}. "
3106
- f"Must be one of {list(day_offsets.keys())}.",
3107
- )
3108
-
3109
- df = df.resample(day_offsets[week_commencing.lower()]).max()
3110
- df = df.reset_index()
3111
- return df
3112
-
3113
- ###################################################
3114
- # 3) CALL BOTH, THEN MERGE ON "OBS" & FILL WITH 0s
3115
- ###################################################
3116
- df_uefa_nfl = scrape_sports_events(start_date, week_commencing)
3117
- df_other_events = fetch_events(start_date, week_commencing)
3118
-
3119
- # Merge on "OBS" column (outer join to preserve all dates in range)
3120
- final_df = pd.merge(df_uefa_nfl, df_other_events, on="OBS", how="outer")
3121
-
3122
- # Fill any NaNs with 0 for event columns
3123
- # (Only fill numeric columns or everything except 'OBS')
3124
- for col in final_df.columns:
3125
- if col != "OBS":
3126
- final_df[col] = final_df[col].fillna(0)
3127
-
3128
- # Sort by date just in case
3129
- final_df.sort_values(by="OBS", inplace=True)
3130
- final_df.reset_index(drop=True, inplace=True)
3131
-
3132
- return final_df
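One note on the weekly resampling used by both sports helpers: if I read the pandas defaults correctly, anchored weekly frequencies such as 'W-MON' are right-closed and right-labelled, so the OBS produced here is the anchor day that ends each bin rather than a week-commencing date, which differs from the modulo-based snapping used elsewhere in this module. A minimal sketch of the difference, under that assumption:

    import pandas as pd

    s = pd.Series(1, index=pd.to_datetime(["2024-05-14"]))                  # a Tuesday
    print(s.resample("W-MON").max().index[0])                               # 2024-05-20, the following Monday
    print(s.index[0] - pd.Timedelta(days=(s.index[0].weekday() - 0) % 7))   # 2024-05-13, the preceding Monday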