imsciences 0.5.4.7__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +4 -1
- imsciences/datafunctions-IMS-24Ltp-3.py +2711 -0
- imsciences/datafunctions.py +2842 -170
- imsciences/datapull.py +374 -0
- imsciences/geo.py +195 -0
- imsciences/mmm.py +1415 -0
- imsciences/pull.py +1483 -0
- imsciences/unittesting.py +1064 -0
- imsciences/vis.py +196 -0
- imsciences-0.9.3.dist-info/LICENSE.txt +21 -0
- imsciences-0.9.3.dist-info/METADATA +330 -0
- imsciences-0.9.3.dist-info/PKG-INFO-IMS-24Ltp-3 +24 -0
- imsciences-0.9.3.dist-info/RECORD +22 -0
- {imsciences-0.5.4.7.dist-info → imsciences-0.9.3.dist-info}/WHEEL +1 -1
- imsciences-0.5.4.7.dist-info/METADATA +0 -95
- imsciences-0.5.4.7.dist-info/RECORD +0 -13
- {imsciences-0.5.4.7.dist-info → imsciences-0.9.3.dist-info}/top_level.txt +0 -0
imsciences/pull.py
ADDED
|
@@ -0,0 +1,1483 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import re
|
|
4
|
+
from fredapi import Fred
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime, timedelta
|
|
7
|
+
from io import StringIO
|
|
8
|
+
import requests
|
|
9
|
+
import xml.etree.ElementTree as ET
|
|
10
|
+
from bs4 import BeautifulSoup
|
|
11
|
+
import yfinance as yf
|
|
12
|
+
import holidays
|
|
13
|
+
from dateutil.easter import easter
|
|
14
|
+
|
|
15
|
+
from imsciences.mmm import dataprocessing
|
|
16
|
+
|
|
17
|
+
ims_proc = dataprocessing()
|
|
18
|
+
|
|
19
|
+
class datapull:
|
|
20
|
+
|
|
21
|
+
def help(self):
|
|
22
|
+
print("This is the help section. The functions in the package are as follows:")
|
|
23
|
+
|
|
24
|
+
print("\n1. pull_fred_data")
|
|
25
|
+
print(" - Description: Get data from FRED by using series id tokens.")
|
|
26
|
+
print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
|
|
27
|
+
print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
|
|
28
|
+
|
|
29
|
+
print("\n2. pull_boe_data")
|
|
30
|
+
print(" - Description: Fetch and process Bank of England interest rate data.")
|
|
31
|
+
print(" - Usage: pull_boe_data(week_commencing)")
|
|
32
|
+
print(" - Example: pull_boe_data('mon')")
|
|
33
|
+
|
|
34
|
+
print("\n3. pull_oecd")
|
|
35
|
+
print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
|
|
36
|
+
print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')")
|
|
37
|
+
print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
|
|
38
|
+
|
|
39
|
+
print("\n4. get_google_mobility_data")
|
|
40
|
+
print(" - Description: Fetch Google Mobility data for the specified country.")
|
|
41
|
+
print(" - Usage: get_google_mobility_data(country, wc)")
|
|
42
|
+
print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
|
|
43
|
+
|
|
44
|
+
print("\n5. pull_seasonality")
|
|
45
|
+
print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
|
|
46
|
+
print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
|
|
47
|
+
print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
|
|
48
|
+
|
|
49
|
+
print("\n6. pull_weather")
|
|
50
|
+
print(" - Description: Fetch and process historical weather data for the specified country.")
|
|
51
|
+
print(" - Usage: pull_weather(week_commencing, country)")
|
|
52
|
+
print(" - Example: pull_weather('mon', 'GBR')")
|
|
53
|
+
|
|
54
|
+
print("\n7. pull_macro_ons_uk")
|
|
55
|
+
print(" - Description: Fetch and process time series data from the Beta ONS API.")
|
|
56
|
+
print(" - Usage: pull_macro_ons_uk(aditional_list, week_commencing, sector)")
|
|
57
|
+
print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
|
|
58
|
+
|
|
59
|
+
print("\n8. pull_yfinance")
|
|
60
|
+
print(" - Description: Fetch and process time series data from the Beta ONS API.")
|
|
61
|
+
print(" - Usage: pull_yfinance(tickers, week_start_day)")
|
|
62
|
+
print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
|
|
63
|
+
|
|
64
|
+
############################################################### MACRO ##########################################################################
|
|
65
|
+
|
|
66
|
+
def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
|
|
67
|
+
'''
|
|
68
|
+
Parameters
|
|
69
|
+
----------
|
|
70
|
+
week_commencing : str
|
|
71
|
+
specify the day for the week commencing, the default is 'sun' (e.g., 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
|
|
72
|
+
|
|
73
|
+
series_id_list : list[str]
|
|
74
|
+
provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
|
|
75
|
+
["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
|
|
76
|
+
|
|
77
|
+
Returns
|
|
78
|
+
----------
|
|
79
|
+
pd.DataFrame
|
|
80
|
+
Return a data frame with FRED data according to the series IDs provided
|
|
81
|
+
'''
|
|
82
|
+
# Fred API
|
|
83
|
+
fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
|
|
84
|
+
|
|
85
|
+
# Fetch the metadata for each series to get the full names
|
|
86
|
+
series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}
|
|
87
|
+
|
|
88
|
+
# Download data from series id list
|
|
89
|
+
fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}
|
|
90
|
+
|
|
91
|
+
# Data processing
|
|
92
|
+
date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
|
|
93
|
+
fred_series_df = pd.DataFrame(date_range)
|
|
94
|
+
|
|
95
|
+
for series_id, series_data in fred_series.items():
|
|
96
|
+
series_data = series_data.reset_index()
|
|
97
|
+
series_data.columns = ['OBS', series_names[series_id]] # Use the series name as the column header
|
|
98
|
+
fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')
|
|
99
|
+
|
|
100
|
+
# Handle duplicate columns
|
|
101
|
+
for col in fred_series_df.columns:
|
|
102
|
+
if '_x' in col:
|
|
103
|
+
base_col = col.replace('_x', '')
|
|
104
|
+
fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
|
|
105
|
+
fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)
|
|
106
|
+
|
|
107
|
+
# Ensure sum_columns are present in the DataFrame
|
|
108
|
+
sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]
|
|
109
|
+
|
|
110
|
+
# Aggregate results by week
|
|
111
|
+
fred_df_final = ims_proc.aggregate_daily_to_wc_wide(df=fred_series_df,
|
|
112
|
+
date_column="OBS",
|
|
113
|
+
group_columns=[],
|
|
114
|
+
sum_columns=sum_columns,
|
|
115
|
+
wc=week_commencing,
|
|
116
|
+
aggregation="average")
|
|
117
|
+
|
|
118
|
+
# Remove anything after the instance of any ':' in the column names and rename, except for 'OBS'
|
|
119
|
+
fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]
|
|
120
|
+
|
|
121
|
+
return fred_df_final
|
|
122
|
+
|
|
123
|
+
def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
|
|
124
|
+
"""
|
|
125
|
+
Fetch and process Bank of England interest rate data.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
week_commencing (str): The starting day of the week for aggregation.
|
|
129
|
+
Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
|
|
130
|
+
Default is "mon".
|
|
131
|
+
max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
|
|
132
|
+
delay (int): Delay in seconds between retry attempts. Default is 5.
|
|
133
|
+
|
|
134
|
+
Returns:
|
|
135
|
+
pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
|
|
136
|
+
The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
|
|
137
|
+
and 'macro_boe_intr_rate' contains the average interest rate for the week.
|
|
138
|
+
"""
|
|
139
|
+
# Week commencing dictionary
|
|
140
|
+
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
141
|
+
|
|
142
|
+
# URL of the Bank of England data page
|
|
143
|
+
url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
|
|
144
|
+
|
|
145
|
+
# Retry logic for HTTP request
|
|
146
|
+
for attempt in range(max_retries):
|
|
147
|
+
try:
|
|
148
|
+
# Set up headers to mimic a browser request
|
|
149
|
+
headers = {
|
|
150
|
+
"User-Agent": (
|
|
151
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
152
|
+
"Chrome/91.0.4472.124 Safari/537.36"
|
|
153
|
+
)
|
|
154
|
+
}
|
|
155
|
+
response = requests.get(url, headers=headers)
|
|
156
|
+
response.raise_for_status() # Raise an exception for HTTP errors
|
|
157
|
+
break
|
|
158
|
+
except requests.exceptions.RequestException as e:
|
|
159
|
+
print(f"Attempt {attempt + 1} failed: {e}")
|
|
160
|
+
if attempt < max_retries - 1:
|
|
161
|
+
time.sleep(delay)
|
|
162
|
+
else:
|
|
163
|
+
raise
|
|
164
|
+
|
|
165
|
+
# Parse the HTML page
|
|
166
|
+
soup = BeautifulSoup(response.content, "html.parser")
|
|
167
|
+
|
|
168
|
+
# Find the table on the page
|
|
169
|
+
table = soup.find("table") # Locate the first table
|
|
170
|
+
table_html = str(table) # Convert table to string
|
|
171
|
+
df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
|
|
172
|
+
|
|
173
|
+
# Rename and clean up columns
|
|
174
|
+
df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
|
|
175
|
+
df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
|
|
176
|
+
df.sort_values("OBS", inplace=True)
|
|
177
|
+
|
|
178
|
+
# Create a daily date range
|
|
179
|
+
date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
|
|
180
|
+
df_daily = pd.DataFrame(date_range, columns=["OBS"])
|
|
181
|
+
|
|
182
|
+
# Adjust each date to the specified week commencing day
|
|
183
|
+
df_daily["Week_Commencing"] = df_daily["OBS"].apply(
|
|
184
|
+
lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
# Merge and forward-fill missing rates
|
|
188
|
+
df_daily = df_daily.merge(df, on="OBS", how="left")
|
|
189
|
+
df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
|
|
190
|
+
|
|
191
|
+
# Group by week commencing and calculate the average rate
|
|
192
|
+
df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
|
|
193
|
+
df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
|
|
194
|
+
df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
|
|
195
|
+
|
|
196
|
+
return df_final
|
|
197
|
+
|
|
198
|
+
def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
|
|
199
|
+
"""
|
|
200
|
+
Fetch and process time series data from the OECD API.
|
|
201
|
+
|
|
202
|
+
Args:
|
|
203
|
+
country (list): A string containing a 3-letter code the of country of interest (E.g: "GBR", "FRA", "USA", "DEU")
|
|
204
|
+
week_commencing (str): The starting day of the week for aggregation.
|
|
205
|
+
Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
|
|
206
|
+
start_date (str): Dataset start date in the format "YYYY-MM-DD"
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
|
|
210
|
+
commencing dates, and other columns contain the aggregated time series values.
|
|
211
|
+
"""
|
|
212
|
+
|
|
213
|
+
def parse_quarter(date_str):
|
|
214
|
+
"""Parses a string in 'YYYY-Q#' format into a datetime object."""
|
|
215
|
+
year, quarter = date_str.split('-')
|
|
216
|
+
quarter_number = int(quarter[1])
|
|
217
|
+
month = (quarter_number - 1) * 3 + 1
|
|
218
|
+
return pd.Timestamp(f"{year}-{month:02d}-01")
|
|
219
|
+
|
|
220
|
+
# Generate a date range from 1950-01-01 to today
|
|
221
|
+
date_range = pd.date_range(start=start_date, end=datetime.today(), freq='D')
|
|
222
|
+
|
|
223
|
+
url_details = [
|
|
224
|
+
["BCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_business_confidence_index"],
|
|
225
|
+
["CCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_consumer_confidence_index"],
|
|
226
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA._T.N.GY", "macro_cpi_total"],
|
|
227
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP041T043.N.GY", "macro_cpi_housing"],
|
|
228
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP01.N.GY", "macro_cpi_food"],
|
|
229
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP045_0722.N.GY", "macro_cpi_energy"],
|
|
230
|
+
["UNE_LF_M", "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,", "._Z.Y._T.Y_GE15.", "macro_unemployment_rate"],
|
|
231
|
+
["EAR", "SDD.TPS,DSD_EAR@DF_HOU_EAR,", ".Y..S1D", "macro_private_hourly_earnings"],
|
|
232
|
+
["RHP", "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0", "", "macro_real_house_prices"],
|
|
233
|
+
["PRVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX.C..", "macro_manufacturing_production_volume"],
|
|
234
|
+
["TOVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX...", "macro_retail_trade_volume"],
|
|
235
|
+
["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
|
|
236
|
+
["IRLT", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_long_term_interest_rate"],
|
|
237
|
+
["B1GQ", "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1", "._Z....GY.T0102", "macro_gdp_growth_yoy"]
|
|
238
|
+
]
|
|
239
|
+
|
|
240
|
+
# Create empty final dataframe
|
|
241
|
+
oecd_df_final = pd.DataFrame()
|
|
242
|
+
|
|
243
|
+
daily_df = pd.DataFrame({'OBS': date_range})
|
|
244
|
+
value_columns = []
|
|
245
|
+
|
|
246
|
+
# Iterate for each variable of interest
|
|
247
|
+
for series_details in url_details:
|
|
248
|
+
series = series_details[0]
|
|
249
|
+
dataset_id = series_details[1]
|
|
250
|
+
filter = series_details[2]
|
|
251
|
+
col_name = series_details[3]
|
|
252
|
+
|
|
253
|
+
# check if request was successful and determine the most granular data available
|
|
254
|
+
for freq in ['M', 'Q', 'A']:
|
|
255
|
+
|
|
256
|
+
if series in ["UNE_LF_M", "EAR"]:
|
|
257
|
+
data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
|
|
258
|
+
elif series in ["B1GQ"]:
|
|
259
|
+
data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
|
|
260
|
+
else:
|
|
261
|
+
data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
|
|
262
|
+
|
|
263
|
+
# Make the request to the OECD API for data
|
|
264
|
+
data_response = requests.get(data_url)
|
|
265
|
+
|
|
266
|
+
# Check if the request was successful
|
|
267
|
+
if data_response.status_code != 200:
|
|
268
|
+
print(f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}")
|
|
269
|
+
url_test = False
|
|
270
|
+
continue
|
|
271
|
+
else:
|
|
272
|
+
url_test = True
|
|
273
|
+
break
|
|
274
|
+
|
|
275
|
+
# get data for the next variable if url doesn't exist
|
|
276
|
+
if url_test is False:
|
|
277
|
+
continue
|
|
278
|
+
|
|
279
|
+
root = ET.fromstring(data_response.content)
|
|
280
|
+
|
|
281
|
+
# Define namespaces if necessary (the namespace is included in the tags)
|
|
282
|
+
namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}
|
|
283
|
+
|
|
284
|
+
# Lists to store the data
|
|
285
|
+
dates = []
|
|
286
|
+
values = []
|
|
287
|
+
|
|
288
|
+
# Iterate over all <Obs> elements and extract date and value
|
|
289
|
+
for obs in root.findall('.//generic:Obs', namespaces):
|
|
290
|
+
|
|
291
|
+
# Extracting the time period (date)
|
|
292
|
+
time_period = obs.find('.//generic:ObsDimension', namespaces).get('value')
|
|
293
|
+
|
|
294
|
+
# Extracting the observation value
|
|
295
|
+
value = obs.find('.//generic:ObsValue', namespaces).get('value')
|
|
296
|
+
|
|
297
|
+
# Storing the data
|
|
298
|
+
if time_period and value:
|
|
299
|
+
dates.append(time_period)
|
|
300
|
+
values.append(float(value)) # Convert value to float
|
|
301
|
+
|
|
302
|
+
# Add variable names that were found to a list
|
|
303
|
+
value_columns.append(col_name)
|
|
304
|
+
|
|
305
|
+
# Creating a DataFrame
|
|
306
|
+
data = pd.DataFrame({'OBS': dates, col_name: values})
|
|
307
|
+
|
|
308
|
+
# Convert date strings into datetime format
|
|
309
|
+
if freq == 'Q':
|
|
310
|
+
data['OBS'] = data['OBS'].apply(parse_quarter)
|
|
311
|
+
else:
|
|
312
|
+
# Display the DataFrame
|
|
313
|
+
data['OBS'] = data['OBS'].apply(lambda x: datetime.strptime(x, '%Y-%m'))
|
|
314
|
+
|
|
315
|
+
# Sort data by chronological order
|
|
316
|
+
data.sort_values(by='OBS', inplace=True)
|
|
317
|
+
|
|
318
|
+
# Merge the data based on the observation date
|
|
319
|
+
daily_df = pd.merge_asof(daily_df, data[['OBS', col_name]], on='OBS', direction='backward')
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# Ensure columns are numeric
|
|
323
|
+
for col in value_columns:
|
|
324
|
+
if col in daily_df.columns:
|
|
325
|
+
daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
|
|
326
|
+
else:
|
|
327
|
+
print(f"Column {col} not found in daily_df")
|
|
328
|
+
|
|
329
|
+
# Aggregate results by week
|
|
330
|
+
country_df = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
|
|
331
|
+
date_column="OBS",
|
|
332
|
+
group_columns=[],
|
|
333
|
+
sum_columns=value_columns,
|
|
334
|
+
wc=week_commencing,
|
|
335
|
+
aggregation="average")
|
|
336
|
+
|
|
337
|
+
oecd_df_final = pd.concat([oecd_df_final, country_df], axis=0, ignore_index=True)
|
|
338
|
+
|
|
339
|
+
return oecd_df_final
|
|
340
|
+
|
|
341
|
+
def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
|
|
342
|
+
"""
|
|
343
|
+
Fetch Google Mobility data for the specified country.
|
|
344
|
+
|
|
345
|
+
Parameters:
|
|
346
|
+
- country (str): The name of the country for which to fetch data.
|
|
347
|
+
|
|
348
|
+
Returns:
|
|
349
|
+
- pd.DataFrame: A DataFrame containing the Google Mobility data.
|
|
350
|
+
"""
|
|
351
|
+
# URL of the Google Mobility Reports CSV file
|
|
352
|
+
url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
|
|
353
|
+
|
|
354
|
+
# Fetch the CSV file
|
|
355
|
+
response = requests.get(url)
|
|
356
|
+
if response.status_code != 200:
|
|
357
|
+
raise Exception(f"Failed to fetch data: {response.status_code}")
|
|
358
|
+
|
|
359
|
+
# Load the CSV file into a pandas DataFrame
|
|
360
|
+
csv_data = StringIO(response.text)
|
|
361
|
+
df = pd.read_csv(csv_data, low_memory=False)
|
|
362
|
+
|
|
363
|
+
# Filter the DataFrame for the specified country
|
|
364
|
+
country_df = df[df['country_region'] == country]
|
|
365
|
+
|
|
366
|
+
final_covid = ims_proc.aggregate_daily_to_wc_wide(country_df, "date", [], ['retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
|
|
367
|
+
'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
|
|
368
|
+
'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline'], wc, "average")
|
|
369
|
+
|
|
370
|
+
final_covid1 = ims_proc.rename_cols(final_covid, 'covid_')
|
|
371
|
+
return final_covid1
|
|
372
|
+
|
|
373
|
+
############################################################### Seasonality ##########################################################################
|
|
374
|
+
|
|
375
|
+
def pull_seasonality(self, week_commencing, start_date, countries):
|
|
376
|
+
# ---------------------------------------------------------------------
|
|
377
|
+
# 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
|
|
378
|
+
# ---------------------------------------------------------------------
|
|
379
|
+
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
380
|
+
|
|
381
|
+
# ---------------------------------------------------------------------
|
|
382
|
+
# 1. Create daily date range from start_date to today
|
|
383
|
+
# ---------------------------------------------------------------------
|
|
384
|
+
date_range = pd.date_range(
|
|
385
|
+
start=pd.to_datetime(start_date),
|
|
386
|
+
end=datetime.today(),
|
|
387
|
+
freq="D"
|
|
388
|
+
)
|
|
389
|
+
df_daily = pd.DataFrame(date_range, columns=["Date"])
|
|
390
|
+
|
|
391
|
+
# ---------------------------------------------------------------------
|
|
392
|
+
# 1.1 Identify "week_start" for each daily row, based on week_commencing
|
|
393
|
+
# ---------------------------------------------------------------------
|
|
394
|
+
df_daily['week_start'] = df_daily["Date"].apply(
|
|
395
|
+
lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
# ---------------------------------------------------------------------
|
|
399
|
+
# 2. Build a weekly index (df_weekly_start) with dummy columns
|
|
400
|
+
# ---------------------------------------------------------------------
|
|
401
|
+
df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
|
|
402
|
+
df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
|
|
403
|
+
|
|
404
|
+
# Set index to weekly "start of week"
|
|
405
|
+
df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
|
|
406
|
+
df_weekly_start.set_index("Date", inplace=True)
|
|
407
|
+
|
|
408
|
+
# Create individual weekly dummies
|
|
409
|
+
dummy_columns = {}
|
|
410
|
+
for i in range(len(df_weekly_start)):
|
|
411
|
+
col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
|
|
412
|
+
dummy_columns[col_name] = [0] * len(df_weekly_start)
|
|
413
|
+
dummy_columns[col_name][i] = 1
|
|
414
|
+
|
|
415
|
+
df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
|
|
416
|
+
df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
|
|
417
|
+
|
|
418
|
+
# ---------------------------------------------------------------------
|
|
419
|
+
# 3. Public holidays (daily) from 'holidays' package + each holiday name
|
|
420
|
+
# ---------------------------------------------------------------------
|
|
421
|
+
for country in countries:
|
|
422
|
+
country_holidays = holidays.CountryHoliday(
|
|
423
|
+
country,
|
|
424
|
+
years=range(int(start_date[:4]), datetime.today().year + 1)
|
|
425
|
+
)
|
|
426
|
+
# Daily indicator: 1 if that date is a holiday
|
|
427
|
+
df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
|
|
428
|
+
lambda x: 1 if x in country_holidays else 0
|
|
429
|
+
)
|
|
430
|
+
# Create columns for specific holiday names
|
|
431
|
+
for date_hol, name in country_holidays.items():
|
|
432
|
+
col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
|
|
433
|
+
if col_name not in df_daily.columns:
|
|
434
|
+
df_daily[col_name] = 0
|
|
435
|
+
df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
|
|
436
|
+
|
|
437
|
+
# ---------------------------------------------------------------------
|
|
438
|
+
# 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
|
|
439
|
+
# We'll add daily columns for each.
|
|
440
|
+
# ---------------------------------------------------------------------
|
|
441
|
+
# Initialize columns
|
|
442
|
+
extra_cols = [
|
|
443
|
+
"seas_valentines_day",
|
|
444
|
+
"seas_halloween",
|
|
445
|
+
"seas_fathers_day_us_uk",
|
|
446
|
+
"seas_mothers_day_us",
|
|
447
|
+
"seas_mothers_day_uk",
|
|
448
|
+
"seas_good_friday",
|
|
449
|
+
"seas_easter_monday",
|
|
450
|
+
"seas_black_friday",
|
|
451
|
+
"seas_cyber_monday",
|
|
452
|
+
]
|
|
453
|
+
for c in extra_cols:
|
|
454
|
+
df_daily[c] = 0 # default zero
|
|
455
|
+
|
|
456
|
+
# Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
|
|
457
|
+
# weekday: Monday=0, Tuesday=1, ... Sunday=6
|
|
458
|
+
def nth_weekday_of_month(year, month, weekday, nth):
|
|
459
|
+
"""
|
|
460
|
+
Returns date of the nth <weekday> in <month> of <year>.
|
|
461
|
+
E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
|
|
462
|
+
"""
|
|
463
|
+
# 1st day of the month
|
|
464
|
+
d = datetime(year, month, 1)
|
|
465
|
+
# What is the weekday of day #1?
|
|
466
|
+
w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
|
|
467
|
+
# If we want, e.g. Sunday=6, we see how many days to add
|
|
468
|
+
delta = (weekday - w) % 7
|
|
469
|
+
# This is the first <weekday> in that month
|
|
470
|
+
first_weekday = d + timedelta(days=delta)
|
|
471
|
+
# Now add 7*(nth-1) days
|
|
472
|
+
return first_weekday + timedelta(days=7 * (nth-1))
|
|
473
|
+
|
|
474
|
+
def get_good_friday(year):
|
|
475
|
+
"""Good Friday is 2 days before Easter Sunday."""
|
|
476
|
+
return easter(year) - timedelta(days=2)
|
|
477
|
+
|
|
478
|
+
def get_easter_monday(year):
|
|
479
|
+
"""Easter Monday is 1 day after Easter Sunday."""
|
|
480
|
+
return easter(year) + timedelta(days=1)
|
|
481
|
+
|
|
482
|
+
def get_black_friday(year):
|
|
483
|
+
"""
|
|
484
|
+
Black Friday = day after US Thanksgiving,
|
|
485
|
+
and US Thanksgiving is the 4th Thursday in November.
|
|
486
|
+
"""
|
|
487
|
+
# 4th Thursday in November
|
|
488
|
+
fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
|
|
489
|
+
return fourth_thursday + timedelta(days=1)
|
|
490
|
+
|
|
491
|
+
def get_cyber_monday(year):
|
|
492
|
+
"""Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
|
|
493
|
+
# 4th Thursday in November
|
|
494
|
+
fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
|
|
495
|
+
return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
|
|
496
|
+
|
|
497
|
+
# Loop over each year in range
|
|
498
|
+
start_yr = int(start_date[:4])
|
|
499
|
+
end_yr = datetime.today().year
|
|
500
|
+
|
|
501
|
+
for yr in range(start_yr, end_yr + 1):
|
|
502
|
+
# Valentines = Feb 14
|
|
503
|
+
valentines_day = datetime(yr, 2, 14)
|
|
504
|
+
# Halloween = Oct 31
|
|
505
|
+
halloween_day = datetime(yr, 10, 31)
|
|
506
|
+
# Father's Day (US & UK) = 3rd Sunday in June
|
|
507
|
+
fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
|
|
508
|
+
# Mother's Day US = 2nd Sunday in May
|
|
509
|
+
mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
|
|
510
|
+
# Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday"
|
|
511
|
+
# We can approximate as: Easter Sunday - 21 days
|
|
512
|
+
# BUT we also must ensure it's actually Sunday
|
|
513
|
+
# (the 4th Sunday in Lent can shift. We'll do the official approach below.)
|
|
514
|
+
# Another approach: Easter Sunday - 7 * (4 weeks) is the 4th Sunday prior to Easter.
|
|
515
|
+
# But that might overshoot if Lent started mid-week.
|
|
516
|
+
# Let's do a quick approach:
|
|
517
|
+
# Officially: Mothering Sunday = 3 weeks before Easter Sunday (the 4th Sunday is Easter Sunday itself).
|
|
518
|
+
# So Easter - 21 days should be the Sunday, but let's confirm with weekday check.
|
|
519
|
+
mothering_sunday = easter(yr) - timedelta(days=21)
|
|
520
|
+
# If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
|
|
521
|
+
while mothering_sunday.weekday() != 6: # Sunday=6
|
|
522
|
+
mothering_sunday -= timedelta(days=1)
|
|
523
|
+
|
|
524
|
+
# Good Friday, Easter Monday
|
|
525
|
+
gf = get_good_friday(yr)
|
|
526
|
+
em = get_easter_monday(yr)
|
|
527
|
+
|
|
528
|
+
# Black Friday, Cyber Monday
|
|
529
|
+
bf = get_black_friday(yr)
|
|
530
|
+
cm = get_cyber_monday(yr)
|
|
531
|
+
|
|
532
|
+
# Mark them in df_daily if in range
|
|
533
|
+
for special_date, col in [
|
|
534
|
+
(valentines_day, "seas_valentines_day"),
|
|
535
|
+
(halloween_day, "seas_halloween"),
|
|
536
|
+
(fathers_day, "seas_fathers_day_us_uk"),
|
|
537
|
+
(mothers_day_us, "seas_mothers_day_us"),
|
|
538
|
+
(mothering_sunday, "seas_mothers_day_uk"),
|
|
539
|
+
(gf, "seas_good_friday"),
|
|
540
|
+
(em, "seas_easter_monday"),
|
|
541
|
+
(bf, "seas_black_friday"),
|
|
542
|
+
(cm, "seas_cyber_monday"),
|
|
543
|
+
]:
|
|
544
|
+
# Convert to pd.Timestamp:
|
|
545
|
+
special_ts = pd.Timestamp(special_date)
|
|
546
|
+
|
|
547
|
+
# Only set if it's within your daily range
|
|
548
|
+
if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
|
|
549
|
+
df_daily.loc[df_daily["Date"] == special_ts, col] = 1
|
|
550
|
+
|
|
551
|
+
# ---------------------------------------------------------------------
|
|
552
|
+
# 4. Add daily indicators for last day & last Friday of month
|
|
553
|
+
# Then aggregate them to weekly level using .max()
|
|
554
|
+
# ---------------------------------------------------------------------
|
|
555
|
+
# Last day of month (daily)
|
|
556
|
+
df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
|
|
557
|
+
lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
# Last Friday of month (daily)
|
|
561
|
+
def is_last_friday(date):
|
|
562
|
+
# last day of the month
|
|
563
|
+
last_day_of_month = date.to_period("M").to_timestamp("M")
|
|
564
|
+
last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
|
|
565
|
+
# Determine how many days we go back from the last day to get Friday (weekday=4)
|
|
566
|
+
if last_day_weekday >= 4:
|
|
567
|
+
days_to_subtract = last_day_weekday - 4
|
|
568
|
+
else:
|
|
569
|
+
days_to_subtract = last_day_weekday + 3
|
|
570
|
+
last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
|
|
571
|
+
return 1 if date == last_friday else 0
|
|
572
|
+
|
|
573
|
+
df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
|
|
574
|
+
|
|
575
|
+
# ---------------------------------------------------------------------
|
|
576
|
+
# 5. Weekly aggregation for holiday columns & monthly dummies
|
|
577
|
+
# ---------------------------------------------------------------------
|
|
578
|
+
# For monthly dummies, create a daily col "Month", then get_dummies
|
|
579
|
+
df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
|
|
580
|
+
df_monthly_dummies = pd.get_dummies(
|
|
581
|
+
df_daily,
|
|
582
|
+
prefix="seas",
|
|
583
|
+
columns=["Month"],
|
|
584
|
+
dtype=int
|
|
585
|
+
)
|
|
586
|
+
# Recalculate 'week_start' (already in df_daily, but just to be sure)
|
|
587
|
+
df_monthly_dummies['week_start'] = df_daily['week_start']
|
|
588
|
+
|
|
589
|
+
# Group monthly dummies by .sum() or .mean()—we often spread them across the week
|
|
590
|
+
df_monthly_dummies = (
|
|
591
|
+
df_monthly_dummies
|
|
592
|
+
.groupby('week_start')
|
|
593
|
+
.sum(numeric_only=True) # sum the daily flags
|
|
594
|
+
.reset_index()
|
|
595
|
+
.rename(columns={'week_start': "Date"})
|
|
596
|
+
.set_index("Date")
|
|
597
|
+
)
|
|
598
|
+
# Spread monthly dummies by 7 to distribute across that week
|
|
599
|
+
monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
|
|
600
|
+
df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
|
|
601
|
+
|
|
602
|
+
# Group holiday & special-day columns by .max() => binary at weekly level
|
|
603
|
+
df_holidays = (
|
|
604
|
+
df_daily
|
|
605
|
+
.groupby('week_start')
|
|
606
|
+
.max(numeric_only=True) # if any day=1 in that week, entire week=1
|
|
607
|
+
.reset_index()
|
|
608
|
+
.rename(columns={'week_start': "Date"})
|
|
609
|
+
.set_index("Date")
|
|
610
|
+
)
|
|
611
|
+
|
|
612
|
+
# ---------------------------------------------------------------------
|
|
613
|
+
# 6. Combine weekly start, monthly dummies, holiday flags
|
|
614
|
+
# ---------------------------------------------------------------------
|
|
615
|
+
df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
|
|
616
|
+
df_combined = pd.concat([df_combined, df_holidays], axis=1)
|
|
617
|
+
df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
|
|
618
|
+
|
|
619
|
+
# ---------------------------------------------------------------------
|
|
620
|
+
# 7. Create weekly dummies for Week of Year & yearly dummies
|
|
621
|
+
# ---------------------------------------------------------------------
|
|
622
|
+
df_combined.reset_index(inplace=True)
|
|
623
|
+
df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
|
|
624
|
+
|
|
625
|
+
df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
|
|
626
|
+
df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)
|
|
627
|
+
|
|
628
|
+
df_combined["Year"] = df_combined["Date"].dt.year
|
|
629
|
+
df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
|
|
630
|
+
|
|
631
|
+
# ---------------------------------------------------------------------
|
|
632
|
+
# 8. Add constant & trend
|
|
633
|
+
# ---------------------------------------------------------------------
|
|
634
|
+
df_combined["Constant"] = 1
|
|
635
|
+
df_combined["Trend"] = df_combined.index + 1
|
|
636
|
+
|
|
637
|
+
# ---------------------------------------------------------------------
|
|
638
|
+
# 9. Rename Date -> OBS and return
|
|
639
|
+
# ---------------------------------------------------------------------
|
|
640
|
+
df_combined.rename(columns={"Date": "OBS"}, inplace=True)
|
|
641
|
+
|
|
642
|
+
return df_combined
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def pull_weather(self, week_commencing, country) -> pd.DataFrame:
|
|
646
|
+
import pandas as pd
|
|
647
|
+
import urllib.request # noqa: F811
|
|
648
|
+
from datetime import datetime
|
|
649
|
+
import requests
|
|
650
|
+
from geopy.geocoders import Nominatim # noqa: F811
|
|
651
|
+
|
|
652
|
+
# Week commencing dictionary
|
|
653
|
+
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
654
|
+
|
|
655
|
+
# Country dictionary
|
|
656
|
+
country_dict = {"AUS": "AU__ASOS", "GBR": "GB__ASOS", "USA": "USCRN", "DEU": "DE__ASOS", "CAN": "Canada", "ZAF": "ZA__ASOS"}
|
|
657
|
+
|
|
658
|
+
# Function to flatten a list of nested lists into a list
|
|
659
|
+
def flatten_list(nested_list):
|
|
660
|
+
return [item for sublist in nested_list for item in sublist]
|
|
661
|
+
|
|
662
|
+
# Choose country
|
|
663
|
+
country = country_dict[country]
|
|
664
|
+
|
|
665
|
+
# Choose start and end dates
|
|
666
|
+
start_day = 1
|
|
667
|
+
start_month = 1
|
|
668
|
+
start_year = 2014
|
|
669
|
+
formatted_date = datetime(start_year, start_month, start_day).strftime("%Y-%m-%d")
|
|
670
|
+
today = datetime.now()
|
|
671
|
+
end_day = today.day
|
|
672
|
+
end_month = today.month
|
|
673
|
+
end_year = today.year
|
|
674
|
+
|
|
675
|
+
if country == "GB__ASOS":
|
|
676
|
+
stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
|
|
677
|
+
"&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
|
|
678
|
+
"&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
|
|
679
|
+
"&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
|
|
680
|
+
"&stations=EGNT"]
|
|
681
|
+
elif country == "AU__ASOS":
|
|
682
|
+
stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
|
|
683
|
+
"&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
|
|
684
|
+
"&stations=YPAD", "&stations=YPPH"]
|
|
685
|
+
elif country == "USCRN":
|
|
686
|
+
stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797", # North
|
|
687
|
+
"&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301", # Midwest
|
|
688
|
+
"&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862", # South
|
|
689
|
+
"&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"] # West
|
|
690
|
+
elif country == "DE__ASOS":
|
|
691
|
+
stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
|
|
692
|
+
"&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
|
|
693
|
+
"&stations=EDLW", "&stations=EDDM"]
|
|
694
|
+
elif country == "FR__ASOS":
|
|
695
|
+
stations = ["&stations=LFPB"]
|
|
696
|
+
elif country == "Canada":
|
|
697
|
+
institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
|
|
698
|
+
"CA_NU_ASOS"]
|
|
699
|
+
stations_list = [[] for _ in range(5)]
|
|
700
|
+
stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
|
|
701
|
+
"&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])
|
|
702
|
+
|
|
703
|
+
stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
|
|
704
|
+
"&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
|
|
705
|
+
"&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
|
|
706
|
+
"&stations=CYWK", "&stations=CYWK"])
|
|
707
|
+
|
|
708
|
+
stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
|
|
709
|
+
"&stations=CWND", "&stations=CXTV", "&stations=CYVL",
|
|
710
|
+
"&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
|
|
711
|
+
"&stations=CYLK", "&stations=CWID", "&stations=CYRF",
|
|
712
|
+
"&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])
|
|
713
|
+
|
|
714
|
+
stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
|
|
715
|
+
"&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
|
|
716
|
+
"&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
|
|
717
|
+
"&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])
|
|
718
|
+
|
|
719
|
+
stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
|
|
720
|
+
"&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
|
|
721
|
+
"&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
|
|
722
|
+
"&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])
|
|
723
|
+
|
|
724
|
+
elif country == "ZA__ASOS":
|
|
725
|
+
cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
|
|
726
|
+
stations = []
|
|
727
|
+
|
|
728
|
+
for city in cities:
|
|
729
|
+
geolocator = Nominatim(user_agent="MyApp")
|
|
730
|
+
location = geolocator.geocode(city)
|
|
731
|
+
stations.append(f"&latitude={location.latitude}&longitude={location.longitude}")
|
|
732
|
+
|
|
733
|
+
# Temperature
|
|
734
|
+
if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
|
|
735
|
+
# We start by making a data frame of the following weather stations
|
|
736
|
+
station_query = ''.join(stations)
|
|
737
|
+
|
|
738
|
+
raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
|
|
739
|
+
station_query,
|
|
740
|
+
"&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
|
|
741
|
+
"&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
|
|
742
|
+
raw_weather = urllib.request.urlopen(raw_weather_list)
|
|
743
|
+
raw_weather = pd.read_csv(raw_weather)
|
|
744
|
+
|
|
745
|
+
# Replace the occurrences of "None" with Missing Value
|
|
746
|
+
raw_weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
747
|
+
raw_weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
748
|
+
|
|
749
|
+
# Remove any data that isn't temperature-related
|
|
750
|
+
weather = raw_weather.iloc[:, 0:4]
|
|
751
|
+
|
|
752
|
+
weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)
|
|
753
|
+
|
|
754
|
+
# Estimate mean temperature
|
|
755
|
+
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
756
|
+
|
|
757
|
+
# Convert Fahrenheit to Celsius for max_temp_f
|
|
758
|
+
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
759
|
+
|
|
760
|
+
# Convert Fahrenheit to Celsius for min_temp_f
|
|
761
|
+
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
762
|
+
|
|
763
|
+
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
764
|
+
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
765
|
+
|
|
766
|
+
# Aggregate the data to week commencing sunday taking the average of the data
|
|
767
|
+
# Convert the date column to a Date type
|
|
768
|
+
weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
|
|
769
|
+
|
|
770
|
+
# Determine the starting chosen day for each date
|
|
771
|
+
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
772
|
+
|
|
773
|
+
# Group by week_starting and summarize
|
|
774
|
+
numeric_columns = weather.select_dtypes(include='number').columns
|
|
775
|
+
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
776
|
+
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
777
|
+
"min_temp_f": "avg_min_temp_f",
|
|
778
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
779
|
+
"max_temp_c": "avg_max_temp_c",
|
|
780
|
+
"min_temp_c": "avg_min_temp_c",
|
|
781
|
+
"mean_temp_c": "avg_mean_temp_c"}, inplace=True)
|
|
782
|
+
elif country == "Canada":
|
|
783
|
+
for i in range(len(institute_vector)):
|
|
784
|
+
station_query_temp = ''.join(flatten_list(stations_list[i]))
|
|
785
|
+
institute_temp = institute_vector[i]
|
|
786
|
+
raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
|
|
787
|
+
station_query_temp,
|
|
788
|
+
"&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
|
|
789
|
+
"&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
|
|
790
|
+
raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
|
|
791
|
+
raw_weather_temp = pd.read_csv(raw_weather_temp)
|
|
792
|
+
|
|
793
|
+
if len(raw_weather_temp.index) == 0:
|
|
794
|
+
continue
|
|
795
|
+
raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]
|
|
796
|
+
|
|
797
|
+
if i == 1:
|
|
798
|
+
raw_weather = raw_weather_temp
|
|
799
|
+
else:
|
|
800
|
+
raw_weather = pd.concat([raw_weather, raw_weather_temp])
|
|
801
|
+
|
|
802
|
+
# Drop error column if it exists
|
|
803
|
+
if 'ERROR: Invalid network specified' in list(raw_weather.columns):
|
|
804
|
+
raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)
|
|
805
|
+
|
|
806
|
+
# Replace none values
|
|
807
|
+
raw_weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
808
|
+
raw_weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
809
|
+
raw_weather["precip_in"].replace("None", 0, inplace=True)
|
|
810
|
+
|
|
811
|
+
weather = raw_weather
|
|
812
|
+
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
813
|
+
|
|
814
|
+
# Estimate mean temperature
|
|
815
|
+
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
816
|
+
|
|
817
|
+
# Convert Fahrenheit to Celsius for max_temp_f
|
|
818
|
+
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
819
|
+
|
|
820
|
+
# Convert Fahrenheit to Celsius for min_temp_f
|
|
821
|
+
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
822
|
+
|
|
823
|
+
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
824
|
+
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
825
|
+
|
|
826
|
+
# Aggregate the data to week commencing sunday taking the average of the data
|
|
827
|
+
# Convert the date column to a Date type
|
|
828
|
+
weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
|
|
829
|
+
|
|
830
|
+
# Determine the starting chosen day for each date
|
|
831
|
+
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
832
|
+
|
|
833
|
+
# Group by week_starting and summarize
|
|
834
|
+
numeric_columns = weather.select_dtypes(include='number').columns
|
|
835
|
+
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
836
|
+
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
837
|
+
"min_temp_f": "avg_min_temp_f",
|
|
838
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
839
|
+
"max_temp_c": "avg_max_temp_c",
|
|
840
|
+
"min_temp_c": "avg_min_temp_c",
|
|
841
|
+
"mean_temp_c": "avg_mean_temp_c",
|
|
842
|
+
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
843
|
+
elif country == "ZA__ASOS":
|
|
844
|
+
weather_data_list = []
|
|
845
|
+
|
|
846
|
+
for city in cities:
|
|
847
|
+
geolocator = Nominatim(user_agent="MyApp")
|
|
848
|
+
location = geolocator.geocode(city)
|
|
849
|
+
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
850
|
+
|
|
851
|
+
params = {
|
|
852
|
+
"latitude": location.latitude,
|
|
853
|
+
"longitude": location.longitude,
|
|
854
|
+
"start_date": formatted_date,
|
|
855
|
+
"end_date": today.strftime("%Y-%m-%d"),
|
|
856
|
+
"daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
|
|
857
|
+
"timezone": "auto"
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
response = requests.get(url, params=params)
|
|
861
|
+
response_data = response.json()
|
|
862
|
+
|
|
863
|
+
daily_data = response_data["daily"]
|
|
864
|
+
dates = daily_data["time"]
|
|
865
|
+
|
|
866
|
+
data = pd.DataFrame({
|
|
867
|
+
"day": dates,
|
|
868
|
+
"max_temp_f": daily_data["temperature_2m_max"],
|
|
869
|
+
"min_temp_f": daily_data["temperature_2m_min"],
|
|
870
|
+
"precip_in": daily_data["precipitation_sum"]
|
|
871
|
+
})
|
|
872
|
+
data["city"] = city
|
|
873
|
+
weather_data_list.append(data)
|
|
874
|
+
|
|
875
|
+
weather = pd.concat(weather_data_list)
|
|
876
|
+
|
|
877
|
+
# Convert the date column to a Date type
|
|
878
|
+
weather["day"] = pd.to_datetime(weather["day"])
|
|
879
|
+
|
|
880
|
+
# Replace None values
|
|
881
|
+
weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
882
|
+
weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
883
|
+
weather["precip_in"].replace("None", 0, inplace=True)
|
|
884
|
+
|
|
885
|
+
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
886
|
+
|
|
887
|
+
# Estimate mean temperature
|
|
888
|
+
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
889
|
+
|
|
890
|
+
# Convert Fahrenheit to Celsius for max_temp_f
|
|
891
|
+
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
892
|
+
|
|
893
|
+
# Convert Fahrenheit to Celsius for min_temp_f
|
|
894
|
+
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
895
|
+
|
|
896
|
+
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
897
|
+
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
898
|
+
|
|
899
|
+
# Determine the starting chosen day for each date
|
|
900
|
+
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
901
|
+
|
|
902
|
+
# Group by week_starting and summarize
|
|
903
|
+
numeric_columns = weather.select_dtypes(include='number').columns
|
|
904
|
+
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
905
|
+
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
906
|
+
"min_temp_f": "avg_min_temp_f",
|
|
907
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
908
|
+
"max_temp_c": "avg_max_temp_c",
|
|
909
|
+
"min_temp_c": "avg_min_temp_c",
|
|
910
|
+
"mean_temp_c": "avg_mean_temp_c",
|
|
911
|
+
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
912
|
+
|
|
913
|
+
else:
|
|
914
|
+
# We start by making a data frame of the following weather stations
|
|
915
|
+
station_query = ''.join(stations)
|
|
916
|
+
|
|
917
|
+
raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
|
|
918
|
+
station_query,
|
|
919
|
+
"&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
|
|
920
|
+
"&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
|
|
921
|
+
raw_weather = urllib.request.urlopen(raw_weather_list)
|
|
922
|
+
raw_weather = pd.read_csv(raw_weather)
|
|
923
|
+
|
|
924
|
+
raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]
|
|
925
|
+
|
|
926
|
+
# Replace the occurrences of "None" with Missing Value
|
|
927
|
+
raw_weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
928
|
+
raw_weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
929
|
+
raw_weather["precip_in"].replace("None", 0, inplace=True)
|
|
930
|
+
|
|
931
|
+
# Remove any data that isn't temperature-related
|
|
932
|
+
weather = raw_weather
|
|
933
|
+
|
|
934
|
+
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
935
|
+
|
|
936
|
+
# Estimate mean temperature
|
|
937
|
+
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
938
|
+
|
|
939
|
+
# Convert Fahrenheit to Celsius for max_temp_f
|
|
940
|
+
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
941
|
+
|
|
942
|
+
# Convert Fahrenheit to Celsius for min_temp_f
|
|
943
|
+
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
944
|
+
|
|
945
|
+
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
946
|
+
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
947
|
+
|
|
948
|
+
# Aggregate the data to week commencing sunday taking the average of the data
|
|
949
|
+
# Convert the date column to a Date type
|
|
950
|
+
weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
|
|
951
|
+
|
|
952
|
+
# Determine the starting chosen day for each date
|
|
953
|
+
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
954
|
+
|
|
955
|
+
# Group by week_starting and summarize
|
|
956
|
+
numeric_columns = weather.select_dtypes(include='number').columns
|
|
957
|
+
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
958
|
+
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
959
|
+
"min_temp_f": "avg_min_temp_f",
|
|
960
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
961
|
+
"max_temp_c": "avg_max_temp_c",
|
|
962
|
+
"min_temp_c": "avg_min_temp_c",
|
|
963
|
+
"mean_temp_c": "avg_mean_temp_c",
|
|
964
|
+
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
965
|
+
|
|
966
|
+
# Rainfall
|
|
967
|
+
if country == "GB__ASOS":
|
|
968
|
+
# Define cities and date range
|
|
969
|
+
cities = ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"]
|
|
970
|
+
|
|
971
|
+
start_date = formatted_date
|
|
972
|
+
end_date = today.strftime("%Y-%m-%d")
|
|
973
|
+
|
|
974
|
+
# Initialize an empty list to store the weather data for each city
|
|
975
|
+
weather_data_list = []
|
|
976
|
+
|
|
977
|
+
# Loop through each city and fetch weather data
|
|
978
|
+
for city in cities:
|
|
979
|
+
# Initialize Nominatim API
|
|
980
|
+
geolocator = Nominatim(user_agent="MyApp")
|
|
981
|
+
location = geolocator.geocode(city)
|
|
982
|
+
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
983
|
+
|
|
984
|
+
params = {
|
|
985
|
+
"latitude": location.latitude,
|
|
986
|
+
"longitude": location.longitude,
|
|
987
|
+
"start_date": start_date,
|
|
988
|
+
"end_date": end_date,
|
|
989
|
+
"daily": "precipitation_sum",
|
|
990
|
+
"timezone": "auto"
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
response = requests.get(url, params=params)
|
|
994
|
+
response_data = response.json()
|
|
995
|
+
|
|
996
|
+
daily_data = response_data["daily"]["precipitation_sum"]
|
|
997
|
+
dates = response_data["daily"]["time"]
|
|
998
|
+
|
|
999
|
+
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
1000
|
+
data["city"] = city
|
|
1001
|
+
|
|
1002
|
+
weather_data_list.append(data)
|
|
1003
|
+
|
|
1004
|
+
# Combine all city data into a single data frame
|
|
1005
|
+
all_weather_data = pd.concat(weather_data_list)
|
|
1006
|
+
|
|
1007
|
+
# Convert the date column to a Date type
|
|
1008
|
+
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
1009
|
+
|
|
1010
|
+
# Set week commencing col up
|
|
1011
|
+
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
1012
|
+
|
|
1013
|
+
# Group by week_starting and summarize
|
|
1014
|
+
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
1015
|
+
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
1016
|
+
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
1017
|
+
|
|
1018
|
+
# Change index to datetime
|
|
1019
|
+
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
1020
|
+
|
|
1021
|
+
elif country == "AU__ASOS":
|
|
1022
|
+
|
|
1023
|
+
# Define cities and date range
|
|
1024
|
+
cities = ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"]
|
|
1025
|
+
|
|
1026
|
+
start_date = formatted_date
|
|
1027
|
+
end_date = today.strftime("%Y-%m-%d")
|
|
1028
|
+
|
|
+            # Initialize an empty list to store the weather data for each city
+            weather_data_list = []
+
+            # Loop through each city and fetch weather data
+            for city in cities:
+                # Initialize Nominatim API
+                geolocator = Nominatim(user_agent="MyApp")
+                location = geolocator.geocode(city)
+                url = "https://archive-api.open-meteo.com/v1/archive"
+
+                params = {
+                    "latitude": location.latitude,
+                    "longitude": location.longitude,
+                    "start_date": start_date,
+                    "end_date": end_date,
+                    "daily": "precipitation_sum",
+                    "timezone": "auto"
+                }
+
+                response = requests.get(url, params=params)
+                response_data = response.json()
+
+                daily_data = response_data["daily"]["precipitation_sum"]
+                dates = response_data["daily"]["time"]
+
+                data = pd.DataFrame({"date": dates, "rainfall": daily_data})
+                data["city"] = city
+
+                weather_data_list.append(data)
+
+            # Combine all city data into a single data frame
+            all_weather_data = pd.concat(weather_data_list)
+
+            # Convert the date column to a Date type
+            all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
+
+            # Set week commencing col up
+            all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+
+            # Group by week_starting and summarize
+            numeric_columns = all_weather_data.select_dtypes(include='number').columns
+            weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
+            weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
+
+            # Change index to datetime
+            weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
+
+        elif country == "DE__ASOS":
+
+            # Define cities and date range
+            cities = ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"]
+
+            start_date = formatted_date
+            end_date = today.strftime("%Y-%m-%d")
+
+            # Initialize an empty list to store the weather data for each city
+            weather_data_list = []
+
+            # Loop through each city and fetch weather data
+            for city in cities:
+                # Initialize Nominatim API
+                geolocator = Nominatim(user_agent="MyApp")
+                location = geolocator.geocode(city)
+                url = "https://archive-api.open-meteo.com/v1/archive"
+
+                params = {
+                    "latitude": location.latitude,
+                    "longitude": location.longitude,
+                    "start_date": start_date,
+                    "end_date": end_date,
+                    "daily": "precipitation_sum",
+                    "timezone": "auto"
+                }
+
+                response = requests.get(url, params=params)
+                response_data = response.json()
+
+                daily_data = response_data["daily"]["precipitation_sum"]
+                dates = response_data["daily"]["time"]
+
+                data = pd.DataFrame({"date": dates, "rainfall": daily_data})
+                data["city"] = city
+
+                weather_data_list.append(data)
+
+            # Combine all city data into a single data frame
+            all_weather_data = pd.concat(weather_data_list)
+
+            # Convert the date column to a Date type
+            all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
+
+            # Set week commencing col up
+            all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+
+            # Group by week_starting and summarize
+            numeric_columns = all_weather_data.select_dtypes(include='number').columns
+            weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
+            weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
+
+            # Change index to datetime
+            weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
+
+        elif country == "FR__ASOS":
+
+            # Define cities and date range
+            cities = ["Paris"]
+
+            start_date = formatted_date
+            end_date = today.strftime("%Y-%m-%d")
+
+            # Initialize an empty list to store the weather data for each city
+            weather_data_list = []
+
+            # Loop through each city and fetch weather data
+            for city in cities:
+                # Initialize Nominatim API
+                geolocator = Nominatim(user_agent="MyApp")
+                location = geolocator.geocode(city)
+                url = "https://archive-api.open-meteo.com/v1/archive"
+
+                params = {
+                    "latitude": location.latitude,
+                    "longitude": location.longitude,
+                    "start_date": start_date,
+                    "end_date": end_date,
+                    "daily": "precipitation_sum",
+                    "timezone": "auto"
+                }
+
+                response = requests.get(url, params=params)
+                response_data = response.json()
+
+                daily_data = response_data["daily"]["precipitation_sum"]
+                dates = response_data["daily"]["time"]
+
+                data = pd.DataFrame({"date": dates, "rainfall": daily_data})
+                data["city"] = city
+
+                weather_data_list.append(data)
+
+            # Combine all city data into a single data frame
+            all_weather_data = pd.concat(weather_data_list)
+
+            # Convert the date column to a Date type
+            all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
+
+            # Set week commencing col up
+            all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+
+            # Group by week_starting and summarize
+            numeric_columns = all_weather_data.select_dtypes(include='number').columns
+            weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
+            weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
+
+            # Change index to datetime
+            weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
+
+        elif country == "ZA__ASOS":
+            cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
+            start_date = formatted_date
+            end_date = today.strftime("%Y-%m-%d")
+
+            weather_data_list = []
+
+            for city in cities:
+                geolocator = Nominatim(user_agent="MyApp")
+                location = geolocator.geocode(city)
+                url = "https://archive-api.open-meteo.com/v1/archive"
+
+                params = {
+                    "latitude": location.latitude,
+                    "longitude": location.longitude,
+                    "start_date": start_date,
+                    "end_date": end_date,
+                    "daily": "precipitation_sum",
+                    "timezone": "auto"
+                }
+
+                response = requests.get(url, params=params)
+                response_data = response.json()
+
+                daily_data = response_data["daily"]["precipitation_sum"]
+                dates = response_data["daily"]["time"]
+
+                data = pd.DataFrame({"date": dates, "rainfall": daily_data})
+                data["city"] = city
+
+                weather_data_list.append(data)
+
+            # Combine all city data into a single data frame
+            all_weather_data = pd.concat(weather_data_list)
+
+            # Convert the date column to a Date type
+            all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
+
+            # Set week commencing col up
+            all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+
+            # Group by week_starting and summarize
+            numeric_columns = all_weather_data.select_dtypes(include='number').columns
+            weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
+            weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
+
+            # Change index to datetime
+            weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
+
+        # Merge the dataframes
+        if country in ["AU__ASOS", "DE__ASOS", "FR__ASOS", "GB__ASOS", "ZA__ASOS"]:
+            merged_df = weekly_avg_rain.merge(weekly_avg_temp, on="week_starting")
+        else:
+            merged_df = weekly_avg_temp
+
+        merged_df.reset_index(drop=False, inplace=True)
+        merged_df.rename(columns={'week_starting': 'OBS'}, inplace=True)
+
+        final_weather = ims_proc.rename_cols(merged_df, 'seas_')
+
+        return final_weather
+
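Each country branch in the weather routine above repeats the same pattern: geocode a city with Nominatim, request the daily precipitation_sum from the Open-Meteo archive endpoint between start_date and end_date, and collect one frame per city. The sketch below condenses that shared pattern for reference only; the helper name fetch_daily_rainfall is hypothetical and not part of the package, and it assumes geopy, requests and pandas are installed and that geocoding succeeds.

import pandas as pd
import requests
from geopy.geocoders import Nominatim

def fetch_daily_rainfall(city: str, start_date: str, end_date: str) -> pd.DataFrame:
    # Hypothetical helper mirroring the per-country fetch logic above.
    location = Nominatim(user_agent="MyApp").geocode(city)  # assumes the city geocodes successfully
    params = {
        "latitude": location.latitude,
        "longitude": location.longitude,
        "start_date": start_date,
        "end_date": end_date,
        "daily": "precipitation_sum",
        "timezone": "auto",
    }
    response = requests.get("https://archive-api.open-meteo.com/v1/archive", params=params)
    response.raise_for_status()
    daily = response.json()["daily"]
    return pd.DataFrame({"date": daily["time"], "rainfall": daily["precipitation_sum"], "city": city})

# Example: rainfall = pd.concat(fetch_daily_rainfall(c, "2020-01-01", "2020-12-31") for c in ["Paris", "Berlin"])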
+    def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
+        """
+        Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+        aggregates it to weekly averages, and renames variables based on specified rules.
+
+        Parameters:
+            cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+            week_start_day (str): The day the week starts on (e.g., 'mon', 'sun').
+            sector (str): The sector whose standard CDIDs are added (e.g., 'fast_food'); unrecognised sectors fall back to the default CDIDs.
+
+        Returns:
+            pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
+                          and all series as renamed columns.
+        """
+        # Define CDIDs for sectors and defaults
+        sector_cdids = {
+            "fast_food": ["L7TD", "L78Q", "DOAD"],
+            "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
+        }
+
+        default_cdids = sector_cdids["default"]
+        sector_specific_cdids = sector_cdids.get(sector, [])
+        standard_cdids = list(set(default_cdids + sector_specific_cdids))  # Avoid duplicates
+
+        # Combine standard CDIDs and additional CDIDs
+        if cdid_list is None:
+            cdid_list = []
+        cdid_list = list(set(standard_cdids + cdid_list))  # Avoid duplicates
+
+        base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
+        base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
+        combined_df = pd.DataFrame()
+
+        # Map week start day to pandas weekday convention
+        days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+        if week_start_day not in days_map:
+            raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+        week_start = days_map[week_start_day]
+
+        for cdid in cdid_list:
+            try:
+                # Search for the series
+                search_url = f"{base_search_url}{cdid}"
+                search_response = requests.get(search_url)
+                search_response.raise_for_status()
+                search_data = search_response.json()
+
+                items = search_data.get("items", [])
+                if not items:
+                    print(f"No data found for CDID: {cdid}")
+                    continue
+
+                # Extract series name and latest release URI
+                series_name = items[0].get("title", f"Series_{cdid}")
+                latest_date = max(
+                    datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                    for item in items if "release_date" in item
+                )
+                latest_uri = next(
+                    item["uri"] for item in items
+                    if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
+                )
+
+                # Fetch the dataset
+                data_url = f"{base_data_url}{latest_uri}"
+                data_response = requests.get(data_url)
+                data_response.raise_for_status()
+                data_json = data_response.json()
+
+                # Detect the frequency and process accordingly
+                if "months" in data_json and data_json["months"]:
+                    frequency_key = "months"
+                elif "quarters" in data_json and data_json["quarters"]:
+                    frequency_key = "quarters"
+                elif "years" in data_json and data_json["years"]:
+                    frequency_key = "years"
+                else:
+                    print(f"Unsupported frequency or no data for CDID: {cdid}")
+                    continue
+
+                # Prepare the DataFrame
+                df = pd.DataFrame(data_json[frequency_key])
+
+                # Parse the 'date' field based on frequency
+                if frequency_key == "months":
+                    df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+                elif frequency_key == "quarters":
+                    def parse_quarter(quarter_str):
+                        year, qtr = quarter_str.split(" Q")
+                        month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                        return datetime(int(year), month, 1)
+                    df["date"] = df["date"].apply(parse_quarter)
+                elif frequency_key == "years":
+                    df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+
+                df["value"] = pd.to_numeric(df["value"], errors="coerce")
+                df.rename(columns={"value": series_name}, inplace=True)
+
+                # Combine data
+                df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+                if combined_df.empty:
+                    combined_df = df
+                else:
+                    combined_df = pd.merge(combined_df, df, on="date", how="outer")
+
+            except requests.exceptions.RequestException as e:
+                print(f"Error fetching data for CDID {cdid}: {e}")
+            except (KeyError, ValueError) as e:
+                print(f"Error processing data for CDID {cdid}: {e}")
+
+        if not combined_df.empty:
+            min_date = combined_df["date"].min()
+            max_date = datetime.today()
+            date_range = pd.date_range(start=min_date, end=max_date, freq='D')
+            daily_df = pd.DataFrame(date_range, columns=['date'])
+            daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
+            daily_df = daily_df.ffill()
+
+            # Aggregate to weekly frequency
+            daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+            weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
+
+            def clean_column_name(name):
+                name = re.sub(r"\(.*?\)", "", name)
+                name = re.split(r":", name)[0]
+                name = re.sub(r"\d+", "", name)
+                name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+                name = re.sub(r"[^\w\s]", "", name)
+                name = name.replace(" ", "_")
+                name = re.sub(r"_+", "_", name)
+                name = name.rstrip("_")
+                return f"macro_{name.lower()}_uk"
+
+            weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
+            weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+
+            weekly_df = weekly_df.fillna(0)
+
+            return weekly_df
+        else:
+            print("No data available to process.")
+            return pd.DataFrame()
+
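The weekly aggregation in pull_macro_ons_uk (and in pull_yfinance below) snaps every daily observation back to the configured week start with a small piece of modular arithmetic: each date is shifted back by (weekday - week_start) % 7 days. A short, self-contained illustration of that step, using arbitrary example dates, is shown here; it is not part of the package.

import pandas as pd

dates = pd.Series(pd.to_datetime(["2024-01-03", "2024-01-07", "2024-01-08"]))  # Wed, Sun, Mon
week_start = 0  # Monday, i.e. days_map["mon"]

# Shift each date back onto the most recent week-start day
week_commencing = dates - pd.to_timedelta((dates.dt.weekday - week_start) % 7, unit="D")
print(week_commencing.dt.date.tolist())  # [2024-01-01, 2024-01-01, 2024-01-08]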
+    def pull_yfinance(self, tickers=None, week_start_day="mon"):
+        """
+        Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
+        aggregates it to weekly averages, and renames variables.
+
+        Parameters:
+            tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
+            week_start_day (str): The day the week starts on (e.g., 'mon', 'sun').
+
+        Returns:
+            pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
+                          and aggregated stock data for the specified tickers, with NaN values filled with 0.
+        """
+        # Define default tickers
+        default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
+
+        # Combine default tickers with additional ones
+        if tickers is None:
+            tickers = []
+        tickers = list(set(default_tickers + tickers))  # Ensure no duplicates
+
+        # Automatically set end_date to today
+        end_date = datetime.today().strftime("%Y-%m-%d")
+
+        # Mapping week start day to pandas weekday convention
+        days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+        if week_start_day not in days_map:
+            raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+        week_start = days_map[week_start_day]
+
+        # Fetch data for all tickers without specifying a start date to get all available data
+        data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
+
+        # Process the data
+        combined_df = pd.DataFrame()
+        for ticker in tickers:
+            try:
+                # Extract the ticker's data
+                ticker_data = data[ticker] if len(tickers) > 1 else data
+                ticker_data = ticker_data.reset_index()
+
+                # Ensure necessary columns are present
+                if "Close" not in ticker_data.columns:
+                    raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
+
+                # Keep only relevant columns
+                ticker_data = ticker_data[["Date", "Close"]]
+                ticker_data.rename(columns={"Close": ticker}, inplace=True)
+
+                # Merge data
+                if combined_df.empty:
+                    combined_df = ticker_data
+                else:
+                    combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")
+
+            except KeyError:
+                print(f"Data for ticker {ticker} not available.")
+            except Exception as e:
+                print(f"Error processing ticker {ticker}: {e}")
+
+        if not combined_df.empty:
+            # Convert to daily frequency
+            combined_df["Date"] = pd.to_datetime(combined_df["Date"])
+            combined_df.set_index("Date", inplace=True)
+
+            # Fill missing dates
+            min_date = combined_df.index.min()
+            max_date = combined_df.index.max()
+            daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
+            combined_df = combined_df.reindex(daily_index)
+            combined_df.index.name = "Date"
+            combined_df = combined_df.ffill()
+
+            # Aggregate to weekly frequency
+            combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
+            weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
+
+            # Fill NaN values with 0
+            weekly_df = weekly_df.fillna(0)
+
+            # Clean column names
+            def clean_column_name(name):
+                name = re.sub(r"[^\w\s]", "", name)
+                return f"macro_{name.lower()}"
+
+            weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
+
+            return weekly_df
+
+        else:
+            print("No data available to process.")
+            return pd.DataFrame()
+
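Both pull_macro_ons_uk and pull_yfinance return week-level frames keyed on an 'OBS' column, so their outputs can be joined directly. A usage sketch is shown below; the CDID, sector and ticker values are simply the examples from the docstrings, not recommendations, and the resulting column names depend on the series returned.

from imsciences.pull import datapull

dp = datapull()
macro = dp.pull_macro_ons_uk(cdid_list=["JP9Z"], week_start_day="mon", sector="fast_food")
markets = dp.pull_yfinance(tickers=["AAPL"], week_start_day="mon")

# Both frames share the weekly 'OBS' key, so they can be combined for modelling
combined = macro.merge(markets, on="OBS", how="inner")
print(combined.head())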