imsciences 0.6.3.1__py3-none-any.whl → 0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +1 -1
- imsciences/datafunctions.py +507 -419
- imsciences/unittesting.py +1335 -0
- imsciences-0.8.dist-info/METADATA +433 -0
- {imsciences-0.6.3.1.dist-info → imsciences-0.8.dist-info}/RECORD +8 -7
- {imsciences-0.6.3.1.dist-info → imsciences-0.8.dist-info}/WHEEL +1 -1
- imsciences-0.6.3.1.dist-info/METADATA +0 -24
- {imsciences-0.6.3.1.dist-info → imsciences-0.8.dist-info}/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.6.3.1.dist-info → imsciences-0.8.dist-info}/top_level.txt +0 -0
imsciences/datafunctions.py
CHANGED
@@ -4,22 +4,18 @@ import os
 import plotly.express as px
 import plotly.graph_objs as go
 import numpy as np
-import datetime
 import re
-import pandas as pd
 from fredapi import Fred
 import time
-from datetime import datetime,timedelta
-from cif import cif
+from datetime import datetime, timedelta
 from io import StringIO
-import urllib
-import requests_cache
-import urllib.request
 import requests
-from geopy.geocoders import Nominatim
 import subprocess
 import json
 import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+import yfinance as yf
+import holidays
 
 class dataprocessing:
 
@@ -391,7 +387,7 @@ class dataprocessing:
         # Divide each numeric value by the number of days in the month
         for col in df.columns:
             if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-                if divide
+                if divide is True:
                     daily_row[col] = row[col] / num_days
                 else:
                     daily_row[col] = row[col]
@@ -678,7 +674,7 @@ class dataprocessing:
 
         return combined_df
 
-    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc=
+    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
         """
         Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
 
@@ -701,53 +697,57 @@ class dataprocessing:
             pandas.DataFrame: The pivot table specified
         """
 
-        #
-        if
-
-
+        # Validate inputs
+        if index_col not in df.columns:
+            raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+        if columns not in df.columns:
+            raise ValueError(f"columns '{columns}' not found in DataFrame.")
+        if values_col not in df.columns:
+            raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+        # Apply filters if provided
+        if filters_dict:
             df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-
-
-
-        # If datetime transformation is needed
+        else:
+            df_filtered = df.copy()
+
+        # Ensure index column is in datetime format if needed
         if datetime_trans_needed:
             df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
+
         # Create the pivot table
-        pivoted_df = df_filtered.pivot_table(
-
-
-
-
-
-
-
+        pivoted_df = df_filtered.pivot_table(
+            index=index_col,
+            columns=columns,
+            values=values_col,
+            aggfunc=aggfunc,
+            margins=margins,
+            margins_name=margins_name,
+        )
+
+        # Handle column headers
+        if isinstance(pivoted_df.columns, pd.MultiIndex):
+            pivoted_df.columns = [
+                "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+                for col in pivoted_df.columns.values
+            ]
         else:
-
-
-
-
-
-
-            pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
-
-        # Reset the pivot before returning
-        pivoted_df = pivoted_df.reset_index()
-
-        # Sort by index column from oldest to newest
+            pivoted_df.columns = pivoted_df.columns.map(str)
+
+        # Reset the index
+        pivoted_df.reset_index(inplace=True)
+
+        # Handle sorting and formatting of index column
         if datetime_trans_needed:
-            pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col])
-            pivoted_df
-
-
-
-
-
-
-
-        # If there is a need to fill in missing weeks
-        if fill_missing_weekly_dates == True:
+            pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+            pivoted_df.sort_values(by=index_col, inplace=True)
+            pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+        # Fill missing values
+        pivoted_df.fillna(fill_value, inplace=True)
+
+        # Fill missing weekly dates if specified
+        if fill_missing_weekly_dates:
             pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
 
         return pivoted_df
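Usage sketch for the reworked pivot_table above (illustrative only; the DataFrame, column names and import path are assumptions, not taken from the package):

    import pandas as pd
    from imsciences.datafunctions import dataprocessing  # assumed import path

    dp = dataprocessing()
    df = pd.DataFrame({
        "OBS": ["01/01/2024", "01/01/2024", "08/01/2024"],
        "Channel": ["TV", "Radio", "TV"],
        "Spend": [100.0, 50.0, 80.0],
    })
    # One row per date, one column per channel, summed spend in the cells
    wide = dp.pivot_table(df, index_col="OBS", columns="Channel", values_col="Spend", aggfunc="sum")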
@@ -983,7 +983,7 @@ class dataprocessing:
 
         return df
 
-    def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
+    def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
         """
         This function updates values in a specified column of the DataFrame based on a lookup dictionary.
         It first merges several columns into a new 'Merged' column, then uses this merged column to determine
@@ -1000,8 +1000,10 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The modified DataFrame with updated values in the specified column.
         """
+        # Create a merged column from specified columns
         df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
 
+        # Replace values in the specified column based on the lookup
         def replace_values(x):
             if x[col] == replacement_rows:
                 merged_value = x['Merged']
@@ -1009,10 +1011,14 @@ class dataprocessing:
                     return replacement_lookup_dict[merged_value]
             return x[col]
 
+        # Apply replacement logic
         df[output_column_name] = df.apply(replace_values, axis=1)
 
+        # Drop the intermediate 'Merged' column
+        df.drop(columns=['Merged'], inplace=True)
+
         return df
-
+
     def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
         """
         Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
@@ -1049,35 +1055,38 @@ class dataprocessing:
 
         return df_final
 
-    def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
+    def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
         """
         Changes a dataframe from wide to long format.
 
         Args:
             df (pandas.DataFrame): The DataFrame containing the data.
-            value_cols (list of str or str if only one):
-            variable_col_name (str, optional): Name of new
-            value_col_name (str, optional): Name of the new value column
+            value_cols (list of str or str if only one): List of column names to transform from several columns into one.
+            variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+            value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.
 
         Returns:
-            pandas.DataFrame
-
+            pandas.DataFrame: DataFrame transformed from wide to long format.
+
         Raises:
-            ValueError: If number of
+            ValueError: If the number of columns to depivot is less than 2.
         """
-
-        # Check length of value cols is greater than 1
+        # Check length of value_cols is greater than 1
         if len(value_cols) < 2:
             raise ValueError("Number of inputs in list must be greater than 1")
-
+
         # Find the columns that are not to be depivoted into one column
-        id_vars =
-
+        id_vars = [col for col in df.columns if col not in value_cols]  # Preserve column order in the DataFrame
+
         # Melt all columns chosen into one column
-        df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
+        df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+        # Sort column order to match expected output
+        ordered_columns = id_vars + [variable_col_name, value_col_name]
+        df_final = df_final[ordered_columns]
+
         return df_final
-
+
     def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
         """
         Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe
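The rewritten convert_df_wide_2_long above is essentially a wrapper over pandas.melt; a standalone sketch of the same call pattern (column names invented for illustration):

    import pandas as pd

    df = pd.DataFrame({"OBS": ["2024-01-01", "2024-01-08"], "TV": [1, 2], "Radio": [3, 4]})
    id_vars = [col for col in df.columns if col not in ["TV", "Radio"]]
    long_df = pd.melt(df, id_vars=id_vars, value_vars=["TV", "Radio"],
                      var_name="Stacked", value_name="Value")
    # Resulting columns: OBS, Stacked, Value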
@@ -1102,18 +1111,24 @@ class dataprocessing:
         Returns:
             pandas.DataFrame: Dataframe with manual changes added
         """
+
         # Raise type error if more than one col is supported
         if isinstance(col_to_change, list):
             raise TypeError("Col to change must be specified as a string, not a list")
-
+
         # Raises value error if input is invalid for change_in_existing_df_col
         if change_in_existing_df_col not in ["Yes", "No"]:
             raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
-
+
         # Raises value error if input is invalid for add_notes_col
         if add_notes not in ["Yes", "No"]:
             raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
 
+        # Validate filters_dict format
+        for col, cond in filters_dict.items():
+            if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+                raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
         # Create the filtered df by applying the conditions
         df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
 
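The new filters_dict validation in manually_edit_data above only checks that each value is a string with at least two whitespace-separated parts ('operator value'); the exact operator syntax is interpreted by filter_df_on_multiple_conditions, which is not shown in this diff. An assumed example of the accepted shape:

    filters_dict = {
        "Channel": "== 'TV'",   # column name -> "operator value"
        "Spend": ">= 100",
    }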
@@ -1122,7 +1137,7 @@ class dataprocessing:
         if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
             df = df.copy()
             df[new_col_to_change_name] = df[col_to_change]
-
+
         # Update the new cell in the chosen column
         df.loc[df_filtered.index, col_to_update] = new_value
 
@@ -1146,32 +1161,32 @@ class dataprocessing:
 
     def format_numbers_with_commas(self, df, decimal_length_chosen=2):
         """
-        Converts data in numerical format into numbers with commas and a chosen decimal place length
+        Converts data in numerical format into numbers with commas and a chosen decimal place length.
 
         Args:
             df (pandas.DataFrame): The DataFrame containing the data.
-            decimal_length_chosen (int, optional):
-
+            decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
         Returns:
-            pandas.DataFrame: The
+            pandas.DataFrame: The DataFrame with the chosen updated format.
         """
         def format_number_with_commas(x, decimal_length=decimal_length_chosen):
-            if
+            if pd.isna(x):  # Preserve None/NaN values
+                return pd.NA  # Explicitly normalize to pd.NA
+            elif isinstance(x, (int, float)):
                 if decimal_length is not None:
-                    format_str = "{:,.{}f}"
-
+                    format_str = f"{{:,.{decimal_length}f}}"
+                    return format_str.format(x)
                 else:
-
-                    return formatted_number
+                    return f"{x:,}"
             else:
                 return x  # Return unchanged if not a number
 
-
-
-        formatted_df = df.applymap(format_number_with_commas)
+        # Apply formatting column by column
+        formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)
 
         return formatted_df
-
+
     def filter_df_on_multiple_conditions(self, df, filters_dict):
         """
         Filter a dataframe based on mulitple conditions
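The f-string built in the new format_number_with_commas helper above expands to an ordinary format spec; for example:

    fmt = f"{{:,.{2}f}}"        # decimal_length = 2 -> "{:,.2f}"
    fmt.format(1234567.891)     # -> '1,234,567.89'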
@@ -1269,7 +1284,6 @@ class dataprocessing:
         """
 
         #This line removes zero values from given column
-
         return data_frame.loc[~(data_frame[column_to_filter] ==0)]
 
     def upgrade_outdated_packages(self):
@@ -1392,10 +1406,10 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
         """
-
+
         # If there is no date column
         if date_col is None:
-            df = df.
+            df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
 
             if add_total_dummy_col != 'No':
                 # Find max value of rows
@@ -1403,8 +1417,10 @@ class dataprocessing:
 
         # If there is a date column
         else:
-            # Create dummies
-            df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].
+            # Create dummies for all columns except the date column
+            df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
+                lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+            )
 
             if add_total_dummy_col != 'No':
                 # Find max value of rows
@@ -1427,7 +1443,6 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
         """
-
         if new_column is not None:
             # Create a new column for replacements
             df[new_column] = df[column]
@@ -1435,15 +1450,15 @@ class dataprocessing:
         else:
             # Modify the existing column
             temp_column = column
-
-        # Apply substring replacements
-        for old, new in replacements.items():
-            df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
-
+
         # Optionally convert to lowercase
         if to_lower:
             df[temp_column] = df[temp_column].str.lower()
-
+
+        # Apply substring replacements
+        for old, new in replacements.items():
+            df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
+
         return df
 
     def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
@@ -1458,11 +1473,11 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The DataFrame with an added total column.
         """
-
-
-            df[total_col_name] = df.drop(columns=[exclude_col]).sum(axis=1)
+        if exclude_col and exclude_col in df.columns:
+            # Ensure the column to exclude exists before dropping
+            df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
         else:
-            # Sum across all columns if
+            # Sum across all columns if no column is specified to exclude
             df[total_col_name] = df.sum(axis=1)
 
         return df
@@ -1502,7 +1517,7 @@ class dataprocessing:
         df[new_col_name] = df[column_name].apply(categorize_text)
         return df
 
-    def compare_overlap(self,df1, df2, date_col):
+    def compare_overlap(self, df1, df2, date_col):
         """
         Compare overlapping periods between two DataFrames and provide a summary of total differences.
 
@@ -1517,64 +1532,70 @@ class dataprocessing:
         # Ensure date columns are in datetime format
         df1[date_col] = pd.to_datetime(df1[date_col])
         df2[date_col] = pd.to_datetime(df2[date_col])
-
+
         # Determine the overlap period
         start_date = max(df1[date_col].min(), df2[date_col].min())
         end_date = min(df1[date_col].max(), df2[date_col].max())
-
-        # Filter
+
+        # Filter DataFrames to the overlapping period
         df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
         df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
-
-        # Merge the
+
+        # Merge the DataFrames on the date column
         merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
-
-        # Get
+
+        # Get common columns, excluding the date column
         common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
-
-        #
+
+        # Create a DataFrame for differences
+        diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+
         total_diff_list = []
-
-        # Create a DataFrame for the differences
-        diff_df = pd.DataFrame({date_col: merged_df[date_col]})  # Initialize diff_df with the date column
-
         for col in common_cols:
-            # Calculate the difference for each row
             diff_col = f'diff_{col}'
-            diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
-
-            #
+            diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']  # Corrected subtraction order
+
+            # Sum differences for the column
            total_diff = diff_df[diff_col].sum()
             total_diff_list.append({'Column': col, 'Total Difference': total_diff})
-
-        # Create
+
+        # Create summary DataFrame
         total_diff_df = pd.DataFrame(total_diff_list)
-
-        # Apply formatting to the numerical columns
-        float_format = "{:,.2f}".format  # Format to 2 decimal places with comma as thousand separator
-        diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
-        total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
-
+
         return diff_df, total_diff_df
-
-    def
+
+    def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
         """
-        Convert
+        Convert a DataFrame's date column so that each date is mapped back
+        to the 'week_commencing' day of the *current ISO week*.
 
         Args:
-            df (pandas.DataFrame): The DataFrame
-            date_col (str): The name of the date column
-            week_commencing (str
+            df (pandas.DataFrame): The DataFrame with date-based data.
+            date_col (str): The name of the date column.
+            week_commencing (str): The desired start of the week.
+                                   ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
+                                   Uses ISO day numbering (Mon=1, ..., Sun=7).
 
         Returns:
-            pandas.DataFrame:
+            pandas.DataFrame: Original DataFrame with an extra column
+                              'week_start_<week_commencing>' containing the
+                              start-of-week date for each row.
         """
-        #
-
-
+        # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
+        iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
+
+        target_day = iso_day_dict[week_commencing]
+
+        def map_to_week_start(date_val):
+            delta = (date_val.isoweekday() - target_day) % 7
+            return date_val - pd.Timedelta(days=delta)
+
+        # Apply the transformation
+        new_col = f"week_start_{week_commencing}"
+        df[new_col] = df[date_col].apply(map_to_week_start)
 
         return df
-
+
     def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
         """
         Plot various types of charts using Plotly.
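A worked example of the ISO-weekday arithmetic used by week_commencing_2_week_commencing_conversion_isoweekday above (dates invented for illustration):

    import pandas as pd

    target_day = 1                               # 'mon' in ISO numbering (Mon=1 ... Sun=7)
    d = pd.Timestamp("2024-05-15")               # a Wednesday, so d.isoweekday() == 3
    delta = (d.isoweekday() - target_day) % 7    # (3 - 1) % 7 == 2
    d - pd.Timedelta(days=delta)                 # -> Timestamp('2024-05-13'), that week's Monday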
@@ -1743,17 +1764,6 @@ class dataprocessing:
 ########################################################################################################################################
 ########################################################################################################################################
 
-
-
-
-
-
-
-
-
-
-
-
 ims_proc = dataprocessing()
 
 class datapull:
@@ -1764,46 +1774,46 @@ class datapull:
         print("\n1. pull_fred_data")
         print(" - Description: Get data from FRED by using series id tokens.")
         print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
-        print(" - Example: pull_fred_data('mon', ['GPDIC1'
+        print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
 
         print("\n2. pull_boe_data")
         print(" - Description: Fetch and process Bank of England interest rate data.")
         print(" - Usage: pull_boe_data(week_commencing)")
         print(" - Example: pull_boe_data('mon')")
 
-        print("\n3.
-        print(" - Description: Fetch and process time series data from the ONS API.")
-        print(" - Usage: pull_ons_data(series_list, week_commencing)")
-        print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
-
-        print("\n4. pull_oecd")
+        print("\n3. pull_oecd")
         print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
-        print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '
-        print(" - Example: pull_oecd('GBR', 'mon', '
+        print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')")
+        print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
 
-        print("\
+        print("\n4. get_google_mobility_data")
         print(" - Description: Fetch Google Mobility data for the specified country.")
         print(" - Usage: get_google_mobility_data(country, wc)")
         print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
 
-        print("\
+        print("\n5. pull_seasonality")
         print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
-        print(" - Usage:
-        print(" - Example:
+        print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
+        print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
 
-        print("\
+        print("\n6. pull_weather")
         print(" - Description: Fetch and process historical weather data for the specified country.")
         print(" - Usage: pull_weather(week_commencing, country)")
         print(" - Example: pull_weather('mon', 'GBR')")
+
+        print("\n7. pull_macro_ons_uk")
+        print(" - Description: Fetch and process time series data from the Beta ONS API.")
+        print(" - Usage: pull_macro_ons_uk(aditional_list, week_commencing, sector)")
+        print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
+
+        print("\n8. pull_yfinance")
+        print(" - Description: Fetch and process time series data from the Beta ONS API.")
+        print(" - Usage: pull_yfinance(tickers, week_start_day)")
+        print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
 
-        print("\n8. pull_covid_data")
-        print(" - Description: Get covid pandemic data for the country of interest.")
-        print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
-        print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")
-
 ############################################################### MACRO ##########################################################################
 
-    def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"
+    def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
         '''
         Parameters
         ----------
@@ -1812,16 +1822,12 @@ class datapull:
 
         series_id_list : list[str]
             provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
-            ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"
+            ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
 
         Returns
         ----------
         pd.DataFrame
             Return a data frame with FRED data according to the series IDs provided
-
-        Example
-        ----------
-        pull_fred_data("mon", ["GCEC1", "SP500"])
         '''
         # Fred API
         fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
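Usage sketch for pull_fred_data as declared above (assuming the class can be imported from imsciences.datafunctions; the call needs network access and uses the API key embedded in the module):

    from imsciences.datafunctions import datapull  # assumed import path

    dp = datapull()
    fred_df = dp.pull_fred_data(week_commencing="mon", series_id_list=["GPDIC1", "GCEC1"])
    # Weekly DataFrame keyed by week-commencing date, one column per FRED series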
@@ -1864,169 +1870,82 @@ class datapull:
 
         return fred_df_final
 
-    def pull_boe_data(self, week_commencing="mon", max_retries=
+    def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
         """
         Fetch and process Bank of England interest rate data.
 
         Args:
-            week_commencing (str): The starting day of the week for aggregation.
-                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-                Default is "
-            max_retries (int): Maximum number of retries to fetch data in case of failure. Default is
+            week_commencing (str): The starting day of the week for aggregation.
+                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                Default is "mon".
+            max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
             delay (int): Delay in seconds between retry attempts. Default is 5.
 
         Returns:
-            pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
-                The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+            pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                 and 'macro_boe_intr_rate' contains the average interest rate for the week.
         """
         # Week commencing dictionary
         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-        #
-        def fetch_data_with_retries(url, max_retries, delay):
-            for attempt in range(max_retries):
-                try:
-                    html_table = pd.read_html(url)[0]
-                    return html_table
-                except Exception as e:
-                    print(f"Attempt {attempt + 1} failed: {e}")
-                    if attempt < max_retries - 1:
-                        time.sleep(delay)
-                    else:
-                        raise
-
-        # Import HTML data from Bank of England rate
+
+        # URL of the Bank of England data page
         url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
-
-
-
+
+        # Retry logic for HTTP request
+        for attempt in range(max_retries):
+            try:
+                # Set up headers to mimic a browser request
+                headers = {
+                    "User-Agent": (
+                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/91.0.4472.124 Safari/537.36"
+                    )
+                }
+                response = requests.get(url, headers=headers)
+                response.raise_for_status()  # Raise an exception for HTTP errors
+                break
+            except requests.exceptions.RequestException as e:
+                print(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(delay)
+                else:
+                    raise
+
+        # Parse the HTML page
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Find the table on the page
+        table = soup.find("table")  # Locate the first table
+        table_html = str(table)  # Convert table to string
+        df = pd.read_html(StringIO(table_html))[0]  # Use StringIO to wrap the table HTML
+
+        # Rename and clean up columns
         df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
-
-        # Change date column to datetime and find the corresponding week to the date
         df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
-        df.sort_values("OBS",
-
-        # Create a daily date range and find the week commencing for that day
-        date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
-        df_daily = pd.DataFrame(date_range, columns=["OBS"])
-
-        # Adjust each date to the specified week commencing day
-        df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-
-        # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
-        df_final = df_daily.merge(df, on='OBS', how="left")
-        df_final["macro_boe_intr_rate"].ffill(inplace=True)
-
-        # Group by the week start date and get the mean of the interest rates for each week
-        df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
-
-        df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
-        df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
-
-        return df_final
+        df.sort_values("OBS", inplace=True)
 
-
-        """
-
+        # Create a daily date range
+        date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
+        df_daily = pd.DataFrame(date_range, columns=["OBS"])
 
-
-
-
-
-                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+        # Adjust each date to the specified week commencing day
+        df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+            lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+        )
 
-
-
-
-        """
-
-        def parse_quarter(date_str):
-            """Parses a string in 'YYYY Q#' format into a datetime object."""
-            year, quarter = date_str.split(' ')
-            quarter_number = int(quarter[1])
-            month = (quarter_number - 1) * 3 + 1
-            return pd.Timestamp(f"{year}-{month:02d}-01")
+        # Merge and forward-fill missing rates
+        df_daily = df_daily.merge(df, on="OBS", how="left")
+        df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
 
-        #
-
-
-
-        # Keep track of the renamed value columns
-        value_columns = []
+        # Group by week commencing and calculate the average rate
+        df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
+        df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
+        df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
 
-
-            series_id = series['series_id']
-            dataset_id = series['dataset_id']
-
-            # Construct the URL for data
-            data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
-
-            # Make the request to the ONS API for data
-            data_response = requests.get(data_url)
-
-            # Check if the request was successful
-            if data_response.status_code != 200:
-                print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
-                continue
-
-            # Parse the JSON response for data
-            data = data_response.json()
-
-            # Attempt to extract the name of the time series from the data response
-            series_name = data.get('description', {}).get('title', 'Value')
-
-            # Determine the most granular time series data available
-            if 'months' in data and data['months']:
-                time_series_data = data['months']
-            elif 'quarters' in data and data['quarters']:
-                time_series_data = data['quarters']
-            elif 'years' in data and data['years']:
-                time_series_data = data['years']
-            else:
-                print("No time series data found in the response")
-                continue
-
-            # Create a DataFrame from the time series data
-            df = pd.DataFrame(time_series_data)
-
-            # Handle different frequencies in the data
-            if 'date' in df.columns:
-                if any(df['date'].str.contains('Q')):
-                    df['date'] = df['date'].apply(parse_quarter)
-                else:
-                    df['date'] = pd.to_datetime(df['date'])
-
-            df = df.rename(columns={'date': 'OBS', 'value': series_name})
-
-            # Rename the value column
-            new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
-            df = df.rename(columns={series_name: new_col_name})
-
-            # Track the renamed value column
-            value_columns.append(new_col_name)
-
-            # Merge the data based on the observation date
-            daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
-
-        # Ensure columns are numeric
-        for col in value_columns:
-            if col in daily_df.columns:
-                daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
-            else:
-                print(f"Column {col} not found in daily_df")
-
-        # Aggregate results by week
-        ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
-                                                           date_column="OBS",
-                                                           group_columns=[],
-                                                           sum_columns=value_columns,
-                                                           wc=week_commencing,
-                                                           aggregation="average")
-
-        return ons_df_final
+        return df_final
 
-    def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "
+    def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
         """
         Fetch and process time series data from the OECD API.
 
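Usage sketch for the rewritten pull_boe_data above (illustrative only; requires network access plus the bs4 and HTML-parsing dependencies that pd.read_html relies on):

    from imsciences.datafunctions import datapull  # assumed import path

    dp = datapull()
    boe = dp.pull_boe_data(week_commencing="mon")   # retries up to 5 times, 5 seconds apart
    # boe has 'OBS' (week commencing, dd/mm/yyyy) and 'macro_boe_intr_rate' (weekly mean rate)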
@@ -2104,7 +2023,7 @@ class datapull:
                     break
 
             # get data for the next variable if url doesn't exist
-            if url_test
+            if url_test is False:
                 continue
 
             root = ET.fromstring(data_response.content)
@@ -2169,7 +2088,7 @@ class datapull:
 
         return oecd_df_final
 
-    def get_google_mobility_data(self, country
+    def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
         """
         Fetch Google Mobility data for the specified country.
 
@@ -2189,7 +2108,7 @@ class datapull:
 
         # Load the CSV file into a pandas DataFrame
         csv_data = StringIO(response.text)
-        df = pd.read_csv(csv_data)
+        df = pd.read_csv(csv_data, low_memory=False)
 
         # Filter the DataFrame for the specified country
         country_df = df[df['country_region'] == country]
@@ -2203,12 +2122,12 @@ class datapull:
 
 ############################################################### Seasonality ##########################################################################
 
-    def
+    def pull_seasonality(self, week_commencing, start_date, countries):
         # Week commencing dictionary
         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
 
-        # Create daily date range dataframe
-        date_range = pd.date_range(
+        # Create daily date range dataframe starting from start_date
+        date_range = pd.date_range(start=pd.to_datetime(start_date), end=datetime.today(), freq="d")
         df_daily = pd.DataFrame(date_range, columns=["Date"])
 
         # Create weekly date range dataframe
@@ -2218,7 +2137,7 @@ class datapull:
 
         df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
         df_weekly_start.set_index("Date", inplace=True)
-
+
         # Create individual weekly dummies
         dummy_columns = {}
         for i in range(len(df_weekly_start)):
@@ -2228,84 +2147,59 @@ class datapull:
 
         df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
         df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-
-        #
+
+        # Add public holidays for each country and holiday type
+        for country in countries:
+            country_holidays = holidays.CountryHoliday(country, years=range(int(start_date[:4]), datetime.today().year + 1))
+            df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(lambda x: 1 if x in country_holidays else 0)
+
+            # Extract specific holidays
+            for date, name in country_holidays.items():
+                col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
+                if col_name not in df_daily.columns:
+                    df_daily[col_name] = 0
+                df_daily.loc[df_daily["Date"] == pd.Timestamp(date), col_name] = 1
+
+        # Map daily holidays to weekly aggregation
+        df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+        df_holidays = df_daily.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
+        df_holidays.set_index("Date", inplace=True)
+
+        # Create monthly dummies (separately from holidays)
         df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
-        df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
+        df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"], dtype=int)
         df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
         df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-
         df_monthly_dummies.set_index("Date", inplace=True)
-
-
-
-
-
+
+        # Divide only the monthly dummy columns by 7 (exclude holiday-related columns)
+        monthly_cols = [col for col in df_monthly_dummies.columns if not col.startswith("seas_holiday") and not col.startswith("seas_")]
+        df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
+
+        # Merge weekly dummies, monthly dummies, and holidays
+        df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)  # Combine weekly and monthly first
+        df_combined = pd.concat([df_combined, df_holidays], axis=1)  # Add holidays separately
+
+        # Drop duplicate columns if any exist (this ensures holidays are not duplicated)
+        df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
+
         # Create weekly dummies
         df_combined.reset_index(inplace=True)
         df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
-        df_combined = pd.get_dummies(df_combined, prefix="
+        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)
 
         # Create yearly dummies
         df_combined["Year"] = df_combined["Date"].dt.year
-        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
+        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
 
         # Add constant
         df_combined["Constant"] = 1
 
         # Add trend
         df_combined["Trend"] = df_combined.index + 1
-
-        # Set date as index
-        df_combined.set_index("Date", inplace=True)
-
-        # Create COVID lockdown dummies
-        lockdown_periods = [
-            # Lockdown 1
-            ("2020-03-23", "2020-05-24"),
-            # Lockdown 2
-            ("2020-11-05", "2020-12-02"),
-            # Lockdown 3
-            ("2021-01-04", "2021-03-08")
-        ]
-
-        df_covid = pd.DataFrame(date_range, columns=["Date"])
-        df_covid["national_lockdown"] = 0
-
-        for start, end in lockdown_periods:
-            df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
-
-        df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-        df_covid.drop("Date", axis=1, inplace=True)
-        df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
-        df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
-        df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)
-
-        df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
-        df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
-        df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
-
-        df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
-        df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)
-
-        df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
-        df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
-        df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)
-
-        df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
-        df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)
 
-        df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
-        df_final_covid.reset_index(inplace=True)
-        df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
-
         # Create seasonal indicators for the last day and last Friday of the month
-
-        max_date = datetime.today().strftime('%Y-%m-%d')
-        date_range_seas = pd.date_range(start=min_date, end=max_date)
-
-        df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
-        df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
+        df_combined['seas_last_day_of_month'] = df_combined["Date"].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
 
         def is_last_friday(date):
             last_day_of_month = date.to_period('M').to_timestamp('M')
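Usage sketch for the holiday-aware pull_seasonality above (country codes follow the holidays package, e.g. 'GB', 'US'; illustrative only):

    from imsciences.datafunctions import datapull  # assumed import path

    dp = datapull()
    seas = dp.pull_seasonality(week_commencing="mon", start_date="2020-01-01", countries=["GB", "US"])
    # Weekly rows keyed by 'OBS' with seas_* week/month/year/holiday dummies plus Constant and Trend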
@@ -2317,28 +2211,19 @@ class datapull:
             last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
             return 1 if date == last_friday else 0
 
-
+        df_combined['seas_last_friday_of_month'] = df_combined["Date"].apply(is_last_friday)
 
-
-
-        df_seas.set_index("Date", inplace=True)
-
-        # Combine all dataframes
-        df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
-        df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
-        df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
-
-        # Fill any NaN values with 0
-        df_final_combined.fillna(0, inplace=True)
+        # Rename Date to OBS
+        df_combined.rename(columns={"Date": "OBS"}, inplace=True)
 
-        return
+        return df_combined
 
     def pull_weather(self, week_commencing, country) -> pd.DataFrame:
         import pandas as pd
-        import urllib.request
+        import urllib.request  # noqa: F811
         from datetime import datetime
         import requests
-        from geopy.geocoders import Nominatim
+        from geopy.geocoders import Nominatim  # noqa: F811
 
         # Week commencing dictionary
         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
@@ -2936,35 +2821,238 @@ class datapull:
 
         return final_weather
 
-    def
+    def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
         """
-
+        Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+        aggregates it to weekly averages, and renames variables based on specified rules.
 
-
-
-
-
-
-
-
-
+        Parameters:
+            cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+            week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
+            sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+
+        Returns:
+            pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
+                          and all series as renamed columns.
+        """
+        # Define CDIDs for sectors and defaults
+        sector_cdids = {
+            "fast_food": ["L7TD", "L78Q", "DOAD"],
+            "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
+        }
+
+        default_cdids = sector_cdids["default"]
+        sector_specific_cdids = sector_cdids.get(sector, [])
+        standard_cdids = list(set(default_cdids + sector_specific_cdids))  # Avoid duplicates
+
+        # Combine standard CDIDs and additional CDIDs
+        if cdid_list is None:
+            cdid_list = []
+        cdid_list = list(set(standard_cdids + cdid_list))  # Avoid duplicates
+
+        base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
+        base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
+        combined_df = pd.DataFrame()
+
+        # Map week start day to pandas weekday convention
+        days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+        if week_start_day not in days_map:
+            raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+        week_start = days_map[week_start_day]
+
+        for cdid in cdid_list:
+            try:
+                # Search for the series
+                search_url = f"{base_search_url}{cdid}"
+                search_response = requests.get(search_url)
+                search_response.raise_for_status()
+                search_data = search_response.json()
+
+                items = search_data.get("items", [])
+                if not items:
+                    print(f"No data found for CDID: {cdid}")
+                    continue
+
+                # Extract series name and latest release URI
+                series_name = items[0].get("title", f"Series_{cdid}")
+                latest_date = max(
+                    datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                    for item in items if "release_date" in item
+                )
+                latest_uri = next(
+                    item["uri"] for item in items
+                    if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
+                )
+
+                # Fetch the dataset
+                data_url = f"{base_data_url}{latest_uri}"
+                data_response = requests.get(data_url)
+                data_response.raise_for_status()
+                data_json = data_response.json()
+
+                # Detect the frequency and process accordingly
+                if "months" in data_json and data_json["months"]:
+                    frequency_key = "months"
+                elif "quarters" in data_json and data_json["quarters"]:
+                    frequency_key = "quarters"
+                elif "years" in data_json and data_json["years"]:
+                    frequency_key = "years"
+                else:
+                    print(f"Unsupported frequency or no data for CDID: {cdid}")
+                    continue
+
+                # Prepare the DataFrame
+                df = pd.DataFrame(data_json[frequency_key])
+
+                # Parse the 'date' field based on frequency
+                if frequency_key == "months":
+                    df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+                elif frequency_key == "quarters":
+                    def parse_quarter(quarter_str):
+                        year, qtr = quarter_str.split(" Q")
+                        month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                        return datetime(int(year), month, 1)
+                    df["date"] = df["date"].apply(parse_quarter)
+                elif frequency_key == "years":
+                    df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+
+                df["value"] = pd.to_numeric(df["value"], errors="coerce")
+                df.rename(columns={"value": series_name}, inplace=True)
+
+                # Combine data
+                df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+                if combined_df.empty:
+                    combined_df = df
+                else:
+                    combined_df = pd.merge(combined_df, df, on="date", how="outer")
+
+            except requests.exceptions.RequestException as e:
+                print(f"Error fetching data for CDID {cdid}: {e}")
+            except (KeyError, ValueError) as e:
+                print(f"Error processing data for CDID {cdid}: {e}")
+
+        if not combined_df.empty:
+            min_date = combined_df["date"].min()
+            max_date = datetime.today()
+            date_range = pd.date_range(start=min_date, end=max_date, freq='D')
+            daily_df = pd.DataFrame(date_range, columns=['date'])
+            daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
+            daily_df = daily_df.ffill()
+
+            # Aggregate to weekly frequency
+            daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+            weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
+
+            def clean_column_name(name):
+                name = re.sub(r"\(.*?\)", "", name)
+                name = re.split(r":", name)[0]
+                name = re.sub(r"\d+", "", name)
+                name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+                name = re.sub(r"[^\w\s]", "", name)
+                name = name.replace(" ", "_")
+                name = re.sub(r"_+", "_", name)
+                name = name.rstrip("_")
+                return f"macro_{name.lower()}_uk"
+
+            weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
+            weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+
+            weekly_df = weekly_df.fillna(0)
+
+            return weekly_df
+        else:
+            print("No data available to process.")
+            return pd.DataFrame()
+
+    def pull_yfinance(self, tickers=None, week_start_day="mon"):
+        """
+        Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
+        aggregates it to weekly averages, and renames variables.
+
+        Parameters:
+            tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
+            week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
 
         Returns:
-            pd.DataFrame: A DataFrame
-
+            pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
+            and aggregated stock data for the specified tickers, with NaN values filled with 0.
         """
+        # Define default tickers
+        default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
 
-
+        # Combine default tickers with additional ones
+        if tickers is None:
+            tickers = []
+        tickers = list(set(default_tickers + tickers))  # Ensure no duplicates
+
+        # Automatically set end_date to today
+        end_date = datetime.today().strftime("%Y-%m-%d")
+
+        # Mapping week start day to pandas weekday convention
+        days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+        if week_start_day not in days_map:
+            raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+        week_start = days_map[week_start_day]
 
-
-
-
+        # Fetch data for all tickers without specifying a start date to get all available data
+        data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
+
+        # Process the data
+        combined_df = pd.DataFrame()
+        for ticker in tickers:
+            try:
+                # Extract the ticker's data
+                ticker_data = data[ticker] if len(tickers) > 1 else data
+                ticker_data = ticker_data.reset_index()
+
+                # Ensure necessary columns are present
+                if "Close" not in ticker_data.columns:
+                    raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
+
+                # Keep only relevant columns
+                ticker_data = ticker_data[["Date", "Close"]]
+                ticker_data.rename(columns={"Close": ticker}, inplace=True)
 
-
+                # Merge data
+                if combined_df.empty:
+                    combined_df = ticker_data
+                else:
+                    combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")
 
-
+            except KeyError:
+                print(f"Data for ticker {ticker} not available.")
+            except Exception as e:
+                print(f"Error processing ticker {ticker}: {e}")
 
-
+        if not combined_df.empty:
+            # Convert to daily frequency
+            combined_df["Date"] = pd.to_datetime(combined_df["Date"])
+            combined_df.set_index("Date", inplace=True)
 
-
-
+            # Fill missing dates
+            min_date = combined_df.index.min()
+            max_date = combined_df.index.max()
+            daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
+            combined_df = combined_df.reindex(daily_index)
+            combined_df.index.name = "Date"
+            combined_df = combined_df.ffill()
+
+            # Aggregate to weekly frequency
+            combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
+            weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
+
+            # Fill NaN values with 0
+            weekly_df = weekly_df.fillna(0)
+
+            # Clean column names
+            def clean_column_name(name):
+                name = re.sub(r"[^\w\s]", "", name)
+                return f"macro_{name.lower()}"
+
+            weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
+
+            return weekly_df
+
+        else:
+            print("No data available to process.")
+            return pd.DataFrame()