imsciences 0.6.3.1__py3-none-any.whl → 0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,22 +4,18 @@ import os
  import plotly.express as px
  import plotly.graph_objs as go
  import numpy as np
- import datetime
  import re
- import pandas as pd
  from fredapi import Fred
  import time
- from datetime import datetime,timedelta
- from cif import cif
+ from datetime import datetime, timedelta
  from io import StringIO
- import urllib
- import requests_cache
- import urllib.request
  import requests
- from geopy.geocoders import Nominatim
  import subprocess
  import json
  import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup
+ import yfinance as yf
+ import holidays

  class dataprocessing:

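The import changes track the feature changes below: `cif`, `requests_cache`, the bare `urllib` imports and the module-level `geopy` import are dropped, while `bs4`, `yfinance` and `holidays` arrive to support the rewritten `pull_boe_data` and the new `pull_seasonality` and `pull_yfinance` methods. A quick way to confirm the new third-party dependencies resolve in your environment (a sketch; the module names are inferred from the import statements above):

    import importlib.util

    for module in ("bs4", "yfinance", "holidays"):
        found = importlib.util.find_spec(module) is not None
        print(f"{module}: {'ok' if found else 'missing'}")
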
@@ -391,7 +387,7 @@ class dataprocessing:
  # Divide each numeric value by the number of days in the month
  for col in df.columns:
      if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-         if divide == True:
+         if divide is True:
              daily_row[col] = row[col] / num_days
          else:
              daily_row[col] = row[col]
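One behavioral nuance of this lint-driven change: `divide == True` is satisfied by any value that compares equal to `True` (for example `1` or `numpy.True_`), whereas `divide is True` only matches the bool literal, so a caller passing `divide=1` now falls through to the `else` branch. Illustration:

    divide = 1               # truthy and equal to True, but not the literal True
    print(divide == True)    # True  -> 0.6.3.1 divided by num_days
    print(divide is True)    # False -> 0.8 copies the value unchanged
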
@@ -678,7 +674,7 @@ class dataprocessing:

      return combined_df

- def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False,fill_missing_weekly_dates=False,week_commencing='W-MON'):
+ def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
      """
      Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

@@ -701,53 +697,57 @@ class dataprocessing:
      pandas.DataFrame: The pivot table specified
      """

-     # Create the filtered df by applying the conditions
-     if filters_dict is None:
-         df_filtered = df
-     else:
+     # Validate inputs
+     if index_col not in df.columns:
+         raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+     if columns not in df.columns:
+         raise ValueError(f"columns '{columns}' not found in DataFrame.")
+     if values_col not in df.columns:
+         raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+     # Apply filters if provided
+     if filters_dict:
          df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-     # Ensure index column is in datetime format for proper sorting
-     df_filtered = df_filtered.copy()
-
-     # If datetime transformation is needed
+     else:
+         df_filtered = df.copy()
+
+     # Ensure index column is in datetime format if needed
      if datetime_trans_needed:
          df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
+
      # Create the pivot table
-     pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)
-
-     # Handling MultiIndex columns if present, making them a flat structure
-     if not reverse_header_order:
-         if isinstance(pivoted_df.columns, pd.MultiIndex):
-             pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
-         else:
-             pivoted_df.columns = pivoted_df.columns.map(str)
+     pivoted_df = df_filtered.pivot_table(
+         index=index_col,
+         columns=columns,
+         values=values_col,
+         aggfunc=aggfunc,
+         margins=margins,
+         margins_name=margins_name,
+     )
+
+     # Handle column headers
+     if isinstance(pivoted_df.columns, pd.MultiIndex):
+         pivoted_df.columns = [
+             "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+             for col in pivoted_df.columns.values
+         ]
      else:
-         if isinstance(pivoted_df.columns, pd.MultiIndex):
-             # Reorder the MultiIndex columns
-             pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
-         else:
-             pivoted_df.columns = pivoted_df.columns.map(str)
-             # Reverse the order for single index columns
-             pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
-
-     # Reset the pivot before returning
-     pivoted_df = pivoted_df.reset_index()
-
-     # Sort by index column from oldest to newest
+         pivoted_df.columns = pivoted_df.columns.map(str)
+
+     # Reset the index
+     pivoted_df.reset_index(inplace=True)
+
+     # Handle sorting and formatting of index column
      if datetime_trans_needed:
-         pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col])  # Ensure sorting works correctly
-         pivoted_df = pivoted_df.sort_values(by=index_col)
-
-         # Convert index column back to a string in YYYY-MM-DD format for display purposes
-         pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
-
-     # Fill in any NaNs
-     pivoted_df = pivoted_df.fillna(fill_value)
-
-     # If there is a need to fill in missing weeks
-     if fill_missing_weekly_dates == True:
+         pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+         pivoted_df.sort_values(by=index_col, inplace=True)
+         pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+     # Fill missing values
+     pivoted_df.fillna(fill_value, inplace=True)
+
+     # Fill missing weekly dates if specified
+     if fill_missing_weekly_dates:
          pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)

      return pivoted_df
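A minimal sketch of the new signature (the DataFrame and column names are invented for illustration; `dataprocessing` is instantiated the same way the module itself does further down with `ims_proc = dataprocessing()`):

    import pandas as pd

    dp = dataprocessing()
    df = pd.DataFrame({
        "OBS": ["01/01/2024", "01/01/2024", "08/01/2024"],
        "channel": ["tv", "radio", "tv"],
        "spend": [100.0, 50.0, 80.0],
    })

    # date_format is new in 0.8; 0.6.3.1 always rendered '%Y-%m-%d'
    pivot = dp.pivot_table(df, index_col="OBS", columns="channel",
                           values_col="spend", date_format="%d/%m/%Y")

One caveat in the new header handling: with `reverse_header_order=True` on a MultiIndex, `reversed(map(str, col))` raises a TypeError because map objects are not reversible, so that path appears to need `reversed(list(map(str, col)))`.
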
@@ -983,7 +983,7 @@ class dataprocessing:

      return df

- def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
+ def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
      """
      This function updates values in a specified column of the DataFrame based on a lookup dictionary.
      It first merges several columns into a new 'Merged' column, then uses this merged column to determine
@@ -1000,8 +1000,10 @@ class dataprocessing:
      Returns:
          pd.DataFrame: The modified DataFrame with updated values in the specified column.
      """
+     # Create a merged column from specified columns
      df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)

+     # Replace values in the specified column based on the lookup
      def replace_values(x):
          if x[col] == replacement_rows:
              merged_value = x['Merged']
@@ -1009,10 +1011,14 @@ class dataprocessing:
                  return replacement_lookup_dict[merged_value]
          return x[col]

+     # Apply replacement logic
      df[output_column_name] = df.apply(replace_values, axis=1)

+     # Drop the intermediate 'Merged' column
+     df.drop(columns=['Merged'], inplace=True)
+
      return df
-
+
  def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
      """
      Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
@@ -1049,35 +1055,38 @@ class dataprocessing:

      return df_final

- def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
+ def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
      """
      Changes a dataframe from wide to long format.

      Args:
          df (pandas.DataFrame): The DataFrame containing the data.
-         value_cols (list of str or str if only one): list of column names which are to be transformed from several columns into one.
-         variable_col_name (str, optional): Name of new variables column, which contains the names of the columns which have been stacked into one. Defaults to 'Stacked'.
-         value_col_name (str, optional): Name of the new value column which contains all the data from the stacked columns. Defaults to 'Value'.
+         value_cols (list of str or str if only one): List of column names to transform from several columns into one.
+         variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+         value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.

      Returns:
-         pandas.DataFrame:: Returns dataframe transformed from long to wide.
-
+         pandas.DataFrame: DataFrame transformed from wide to long format.
+
      Raises:
-         ValueError: If number of column names to be depivoted is less than 2, then this function is not neccesary.
+         ValueError: If the number of columns to depivot is less than 2.
      """
-
-     # Check length of value cols is greater than 1
+     # Check length of value_cols is greater than 1
      if len(value_cols) < 2:
          raise ValueError("Number of inputs in list must be greater than 1")
-
+
      # Find the columns that are not to be depivoted into one column
-     id_vars = list(set(df.columns.tolist()) - set(value_cols))
-
+     id_vars = [col for col in df.columns if col not in value_cols]  # Preserve column order in the DataFrame
+
      # Melt all columns chosen into one column
-     df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
+     df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+     # Sort column order to match expected output
+     ordered_columns = id_vars + [variable_col_name, value_col_name]
+     df_final = df_final[ordered_columns]
+
      return df_final
-
+
  def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
      """
      Allows the capability to manually update any cell in a dataframe by applying filters and choosing a column to edit
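The switch from a set to a list comprehension for `id_vars` is more than style: sets do not preserve order, so 0.6.3.1 could emit the identifier columns in a different order from run to run, while 0.8 keeps the DataFrame's own column order and then pins the final layout explicitly. A small sketch (reusing the `dp` instance from the earlier example):

    import pandas as pd

    df = pd.DataFrame({"OBS": ["2024-01-01"], "tv": [100], "radio": [50]})
    long_df = dp.convert_df_wide_2_long(df, value_cols=["tv", "radio"])
    print(long_df.columns.tolist())   # ['OBS', 'Stacked', 'Value'], deterministically
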
@@ -1102,18 +1111,24 @@ class dataprocessing:
      Returns:
          pandas.DataFrame: Dataframe with manual changes added
      """
+
      # Raise type error if more than one col is supplied
      if isinstance(col_to_change, list):
          raise TypeError("Col to change must be specified as a string, not a list")
-
+
      # Raises value error if input is invalid for change_in_existing_df_col
      if change_in_existing_df_col not in ["Yes", "No"]:
          raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
-
+
      # Raises value error if input is invalid for add_notes_col
      if add_notes not in ["Yes", "No"]:
          raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")

+     # Validate filters_dict format
+     for col, cond in filters_dict.items():
+         if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+             raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
      # Create the filtered df by applying the conditions
      df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
@@ -1122,7 +1137,7 @@ class dataprocessing:
      if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
          df = df.copy()
          df[new_col_to_change_name] = df[col_to_change]
-
+
      # Update the new cell in the chosen column
      df.loc[df_filtered.index, col_to_update] = new_value

@@ -1146,32 +1161,32 @@ class dataprocessing:

  def format_numbers_with_commas(self, df, decimal_length_chosen=2):
      """
-     Converts data in numerical format into numbers with commas and a chosen decimal place length
+     Converts data in numerical format into numbers with commas and a chosen decimal place length.

      Args:
          df (pandas.DataFrame): The DataFrame containing the data.
-         decimal_length_chosen (int, optional): _description_. Defaults to 2.
-
+         decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
      Returns:
-         pandas.DataFrame: The dataframe with the chosen updated format
+         pandas.DataFrame: The DataFrame with the chosen updated format.
      """
      def format_number_with_commas(x, decimal_length=decimal_length_chosen):
-         if isinstance(x, (int, float)):
+         if pd.isna(x):  # Preserve None/NaN values
+             return pd.NA  # Explicitly normalize to pd.NA
+         elif isinstance(x, (int, float)):
              if decimal_length is not None:
-                 format_str = "{:,.{}f}".format(x, decimal_length)
-                 formatted_number = format_str.format(x)
+                 format_str = f"{{:,.{decimal_length}f}}"
+                 return format_str.format(x)
              else:
-                 formatted_number = "{:,}".format(x)
-                 return formatted_number
+                 return f"{x:,}"
          else:
              return x  # Return unchanged if not a number

-
-     # Apply the function across several columns using applymap()
-     formatted_df = df.applymap(format_number_with_commas)
+     # Apply formatting column by column
+     formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)

      return formatted_df
-
+
  def filter_df_on_multiple_conditions(self, df, filters_dict):
      """
      Filter a dataframe based on multiple conditions
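The move from `df.applymap(...)` to per-column `Series.map` tracks pandas, which deprecated `DataFrame.applymap` in 2.1. The two spellings are element-wise equivalent; a sketch:

    import pandas as pd

    df = pd.DataFrame({"a": [1234.5, None]})
    out = df.apply(lambda col: col.map(lambda x: x if pd.isna(x) else f"{x:,.2f}"))
    print(out)   # '1,234.50', with the missing value preserved
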
@@ -1269,7 +1284,6 @@ class dataprocessing:
      """

      #This line removes zero values from given column
-
      return data_frame.loc[~(data_frame[column_to_filter] ==0)]

  def upgrade_outdated_packages(self):
@@ -1392,10 +1406,10 @@ class dataprocessing:
      Returns:
          pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
      """
-
+
      # If there is no date column
      if date_col is None:
-         df = df.applymap(lambda x: 1 if x > dummy_threshold else 0)
+         df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))

          if add_total_dummy_col != 'No':
              # Find max value of rows
@@ -1403,8 +1417,10 @@ class dataprocessing:

      # If there is a date column
      else:
-         # Create dummies
-         df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].applymap(lambda x: 1 if x > dummy_threshold else 0)
+         # Create dummies for all columns except the date column
+         df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
+             lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+         )

          if add_total_dummy_col != 'No':
              # Find max value of rows
@@ -1427,7 +1443,6 @@ class dataprocessing:
      Returns:
          pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
      """
-
      if new_column is not None:
          # Create a new column for replacements
          df[new_column] = df[column]
@@ -1435,15 +1450,15 @@ class dataprocessing:
      else:
          # Modify the existing column
          temp_column = column
-
-     # Apply substring replacements
-     for old, new in replacements.items():
-         df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
-
+
      # Optionally convert to lowercase
      if to_lower:
          df[temp_column] = df[temp_column].str.lower()
-
+
+     # Apply substring replacements
+     for old, new in replacements.items():
+         df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
+
      return df

  def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
  def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
@@ -1458,11 +1473,11 @@ class dataprocessing:
1458
1473
  Returns:
1459
1474
  pd.DataFrame: The DataFrame with an added total column.
1460
1475
  """
1461
- # If exclude_col is provided, drop that column before summing
1462
- if exclude_col:
1463
- df[total_col_name] = df.drop(columns=[exclude_col]).sum(axis=1)
1476
+ if exclude_col and exclude_col in df.columns:
1477
+ # Ensure the column to exclude exists before dropping
1478
+ df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
1464
1479
  else:
1465
- # Sum across all columns if exclude_col is not provided
1480
+ # Sum across all columns if no column is specified to exclude
1466
1481
  df[total_col_name] = df.sum(axis=1)
1467
1482
 
1468
1483
  return df
@@ -1502,7 +1517,7 @@ class dataprocessing:
      df[new_col_name] = df[column_name].apply(categorize_text)
      return df

- def compare_overlap(self,df1, df2, date_col):
+ def compare_overlap(self, df1, df2, date_col):
      """
      Compare overlapping periods between two DataFrames and provide a summary of total differences.

@@ -1517,64 +1532,70 @@ class dataprocessing:
      # Ensure date columns are in datetime format
      df1[date_col] = pd.to_datetime(df1[date_col])
      df2[date_col] = pd.to_datetime(df2[date_col])
-
+
      # Determine the overlap period
      start_date = max(df1[date_col].min(), df2[date_col].min())
      end_date = min(df1[date_col].max(), df2[date_col].max())
-
-     # Filter dataframes to the overlapping period
+
+     # Filter DataFrames to the overlapping period
      df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
      df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
-
-     # Merge the dataframes on the date column to align data for comparison
+
+     # Merge the DataFrames on the date column
      merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
-
-     # Get the common columns between the two DataFrames, excluding the date column
+
+     # Get common columns, excluding the date column
      common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
-
-     # Initialize a list to collect total differences for each column
+
+     # Create a DataFrame for differences
+     diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+
      total_diff_list = []
-
-     # Create a DataFrame for the differences
-     diff_df = pd.DataFrame({date_col: merged_df[date_col]})  # Initialize diff_df with the date column
-
      for col in common_cols:
-         # Calculate the difference for each row
          diff_col = f'diff_{col}'
-         diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
-
-         # Calculate the total difference for the column and add it to the list
+         diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']  # Corrected subtraction order
+
+         # Sum differences for the column
          total_diff = diff_df[diff_col].sum()
          total_diff_list.append({'Column': col, 'Total Difference': total_diff})
-
-     # Create a DataFrame for the summary of total differences
+
+     # Create summary DataFrame
      total_diff_df = pd.DataFrame(total_diff_list)
-
-     # Apply formatting to the numerical columns
-     float_format = "{:,.2f}".format  # Format to 2 decimal places with comma as thousand separator
-     diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
-     total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
-
+
      return diff_df, total_diff_df
-
- def week_commencing_2_week_commencing_conversion(self,df,date_col,week_commencing='sun'):
+
+ def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
      """
-     Convert week commencing column in a DataFrame to the start of another day specified.
+     Convert a DataFrame's date column so that each date is mapped back
+     to the 'week_commencing' day of the *current ISO week*.

      Args:
-         df (pandas.DataFrame): The DataFrame containing the date-based data.
-         date_col (str): The name of the date column in the DataFrame.
-         week_commencing (str, optional): The day of the week that the week starts on ('sun' for Sunday, 'mon' for Monday, etc.). Defaults to 'sun'.
+         df (pandas.DataFrame): The DataFrame with date-based data.
+         date_col (str): The name of the date column.
+         week_commencing (str): The desired start of the week.
+                                ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
+                                Uses ISO day numbering (Mon=1, ..., Sun=7).

      Returns:
-         pandas.DataFrame: The original DataFrame with an additional column indicating the start of the week.
+         pandas.DataFrame: Original DataFrame with an extra column
+                           'week_start_<week_commencing>' containing the
+                           start-of-week date for each row.
      """
-     # Week commencing dictionary
-     day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-     df['week_start_'+ week_commencing] = df[date_col].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+     # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
+     iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
+
+     target_day = iso_day_dict[week_commencing]
+
+     def map_to_week_start(date_val):
+         delta = (date_val.isoweekday() - target_day) % 7
+         return date_val - pd.Timedelta(days=delta)
+
+     # Apply the transformation
+     new_col = f"week_start_{week_commencing}"
+     df[new_col] = df[date_col].apply(map_to_week_start)

      return df
-
+
  def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
      """
      Plot various types of charts using Plotly.
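The ISO mapping makes the week-start arithmetic explicit. For example, mapping Saturday 2024-01-06 back to the Monday of its ISO week:

    import pandas as pd

    d = pd.Timestamp("2024-01-06")           # Saturday, isoweekday() == 6
    target = 1                               # 'mon' in ISO numbering
    delta = (d.isoweekday() - target) % 7    # (6 - 1) % 7 == 5
    print(d - pd.Timedelta(days=delta))      # 2024-01-01, that week's Monday
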
@@ -1743,17 +1764,6 @@ class dataprocessing:
  ########################################################################################################################################
  ########################################################################################################################################

-
-
-
-
-
-
-
-
-
-
-
  ims_proc = dataprocessing()

  class datapull:
@@ -1764,46 +1774,46 @@ class datapull:
      print("\n1. pull_fred_data")
      print(" - Description: Get data from FRED by using series id tokens.")
      print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
-     print(" - Example: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")
+     print(" - Example: pull_fred_data('mon', ['GPDIC1'])")

      print("\n2. pull_boe_data")
      print(" - Description: Fetch and process Bank of England interest rate data.")
      print(" - Usage: pull_boe_data(week_commencing)")
      print(" - Example: pull_boe_data('mon')")

-     print("\n3. pull_ons_data")
-     print(" - Description: Fetch and process time series data from the ONS API.")
-     print(" - Usage: pull_ons_data(series_list, week_commencing)")
-     print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
-
-     print("\n4. pull_oecd")
+     print("\n3. pull_oecd")
      print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
-     print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '1950-01-01')")
-     print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
+     print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')")
+     print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")

-     print("\n5. get_google_mobility_data")
+     print("\n4. get_google_mobility_data")
      print(" - Description: Fetch Google Mobility data for the specified country.")
      print(" - Usage: get_google_mobility_data(country, wc)")
      print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")

-     print("\n6. pull_combined_dummies")
+     print("\n5. pull_seasonality")
      print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
-     print(" - Usage: pull_combined_dummies(week_commencing)")
-     print(" - Example: pull_combined_dummies('mon')")
+     print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
+     print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")

-     print("\n7. pull_weather")
+     print("\n6. pull_weather")
      print(" - Description: Fetch and process historical weather data for the specified country.")
      print(" - Usage: pull_weather(week_commencing, country)")
      print(" - Example: pull_weather('mon', 'GBR')")
+
+     print("\n7. pull_macro_ons_uk")
+     print(" - Description: Fetch and process time series data from the Beta ONS API.")
+     print(" - Usage: pull_macro_ons_uk(cdid_list, week_start_day, sector)")
+     print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
+
+     print("\n8. pull_yfinance")
+     print(" - Description: Fetch and process stock data from Yahoo Finance.")
+     print(" - Usage: pull_yfinance(tickers, week_start_day)")
+     print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")

-     print("\n8. pull_covid_data")
-     print(" - Description: Get covid pandemic data for the country of interest.")
-     print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
-     print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")
-
  ############################################################### MACRO ##########################################################################

- def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
+ def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
      '''
      Parameters
      ----------
@@ -1812,16 +1822,12 @@ class datapull:

      series_id_list : list[str]
          provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
-         ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]
+         ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]

      Returns
      ----------
      pd.DataFrame
          Return a data frame with FRED data according to the series IDs provided
-
-     Example
-     ----------
-     pull_fred_data("mon", ["GCEC1", "SP500"])
      '''
      # Fred API
      fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
@@ -1864,169 +1870,82 @@ class datapull:

      return fred_df_final

- def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
+ def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
      """
      Fetch and process Bank of England interest rate data.

      Args:
-         week_commencing (str): The starting day of the week for aggregation.
-                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-                                Default is "sun".
-         max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
+         week_commencing (str): The starting day of the week for aggregation.
+                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                                Default is "mon".
+         max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
          delay (int): Delay in seconds between retry attempts. Default is 5.

      Returns:
-         pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
-                       The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+         pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                       The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                        and 'macro_boe_intr_rate' contains the average interest rate for the week.
      """
      # Week commencing dictionary
      day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-     # Function to fetch the data with retries
-     def fetch_data_with_retries(url, max_retries, delay):
-         for attempt in range(max_retries):
-             try:
-                 html_table = pd.read_html(url)[0]
-                 return html_table
-             except Exception as e:
-                 print(f"Attempt {attempt + 1} failed: {e}")
-                 if attempt < max_retries - 1:
-                     time.sleep(delay)
-                 else:
-                     raise
-
-     # Import HTML data from Bank of England rate
+
+     # URL of the Bank of England data page
      url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
-     html_table = fetch_data_with_retries(url, max_retries, delay)
-
-     df = pd.DataFrame(html_table)
+
+     # Retry logic for HTTP request
+     for attempt in range(max_retries):
+         try:
+             # Set up headers to mimic a browser request
+             headers = {
+                 "User-Agent": (
+                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                     "Chrome/91.0.4472.124 Safari/537.36"
+                 )
+             }
+             response = requests.get(url, headers=headers)
+             response.raise_for_status()  # Raise an exception for HTTP errors
+             break
+         except requests.exceptions.RequestException as e:
+             print(f"Attempt {attempt + 1} failed: {e}")
+             if attempt < max_retries - 1:
+                 time.sleep(delay)
+             else:
+                 raise
+
+     # Parse the HTML page
+     soup = BeautifulSoup(response.content, "html.parser")
+
+     # Find the table on the page
+     table = soup.find("table")  # Locate the first table
+     table_html = str(table)  # Convert table to string
+     df = pd.read_html(StringIO(table_html))[0]  # Use StringIO to wrap the table HTML
+
+     # Rename and clean up columns
      df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
-
-     # Change date column to datetime and find the corresponding week to the date
      df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
-     df.sort_values("OBS", axis=0, inplace=True)
-
-     # Create a daily date range and find the week commencing for that day
-     date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
-     df_daily = pd.DataFrame(date_range, columns=["OBS"])
-
-     # Adjust each date to the specified week commencing day
-     df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-
-     # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
-     df_final = df_daily.merge(df, on='OBS', how="left")
-     df_final["macro_boe_intr_rate"].ffill(inplace=True)
-
-     # Group by the week start date and get the mean of the interest rates for each week
-     df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
-
-     df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
-     df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
-
-     return df_final
+     df.sort_values("OBS", inplace=True)

- def pull_ons_data(self, series_list, week_commencing):
-     """
-     Fetch and process time series data from the ONS API.
+     # Create a daily date range
+     date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
+     df_daily = pd.DataFrame(date_range, columns=["OBS"])

-     Args:
-         series_list (list): A list of dictionaries where each dictionary represents a time series.
-                             Each dictionary should have the keys 'series_id' and 'dataset_id'.
-         week_commencing (str): The starting day of the week for aggregation.
-                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+     # Adjust each date to the specified week commencing day
+     df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+         lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+     )

-     Returns:
-         pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
-                       commencing dates and other columns contain the aggregated time series values.
-     """
-
-     def parse_quarter(date_str):
-         """Parses a string in 'YYYY Q#' format into a datetime object."""
-         year, quarter = date_str.split(' ')
-         quarter_number = int(quarter[1])
-         month = (quarter_number - 1) * 3 + 1
-         return pd.Timestamp(f"{year}-{month:02d}-01")
+     # Merge and forward-fill missing rates
+     df_daily = df_daily.merge(df, on="OBS", how="left")
+     df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()

-     # Generate a date range from 1950-01-01 to today
-     date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
-     daily_df = pd.DataFrame(date_range, columns=['OBS'])
-
-     # Keep track of the renamed value columns
-     value_columns = []
+     # Group by week commencing and calculate the average rate
+     df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
+     df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
+     df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)

-     for series in series_list:
-         series_id = series['series_id']
-         dataset_id = series['dataset_id']
-
-         # Construct the URL for data
-         data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
-
-         # Make the request to the ONS API for data
-         data_response = requests.get(data_url)
-
-         # Check if the request was successful
-         if data_response.status_code != 200:
-             print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
-             continue
-
-         # Parse the JSON response for data
-         data = data_response.json()
-
-         # Attempt to extract the name of the time series from the data response
-         series_name = data.get('description', {}).get('title', 'Value')
-
-         # Determine the most granular time series data available
-         if 'months' in data and data['months']:
-             time_series_data = data['months']
-         elif 'quarters' in data and data['quarters']:
-             time_series_data = data['quarters']
-         elif 'years' in data and data['years']:
-             time_series_data = data['years']
-         else:
-             print("No time series data found in the response")
-             continue
-
-         # Create a DataFrame from the time series data
-         df = pd.DataFrame(time_series_data)
-
-         # Handle different frequencies in the data
-         if 'date' in df.columns:
-             if any(df['date'].str.contains('Q')):
-                 df['date'] = df['date'].apply(parse_quarter)
-             else:
-                 df['date'] = pd.to_datetime(df['date'])
-
-         df = df.rename(columns={'date': 'OBS', 'value': series_name})
-
-         # Rename the value column
-         new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
-         df = df.rename(columns={series_name: new_col_name})
-
-         # Track the renamed value column
-         value_columns.append(new_col_name)
-
-         # Merge the data based on the observation date
-         daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
-
-     # Ensure columns are numeric
-     for col in value_columns:
-         if col in daily_df.columns:
-             daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
-         else:
-             print(f"Column {col} not found in daily_df")
-
-     # Aggregate results by week
-     ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
-                                                        date_column="OBS",
-                                                        group_columns=[],
-                                                        sum_columns=value_columns,
-                                                        wc=week_commencing,
-                                                        aggregation="average")
-
-     return ons_df_final
+     return df_final

- def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
      """
      Fetch and process time series data from the OECD API.

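The net effect of the rewrite: the Bank Rate table is now fetched with `requests` plus a browser-style User-Agent, parsed via BeautifulSoup and `pd.read_html(StringIO(...))`, and the default retry count drops from 30 to 5; the old `pull_ons_data` method is removed in favour of `pull_macro_ons_uk` further down. Typical use is unchanged (requires network access):

    dpull = datapull()
    boe = dpull.pull_boe_data("mon")   # weekly average Bank Rate, 'OBS' as dd/mm/yyyy
    print(boe.head())
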
@@ -2104,7 +2023,7 @@ class datapull:
                  break

          # get data for the next variable if url doesn't exist
-         if url_test == False:
+         if url_test is False:
              continue

          root = ET.fromstring(data_response.content)
@@ -2169,7 +2088,7 @@ class datapull:

      return oecd_df_final

- def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
+ def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
      """
      Fetch Google Mobility data for the specified country.

@@ -2189,7 +2108,7 @@ class datapull:

      # Load the CSV file into a pandas DataFrame
      csv_data = StringIO(response.text)
-     df = pd.read_csv(csv_data)
+     df = pd.read_csv(csv_data, low_memory=False)

      # Filter the DataFrame for the specified country
      country_df = df[df['country_region'] == country]
@@ -2203,12 +2122,12 @@ class datapull:

  ############################################################### Seasonality ##########################################################################

- def pull_combined_dummies(self, week_commencing):
+ def pull_seasonality(self, week_commencing, start_date, countries):
      # Week commencing dictionary
      day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

-     # Create daily date range dataframe
-     date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
+     # Create daily date range dataframe starting from start_date
+     date_range = pd.date_range(start=pd.to_datetime(start_date), end=datetime.today(), freq="d")
      df_daily = pd.DataFrame(date_range, columns=["Date"])

      # Create weekly date range dataframe
@@ -2218,7 +2137,7 @@ class datapull:

      df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
      df_weekly_start.set_index("Date", inplace=True)
-
+
      # Create individual weekly dummies
      dummy_columns = {}
      for i in range(len(df_weekly_start)):
@@ -2228,84 +2147,59 @@ class datapull:

      df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
      df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-
-     # Create monthly dummies
+
+     # Add public holidays for each country and holiday type
+     for country in countries:
+         country_holidays = holidays.CountryHoliday(country, years=range(int(start_date[:4]), datetime.today().year + 1))
+         df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(lambda x: 1 if x in country_holidays else 0)
+
+         # Extract specific holidays
+         for date, name in country_holidays.items():
+             col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
+             if col_name not in df_daily.columns:
+                 df_daily[col_name] = 0
+             df_daily.loc[df_daily["Date"] == pd.Timestamp(date), col_name] = 1
+
+     # Map daily holidays to weekly aggregation
+     df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+     df_holidays = df_daily.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
+     df_holidays.set_index("Date", inplace=True)
+
+     # Create monthly dummies (separately from holidays)
      df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
-     df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
+     df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"], dtype=int)
      df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
      df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-
      df_monthly_dummies.set_index("Date", inplace=True)
-     df_monthly_dummies = df_monthly_dummies / 7
-
-     # Combine weekly and monthly dataframes
-     df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
-
+
+     # Divide only the monthly dummy columns by 7 (exclude holiday-related columns)
+     monthly_cols = [col for col in df_monthly_dummies.columns if not col.startswith("seas_holiday") and not col.startswith("seas_")]
+     df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
+
+     # Merge weekly dummies, monthly dummies, and holidays
+     df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)  # Combine weekly and monthly first
+     df_combined = pd.concat([df_combined, df_holidays], axis=1)  # Add holidays separately
+
+     # Drop duplicate columns if any exist (this ensures holidays are not duplicated)
+     df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
+
      # Create weekly dummies
      df_combined.reset_index(inplace=True)
      df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
-     df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
+     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)

      # Create yearly dummies
      df_combined["Year"] = df_combined["Date"].dt.year
-     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
+     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)

      # Add constant
      df_combined["Constant"] = 1

      # Add trend
      df_combined["Trend"] = df_combined.index + 1
-
-     # Set date as index
-     df_combined.set_index("Date", inplace=True)
-
-     # Create COVID lockdown dummies
-     lockdown_periods = [
-         # Lockdown 1
-         ("2020-03-23", "2020-05-24"),
-         # Lockdown 2
-         ("2020-11-05", "2020-12-02"),
-         # Lockdown 3
-         ("2021-01-04", "2021-03-08")
-     ]
-
-     df_covid = pd.DataFrame(date_range, columns=["Date"])
-     df_covid["national_lockdown"] = 0
-
-     for start, end in lockdown_periods:
-         df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
-
-     df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-     df_covid.drop("Date", axis=1, inplace=True)
-     df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
-     df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
-     df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)
-
-     df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
-     df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
-     df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
-
-     df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
-     df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)
-
-     df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
-     df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
-     df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)
-
-     df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
-     df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)

-     df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
-     df_final_covid.reset_index(inplace=True)
-     df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
-
      # Create seasonal indicators for the last day and last Friday of the month
-     min_date = '2019-12-29'
-     max_date = datetime.today().strftime('%Y-%m-%d')
-     date_range_seas = pd.date_range(start=min_date, end=max_date)
-
-     df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
-     df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
+     df_combined['seas_last_day_of_month'] = df_combined["Date"].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)

      def is_last_friday(date):
          last_day_of_month = date.to_period('M').to_timestamp('M')
@@ -2317,28 +2211,19 @@ class datapull:
          last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
          return 1 if date == last_friday else 0

-     df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)
+     df_combined['seas_last_friday_of_month'] = df_combined["Date"].apply(is_last_friday)

-     df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-     df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-     df_seas.set_index("Date", inplace=True)
-
-     # Combine all dataframes
-     df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
-     df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
-     df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
-
-     # Fill any NaN values with 0
-     df_final_combined.fillna(0, inplace=True)
+     # Rename Date to OBS
+     df_combined.rename(columns={"Date": "OBS"}, inplace=True)

-     return df_final_combined
+     return df_combined

  def pull_weather(self, week_commencing, country) -> pd.DataFrame:
      import pandas as pd
-     import urllib.request
+     import urllib.request  # noqa: F811
      from datetime import datetime
      import requests
-     from geopy.geocoders import Nominatim
+     from geopy.geocoders import Nominatim  # noqa: F811

      # Week commencing dictionary
      day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
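`pull_seasonality` replaces `pull_combined_dummies`: the hard-coded 2015 start and the UK COVID lockdown dummies are gone, and per-country public-holiday dummies from the `holidays` package take their place. A usage sketch (requires the `holidays` package; country codes follow its conventions):

    dpull = datapull()
    seas = dpull.pull_seasonality("mon", "2020-01-01", ["US", "GB"])
    print(seas.filter(like="seas_holiday").head())

Note that `monthly_cols` as written excludes every column starting with `seas_`, including the month dummies themselves, so the divide-by-7 step appears to be a no-op; the filter likely needs to target the `seas_<month>` columns explicitly.
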
@@ -2936,35 +2821,238 @@ class datapull:

      return final_weather

- def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
+ def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
      """
-     Get covid pandemic data for the country of interest.
+     Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+     aggregates it to weekly averages, and renames variables based on specified rules.

-     Args:
-         folder_path (str): A string containing the local location of the OneDrive folder.
-                            Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
-                            The file location within the MasterDrive of the worldwide covid data is:
-                            MasterDrive/Central Database/Covid/oxford-government-response.csv
-         country (str): A string containing the country of interest (E.g: "GB", "FR")
-         week_commencing (str): The starting day of the week for aggregation.
-                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+     Parameters:
+         cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+         week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
+         sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+
+     Returns:
+         pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
+                       and all series as renamed columns.
+     """
+     # Define CDIDs for sectors and defaults
+     sector_cdids = {
+         "fast_food": ["L7TD", "L78Q", "DOAD"],
+         "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
+     }
+
+     default_cdids = sector_cdids["default"]
+     sector_specific_cdids = sector_cdids.get(sector, [])
+     standard_cdids = list(set(default_cdids + sector_specific_cdids))  # Avoid duplicates
+
+     # Combine standard CDIDs and additional CDIDs
+     if cdid_list is None:
+         cdid_list = []
+     cdid_list = list(set(standard_cdids + cdid_list))  # Avoid duplicates
+
+     base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
+     base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
+     combined_df = pd.DataFrame()
+
+     # Map week start day to pandas weekday convention
+     days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+     if week_start_day not in days_map:
+         raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+     week_start = days_map[week_start_day]
+
+     for cdid in cdid_list:
+         try:
+             # Search for the series
+             search_url = f"{base_search_url}{cdid}"
+             search_response = requests.get(search_url)
+             search_response.raise_for_status()
+             search_data = search_response.json()
+
+             items = search_data.get("items", [])
+             if not items:
+                 print(f"No data found for CDID: {cdid}")
+                 continue
+
+             # Extract series name and latest release URI
+             series_name = items[0].get("title", f"Series_{cdid}")
+             latest_date = max(
+                 datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                 for item in items if "release_date" in item
+             )
+             latest_uri = next(
+                 item["uri"] for item in items
+                 if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
+             )
+
+             # Fetch the dataset
+             data_url = f"{base_data_url}{latest_uri}"
+             data_response = requests.get(data_url)
+             data_response.raise_for_status()
+             data_json = data_response.json()
+
+             # Detect the frequency and process accordingly
+             if "months" in data_json and data_json["months"]:
+                 frequency_key = "months"
+             elif "quarters" in data_json and data_json["quarters"]:
+                 frequency_key = "quarters"
+             elif "years" in data_json and data_json["years"]:
+                 frequency_key = "years"
+             else:
+                 print(f"Unsupported frequency or no data for CDID: {cdid}")
+                 continue
+
+             # Prepare the DataFrame
+             df = pd.DataFrame(data_json[frequency_key])
+
+             # Parse the 'date' field based on frequency
+             if frequency_key == "months":
+                 df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+             elif frequency_key == "quarters":
+                 def parse_quarter(quarter_str):
+                     year, qtr = quarter_str.split(" Q")
+                     month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                     return datetime(int(year), month, 1)
+                 df["date"] = df["date"].apply(parse_quarter)
+             elif frequency_key == "years":
+                 df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+
+             df["value"] = pd.to_numeric(df["value"], errors="coerce")
+             df.rename(columns={"value": series_name}, inplace=True)
+
+             # Combine data
+             df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+             if combined_df.empty:
+                 combined_df = df
+             else:
+                 combined_df = pd.merge(combined_df, df, on="date", how="outer")
+
+         except requests.exceptions.RequestException as e:
+             print(f"Error fetching data for CDID {cdid}: {e}")
+         except (KeyError, ValueError) as e:
+             print(f"Error processing data for CDID {cdid}: {e}")
+
+     if not combined_df.empty:
+         min_date = combined_df["date"].min()
+         max_date = datetime.today()
+         date_range = pd.date_range(start=min_date, end=max_date, freq='D')
+         daily_df = pd.DataFrame(date_range, columns=['date'])
+         daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
+         daily_df = daily_df.ffill()
+
+         # Aggregate to weekly frequency
+         daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+         weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
+
+         def clean_column_name(name):
+             name = re.sub(r"\(.*?\)", "", name)
+             name = re.split(r":", name)[0]
+             name = re.sub(r"\d+", "", name)
+             name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+             name = re.sub(r"[^\w\s]", "", name)
+             name = name.replace(" ", "_")
+             name = re.sub(r"_+", "_", name)
+             name = name.rstrip("_")
+             return f"macro_{name.lower()}_uk"
+
+         weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
+         weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+
+         weekly_df = weekly_df.fillna(0)
+
+         return weekly_df
+     else:
+         print("No data available to process.")
+         return pd.DataFrame()
+
+ def pull_yfinance(self, tickers=None, week_start_day="mon"):
+     """
+     Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
+     aggregates it to weekly averages, and renames variables.
+
+     Parameters:
+         tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
+         week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').

      Returns:
-         pd.DataFrame: A DataFrame containing seasonality and public holiday dummies for the country of interest.
-                       The 'OBS' column contains the week commencing dates.
+         pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
+                       and aggregated stock data for the specified tickers, with NaN values filled with 0.
      """
+     # Define default tickers
+     default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]

-     df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
+     # Combine default tickers with additional ones
+     if tickers is None:
+         tickers = []
+     tickers = list(set(default_tickers + tickers))  # Ensure no duplicates
+
+     # Automatically set end_date to today
+     end_date = datetime.today().strftime("%Y-%m-%d")
+
+     # Mapping week start day to pandas weekday convention
+     days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+     if week_start_day not in days_map:
+         raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+     week_start = days_map[week_start_day]

-     country_df = df[df['location_key']==country]
-     country_df.rename(columns={'date': 'OBS'}, inplace=True)
-     country_df.drop('location_key', axis=1, inplace=True)
+     # Fetch data for all tickers without specifying a start date to get all available data
+     data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
+
+     # Process the data
+     combined_df = pd.DataFrame()
+     for ticker in tickers:
+         try:
+             # Extract the ticker's data
+             ticker_data = data[ticker] if len(tickers) > 1 else data
+             ticker_data = ticker_data.reset_index()
+
+             # Ensure necessary columns are present
+             if "Close" not in ticker_data.columns:
+                 raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
+
+             # Keep only relevant columns
+             ticker_data = ticker_data[["Date", "Close"]]
+             ticker_data.rename(columns={"Close": ticker}, inplace=True)

-     agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
+             # Merge data
+             if combined_df.empty:
+                 combined_df = ticker_data
+             else:
+                 combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")

-     covid_df = ims_proc.rename_cols(agg_df, 'covid_')
+         except KeyError:
+             print(f"Data for ticker {ticker} not available.")
+         except Exception as e:
+             print(f"Error processing ticker {ticker}: {e}")

-     covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
+     if not combined_df.empty:
+         # Convert to daily frequency
+         combined_df["Date"] = pd.to_datetime(combined_df["Date"])
+         combined_df.set_index("Date", inplace=True)

-     return covid_df
-
+         # Fill missing dates
+         min_date = combined_df.index.min()
+         max_date = combined_df.index.max()
+         daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
+         combined_df = combined_df.reindex(daily_index)
+         combined_df.index.name = "Date"
+         combined_df = combined_df.ffill()
+
+         # Aggregate to weekly frequency
+         combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
+         weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
+
+         # Fill NaN values with 0
+         weekly_df = weekly_df.fillna(0)
+
+         # Clean column names
+         def clean_column_name(name):
+             name = re.sub(r"[^\w\s]", "", name)
+             return f"macro_{name.lower()}"
+
+         weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
+
+         return weekly_df
+
+     else:
+         print("No data available to process.")
+         return pd.DataFrame()
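Usage sketches for the two new pulls (both need network access; the extra CDID and ticker are illustrative only):

    dpull = datapull()

    # Weekly ONS macro series: sector defaults plus one extra CDID
    ons = dpull.pull_macro_ons_uk(["HBOI"], "mon", "fast_food")

    # Weekly averages of the default indices plus one extra ticker
    stocks = dpull.pull_yfinance(["AAPL"], "mon")
    print(stocks.columns.tolist())   # 'OBS' plus macro_-prefixed ticker columns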