imsciences 0.6.3.1-py3-none-any.whl → 0.6.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,20 +6,19 @@ import plotly.graph_objs as go
  import numpy as np
  import datetime
  import re
- import pandas as pd
  from fredapi import Fred
  import time
- from datetime import datetime,timedelta
- from cif import cif
+ from datetime import datetime, timedelta # noqa: F811
  from io import StringIO
  import urllib
- import requests_cache
- import urllib.request
+ import requests_cache # noqa: F401
+ import urllib.request # noqa: F401
  import requests
- from geopy.geocoders import Nominatim
+ from geopy.geocoders import Nominatim # noqa: F401
  import subprocess
  import json
  import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup

  class dataprocessing:

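Note: the added `# noqa` markers match how these imports behave. F401 ("module imported but unused") keeps `requests_cache` and `urllib.request` importable, presumably for side effects or downstream use, and F811 ("redefinition of unused name") acknowledges that `from datetime import datetime, timedelta` deliberately shadows the earlier `import datetime`. A minimal sketch of the shadowing F811 flags:

    import datetime                            # binds the module
    from datetime import datetime, timedelta  # rebinds 'datetime' to the class (flake8 F811)

    print(type(datetime))  # <class 'type'> -- the class now wins over the module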
@@ -391,7 +390,7 @@ class dataprocessing:
              # Divide each numeric value by the number of days in the month
              for col in df.columns:
                  if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-                     if divide == True:
+                     if divide is True:
                          daily_row[col] = row[col] / num_days
                      else:
                          daily_row[col] = row[col]
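Note: `divide is True` is stricter than the old `divide == True`; identity only matches the `True` singleton, so truthy non-bool arguments now fall through to the else branch:

    divide = 1
    print(divide == True)  # True  -> old code divided
    print(divide is True)  # False -> new code copies the value unchanged

If that tightening is unintended, plain `if divide:` is the usual spelling.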
@@ -678,7 +677,7 @@ class dataprocessing:

          return combined_df

-     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False,fill_missing_weekly_dates=False,week_commencing='W-MON'):
+     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
          """
          Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

@@ -701,53 +700,57 @@ class dataprocessing:
              pandas.DataFrame: The pivot table specified
          """

-         # Create the filtered df by applying the conditions
-         if filters_dict is None:
-             df_filtered = df
-         else:
+         # Validate inputs
+         if index_col not in df.columns:
+             raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+         if columns not in df.columns:
+             raise ValueError(f"columns '{columns}' not found in DataFrame.")
+         if values_col not in df.columns:
+             raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+         # Apply filters if provided
+         if filters_dict:
              df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-         # Ensure index column is in datetime format for proper sorting
-         df_filtered = df_filtered.copy()
-
-         # If datetime transformation is needed
+         else:
+             df_filtered = df.copy()
+
+         # Ensure index column is in datetime format if needed
          if datetime_trans_needed:
              df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
+
          # Create the pivot table
-         pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)
-
-         # Handling MultiIndex columns if present, making them a flat structure
-         if not reverse_header_order:
-             if isinstance(pivoted_df.columns, pd.MultiIndex):
-                 pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
-             else:
-                 pivoted_df.columns = pivoted_df.columns.map(str)
+         pivoted_df = df_filtered.pivot_table(
+             index=index_col,
+             columns=columns,
+             values=values_col,
+             aggfunc=aggfunc,
+             margins=margins,
+             margins_name=margins_name,
+         )
+
+         # Handle column headers
+         if isinstance(pivoted_df.columns, pd.MultiIndex):
+             pivoted_df.columns = [
+                 "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+                 for col in pivoted_df.columns.values
+             ]
          else:
-             if isinstance(pivoted_df.columns, pd.MultiIndex):
-                 # Reorder the MultiIndex columns
-                 pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
-             else:
-                 pivoted_df.columns = pivoted_df.columns.map(str)
-                 # Reverse the order for single index columns
-                 pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
-
-         # Reset the pivot before returning
-         pivoted_df = pivoted_df.reset_index()
-
-         # Sort by index column from oldest to newest
+             pivoted_df.columns = pivoted_df.columns.map(str)
+
+         # Reset the index
+         pivoted_df.reset_index(inplace=True)
+
+         # Handle sorting and formatting of index column
          if datetime_trans_needed:
-             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
-             pivoted_df = pivoted_df.sort_values(by=index_col)
-
-             # Convert index column back to a string in YYYY-MM-DD format for display purposes
-             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
-
-         # Fill in any NaNs
-         pivoted_df = pivoted_df.fillna(fill_value)
-
-         # If there is a need to fill in missing weeks
-         if fill_missing_weekly_dates == True:
+             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+             pivoted_df.sort_values(by=index_col, inplace=True)
+             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+         # Fill missing values
+         pivoted_df.fillna(fill_value, inplace=True)
+
+         # Fill missing weekly dates if specified
+         if fill_missing_weekly_dates:
              pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)

          return pivoted_df
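Note: a hedged sketch of the reworked signature (the DataFrame and column names are hypothetical), showing the new `date_format` parameter:

    dp = dataprocessing()
    wide = dp.pivot_table(
        df,
        index_col="OBS",
        columns="Channel",
        values_col="Spend",
        date_format="%d/%m/%Y",  # new in 0.6.3.2
    )

One caution on the new header flattening: `reversed(map(str, col))` raises `TypeError: argument to reversed() must be a sequence`, because map objects do not support `reversed()`. With `reverse_header_order=True` on MultiIndex columns, something like `"_".join(reversed([str(c) for c in col]))` would be needed.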
@@ -983,7 +986,7 @@ class dataprocessing:

          return df

-     def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
+     def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
          """
          This function updates values in a specified column of the DataFrame based on a lookup dictionary.
          It first merges several columns into a new 'Merged' column, then uses this merged column to determine
@@ -1000,8 +1003,10 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The modified DataFrame with updated values in the specified column.
          """
+         # Create a merged column from specified columns
          df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)

+         # Replace values in the specified column based on the lookup
          def replace_values(x):
              if x[col] == replacement_rows:
                  merged_value = x['Merged']
@@ -1009,10 +1014,14 @@ class dataprocessing:
                      return replacement_lookup_dict[merged_value]
              return x[col]

+         # Apply replacement logic
          df[output_column_name] = df.apply(replace_values, axis=1)

+         # Drop the intermediate 'Merged' column
+         df.drop(columns=['Merged'], inplace=True)
+
          return df
-
+
      def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
          """
          Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
@@ -1049,35 +1058,38 @@ class dataprocessing:

          return df_final

-     def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
+     def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
          """
          Changes a dataframe from wide to long format.

          Args:
              df (pandas.DataFrame): The DataFrame containing the data.
-             value_cols (list of str or str if only one): list of column names which are to be transformed from several columns into one.
-             variable_col_name (str, optional): Name of new variables column, which contains the names of the columns which have been stacked into one. Defaults to 'Stacked'.
-             value_col_name (str, optional): Name of the new value column which contains all the data from the stacked columns. Defaults to 'Value'.
+             value_cols (list of str or str if only one): List of column names to transform from several columns into one.
+             variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+             value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.

          Returns:
-             pandas.DataFrame:: Returns dataframe transformed from long to wide.
-
+             pandas.DataFrame: DataFrame transformed from wide to long format.
+
          Raises:
-             ValueError: If number of column names to be depivoted is less than 2, then this function is not neccesary.
+             ValueError: If the number of columns to depivot is less than 2.
          """
-
-         # Check length of value cols is greater than 1
+         # Check length of value_cols is greater than 1
          if len(value_cols) < 2:
              raise ValueError("Number of inputs in list must be greater than 1")
-
+
          # Find the columns that are not to be depivoted into one column
-         id_vars = list(set(df.columns.tolist()) - set(value_cols))
-
+         id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
+
          # Melt all columns chosen into one column
-         df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
+         df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+         # Sort column order to match expected output
+         ordered_columns = id_vars + [variable_col_name, value_col_name]
+         df_final = df_final[ordered_columns]
+
          return df_final
-
+
      def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
          """
          Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe
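Note: replacing the set difference with a list comprehension makes `id_vars`, and therefore the output column order, deterministic. A small sketch with hypothetical data:

    df = pd.DataFrame({"OBS": ["2024-01-01"], "TV": [100], "Radio": [40]})
    long_df = dataprocessing().convert_df_wide_2_long(df, ["TV", "Radio"])
    # Columns come out as ['OBS', 'Stacked', 'Value'], following DataFrame order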
@@ -1102,18 +1114,24 @@ class dataprocessing:
          Returns:
              pandas.DataFrame: Dataframe with manual changes added
          """
+
          # Raise type error if more than one col is supported
          if isinstance(col_to_change, list):
              raise TypeError("Col to change must be specified as a string, not a list")
-
+
          # Raises value error if input is invalid for change_in_existing_df_col
          if change_in_existing_df_col not in ["Yes", "No"]:
              raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
-
+
          # Raises value error if input is invalid for add_notes_col
          if add_notes not in ["Yes", "No"]:
              raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")

+         # Validate filters_dict format
+         for col, cond in filters_dict.items():
+             if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+                 raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
          # Create the filtered df by applying the conditions
          df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

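Note: the new guard assumes each filter condition is a string of the form 'operator value' (at least two whitespace-separated tokens); the exact operator grammar is defined by `filter_df_on_multiple_conditions`, which this diff does not show. A hypothetical example that passes validation:

    filters_dict = {"Spend": ">= 100", "Channel": "== 'TV'"}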
@@ -1122,7 +1140,7 @@ class dataprocessing:
          if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
              df = df.copy()
              df[new_col_to_change_name] = df[col_to_change]
-
+
          # Update the new cell in the chosen column
          df.loc[df_filtered.index, col_to_update] = new_value

@@ -1146,32 +1164,32 @@ class dataprocessing:

      def format_numbers_with_commas(self, df, decimal_length_chosen=2):
          """
-         Converts data in numerical format into numbers with commas and a chosen decimal place length
+         Converts data in numerical format into numbers with commas and a chosen decimal place length.

          Args:
              df (pandas.DataFrame): The DataFrame containing the data.
-             decimal_length_chosen (int, optional): _description_. Defaults to 2.
-
+             decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
          Returns:
-             pandas.DataFrame: The dataframe with the chosen updated format
+             pandas.DataFrame: The DataFrame with the chosen updated format.
          """
          def format_number_with_commas(x, decimal_length=decimal_length_chosen):
-             if isinstance(x, (int, float)):
+             if pd.isna(x): # Preserve None/NaN values
+                 return pd.NA # Explicitly normalize to pd.NA
+             elif isinstance(x, (int, float)):
                  if decimal_length is not None:
-                     format_str = "{:,.{}f}".format(x, decimal_length)
-                     formatted_number = format_str.format(x)
+                     format_str = f"{{:,.{decimal_length}f}}"
+                     return format_str.format(x)
                  else:
-                     formatted_number = "{:,}".format(x)
-                 return formatted_number
+                     return f"{x:,}"
              else:
                  return x # Return unchanged if not a number

-
-         # Apply the function across several columns using applymap()
-         formatted_df = df.applymap(format_number_with_commas)
+         # Apply formatting column by column
+         formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)

          return formatted_df
-
+
      def filter_df_on_multiple_conditions(self, df, filters_dict):
          """
          Filter a dataframe based on mulitple conditions
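Note: the switch from `df.applymap(...)` to per-column `col.map(...)` tracks pandas 2.1, where `DataFrame.applymap` was deprecated in favour of `DataFrame.map`. On pandas >= 2.1 the same thing can be written directly as:

    formatted_df = df.map(format_number_with_commas)

The trailing `.fillna(value=pd.NA)` looks redundant, since the mapper already returns `pd.NA` for missing values.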
@@ -1269,7 +1287,6 @@ class dataprocessing:
          """

          #This line removes zero values from given column
-
          return data_frame.loc[~(data_frame[column_to_filter] ==0)]

      def upgrade_outdated_packages(self):
@@ -1392,10 +1409,10 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
          """
-
+
          # If there is no date column
          if date_col is None:
-             df = df.applymap(lambda x: 1 if x > dummy_threshold else 0)
+             df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))

              if add_total_dummy_col != 'No':
                  # Find max value of rows
@@ -1403,8 +1420,10 @@ class dataprocessing:

          # If there is a date column
          else:
-             # Create dummies
-             df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].applymap(lambda x: 1 if x > dummy_threshold else 0)
+             # Create dummies for all columns except the date column
+             df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
+                 lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+             )

              if add_total_dummy_col != 'No':
                  # Find max value of rows
@@ -1427,7 +1446,6 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
          """
-
          if new_column is not None:
              # Create a new column for replacements
              df[new_column] = df[column]
@@ -1435,15 +1453,15 @@ class dataprocessing:
          else:
              # Modify the existing column
              temp_column = column
-
-         # Apply substring replacements
-         for old, new in replacements.items():
-             df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
-
+
          # Optionally convert to lowercase
          if to_lower:
              df[temp_column] = df[temp_column].str.lower()
-
+
+         # Apply substring replacements
+         for old, new in replacements.items():
+             df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
+
          return df

      def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
@@ -1458,11 +1476,11 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The DataFrame with an added total column.
          """
-         # If exclude_col is provided, drop that column before summing
-         if exclude_col:
-             df[total_col_name] = df.drop(columns=[exclude_col]).sum(axis=1)
+         if exclude_col and exclude_col in df.columns:
+             # Ensure the column to exclude exists before dropping
+             df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
          else:
-             # Sum across all columns if exclude_col is not provided
+             # Sum across all columns if no column is specified to exclude
              df[total_col_name] = df.sum(axis=1)

          return df
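Note: the membership check and `errors='ignore'` overlap; once `exclude_col in df.columns` holds, the drop cannot fail. The visible behaviour change is for a missing column name, which now falls through to summing everything instead of raising `KeyError`:

    df = pd.DataFrame({"a": [1], "b": [2]})
    dataprocessing().add_total_column(df, exclude_col="missing")  # Total = a + b, no KeyError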
@@ -1502,7 +1520,7 @@ class dataprocessing:
          df[new_col_name] = df[column_name].apply(categorize_text)
          return df

-     def compare_overlap(self,df1, df2, date_col):
+     def compare_overlap(self, df1, df2, date_col):
          """
          Compare overlapping periods between two DataFrames and provide a summary of total differences.

@@ -1517,64 +1535,70 @@ class dataprocessing:
          # Ensure date columns are in datetime format
          df1[date_col] = pd.to_datetime(df1[date_col])
          df2[date_col] = pd.to_datetime(df2[date_col])
-
+
          # Determine the overlap period
          start_date = max(df1[date_col].min(), df2[date_col].min())
          end_date = min(df1[date_col].max(), df2[date_col].max())
-
-         # Filter dataframes to the overlapping period
+
+         # Filter DataFrames to the overlapping period
          df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
          df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
-
-         # Merge the dataframes on the date column to align data for comparison
+
+         # Merge the DataFrames on the date column
          merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
-
-         # Get the common columns between the two DataFrames, excluding the date column
+
+         # Get common columns, excluding the date column
          common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
-
-         # Initialize a list to collect total differences for each column
+
+         # Create a DataFrame for differences
+         diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+
          total_diff_list = []
-
-         # Create a DataFrame for the differences
-         diff_df = pd.DataFrame({date_col: merged_df[date_col]}) # Initialize diff_df with the date column
-
          for col in common_cols:
-             # Calculate the difference for each row
              diff_col = f'diff_{col}'
-             diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
-
-             # Calculate the total difference for the column and add it to the list
+             diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
+
+             # Sum differences for the column
              total_diff = diff_df[diff_col].sum()
              total_diff_list.append({'Column': col, 'Total Difference': total_diff})
-
-         # Create a DataFrame for the summary of total differences
+
+         # Create summary DataFrame
          total_diff_df = pd.DataFrame(total_diff_list)
-
-         # Apply formatting to the numerical columns
-         float_format = "{:,.2f}".format # Format to 2 decimal places with comma as thousand separator
-         diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
-         total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
-
+
          return diff_df, total_diff_df
-
-     def week_commencing_2_week_commencing_conversion(self,df,date_col,week_commencing='sun'):
+
+     def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
          """
-         Convert week commencing column in a DataFrame to the start of another day specified.
+         Convert a DataFrame's date column so that each date is mapped back
+         to the 'week_commencing' day of the *current ISO week*.

          Args:
-             df (pandas.DataFrame): The DataFrame containing the date-based data.
-             date_col (str): The name of the date column in the DataFrame.
-             week_commencing (str, optional): The day of the week that the week starts on ('sun' for Sunday, 'mon' for Monday, etc.). Defaults to 'sun'.
+             df (pandas.DataFrame): The DataFrame with date-based data.
+             date_col (str): The name of the date column.
+             week_commencing (str): The desired start of the week.
+                 ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
+                 Uses ISO day numbering (Mon=1, ..., Sun=7).

          Returns:
-             pandas.DataFrame: The original DataFrame with an additional column indicating the start of the week.
+             pandas.DataFrame: Original DataFrame with an extra column
+                 'week_start_<week_commencing>' containing the
+                 start-of-week date for each row.
          """
-         # Week commencing dictionary
-         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-         df['week_start_'+ week_commencing] = df[date_col].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+         # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
+         iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
+
+         target_day = iso_day_dict[week_commencing]
+
+         def map_to_week_start(date_val):
+             delta = (date_val.isoweekday() - target_day) % 7
+             return date_val - pd.Timedelta(days=delta)
+
+         # Apply the transformation
+         new_col = f"week_start_{week_commencing}"
+         df[new_col] = df[date_col].apply(map_to_week_start)

          return df
-
+
      def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
          """
          Plot various types of charts using Plotly.
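Note: moving from `weekday()` (Mon=0) to `isoweekday()` (Mon=1) leaves the week-start arithmetic unchanged, since the offsets cancel inside the modulo; the observable changes are the method rename and the 'mon' default. A quick sketch with hypothetical dates:

    df = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-03", "2024-01-07"])})  # Wed, Sun
    out = dataprocessing().week_commencing_2_week_commencing_conversion_isoweekday(df, "OBS")
    # out["week_start_mon"] is 2024-01-01 for both rows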
@@ -1795,15 +1819,10 @@ class datapull:
          print(" - Description: Fetch and process historical weather data for the specified country.")
          print(" - Usage: pull_weather(week_commencing, country)")
          print(" - Example: pull_weather('mon', 'GBR')")
-
-         print("\n8. pull_covid_data")
-         print(" - Description: Get covid pandemic data for the country of interest.")
-         print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
-         print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")

      ############################################################### MACRO ##########################################################################

-     def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
+     def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
          '''
          Parameters
          ----------
@@ -1812,7 +1831,7 @@ class datapull:

          series_id_list : list[str]
              provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
-             ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]
+             ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]

          Returns
          ----------
@@ -1864,68 +1883,81 @@ class datapull:

          return fred_df_final

-     def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
+     def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
          """
          Fetch and process Bank of England interest rate data.

          Args:
-             week_commencing (str): The starting day of the week for aggregation.
-                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-                 Default is "sun".
-             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
+             week_commencing (str): The starting day of the week for aggregation.
+                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                 Default is "mon".
+             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
              delay (int): Delay in seconds between retry attempts. Default is 5.

          Returns:
-             pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
-                 The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+             pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                 The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                  and 'macro_boe_intr_rate' contains the average interest rate for the week.
          """
          # Week commencing dictionary
          day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-         # Function to fetch the data with retries
-         def fetch_data_with_retries(url, max_retries, delay):
-             for attempt in range(max_retries):
-                 try:
-                     html_table = pd.read_html(url)[0]
-                     return html_table
-                 except Exception as e:
-                     print(f"Attempt {attempt + 1} failed: {e}")
-                     if attempt < max_retries - 1:
-                         time.sleep(delay)
-                     else:
-                         raise
-
-         # Import HTML data from Bank of England rate
+
+         # URL of the Bank of England data page
          url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
-         html_table = fetch_data_with_retries(url, max_retries, delay)
-
-         df = pd.DataFrame(html_table)
+
+         # Retry logic for HTTP request
+         for attempt in range(max_retries):
+             try:
+                 # Set up headers to mimic a browser request
+                 headers = {
+                     "User-Agent": (
+                         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                         "Chrome/91.0.4472.124 Safari/537.36"
+                     )
+                 }
+                 response = requests.get(url, headers=headers)
+                 response.raise_for_status() # Raise an exception for HTTP errors
+                 break
+             except requests.exceptions.RequestException as e:
+                 print(f"Attempt {attempt + 1} failed: {e}")
+                 if attempt < max_retries - 1:
+                     time.sleep(delay)
+                 else:
+                     raise
+
+         # Parse the HTML page
+         soup = BeautifulSoup(response.content, "html.parser")
+
+         # Find the table on the page
+         table = soup.find("table") # Locate the first table
+         table_html = str(table) # Convert table to string
+         df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
+
+         # Rename and clean up columns
          df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
-
-         # Change date column to datetime and find the corresponding week to the date
          df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
-         df.sort_values("OBS", axis=0, inplace=True)
-
-         # Create a daily date range and find the week commencing for that day
-         date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
+         df.sort_values("OBS", inplace=True)
+
+         # Create a daily date range
+         date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
          df_daily = pd.DataFrame(date_range, columns=["OBS"])
-
+
          # Adjust each date to the specified week commencing day
-         df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-
-         # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
-         df_final = df_daily.merge(df, on='OBS', how="left")
-         df_final["macro_boe_intr_rate"].ffill(inplace=True)
-
-         # Group by the week start date and get the mean of the interest rates for each week
-         df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
-
-         df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
-         df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
-
-         return df_final
+         df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+             lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+         )
+
+         # Merge and forward-fill missing rates
+         df_daily = df_daily.merge(df, on="OBS", how="left")
+         df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
+
+         # Group by week commencing and calculate the average rate
+         df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
+         df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
+         df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)

+         return df_final
+
      def pull_ons_data(self, series_list, week_commencing):
          """
          Fetch and process time series data from the ONS API.
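Note: wrapping the extracted table in `StringIO` matches pandas 2.1+, where passing literal HTML strings to `pd.read_html` is deprecated in favour of file-like input, and `raise_for_status()` now fails fast on HTTP errors rather than retrying inside the parser. The parsing pattern in isolation, assuming a page with at least one <table>:

    from io import StringIO
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests

    resp = requests.get("https://example.com/rates")  # hypothetical URL
    soup = BeautifulSoup(resp.content, "html.parser")
    df = pd.read_html(StringIO(str(soup.find("table"))))[0]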
@@ -2104,7 +2136,7 @@ class datapull:
                  break

          # get data for the next variable if url doesn't exist
-         if url_test == False:
+         if url_test is False:
              continue

          root = ET.fromstring(data_response.content)
@@ -2169,7 +2201,7 @@ class datapull:

          return oecd_df_final

-     def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
+     def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
          """
          Fetch Google Mobility data for the specified country.

@@ -2189,7 +2221,7 @@ class datapull:

          # Load the CSV file into a pandas DataFrame
          csv_data = StringIO(response.text)
-         df = pd.read_csv(csv_data)
+         df = pd.read_csv(csv_data, low_memory=False)

          # Filter the DataFrame for the specified country
          country_df = df[df['country_region'] == country]
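Note: `low_memory=False` makes pandas read the whole CSV in one pass, giving consistent dtype inference across the large Global Mobility Report export and silencing the mixed-dtype `DtypeWarning`. A stricter alternative would be to pin the offending dtypes explicitly, e.g.:

    df = pd.read_csv(csv_data, dtype={"sub_region_2": "string"})  # hypothetical column pin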
@@ -2335,10 +2367,10 @@ class datapull:

      def pull_weather(self, week_commencing, country) -> pd.DataFrame:
          import pandas as pd
-         import urllib.request
+         import urllib.request # noqa: F811
          from datetime import datetime
          import requests
-         from geopy.geocoders import Nominatim
+         from geopy.geocoders import Nominatim # noqa: F811

          # Week commencing dictionary
          day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
@@ -2934,37 +2966,4 @@ class datapull:

          final_weather = ims_proc.rename_cols(merged_df, 'seas_')

-         return final_weather
-
-     def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
-         """
-         Get covid pandemic data for the country of interest.
-
-         Args:
-             folder_path (str): A string containing the local location of the OneDrive folder.
-                 Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
-                 The file location within the MasterDrive of the worldwide covid data is:
-                 MasterDrive/Central Database/Covid/oxford-government-response.csv
-             country (str): A string containing the country of interest (E.g: "GB", "FR")
-             week_commencing (str): The starting day of the week for aggregation.
-                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-
-         Returns:
-             pd.DataFrame: A DataFrame containing seasonality and public holiday dummies for the country of interest.
-                 The 'OBS' column contains the week commencing dates.
-         """
-
-         df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
-
-         country_df = df[df['location_key']==country]
-         country_df.rename(columns={'date': 'OBS'}, inplace=True)
-         country_df.drop('location_key', axis=1, inplace=True)
-
-         agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
-
-         covid_df = ims_proc.rename_cols(agg_df, 'covid_')
-
-         covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
-
-         return covid_df
-
+         return final_weather