imsciences 0.6.3.0__py3-none-any.whl → 0.6.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,20 +6,19 @@ import plotly.graph_objs as go
  import numpy as np
  import datetime
  import re
- import pandas as pd
  from fredapi import Fred
  import time
- from datetime import datetime,timedelta
- from cif import cif
+ from datetime import datetime, timedelta # noqa: F811
  from io import StringIO
  import urllib
- import requests_cache
- import urllib.request
+ import requests_cache # noqa: F401
+ import urllib.request # noqa: F401
  import requests
- from geopy.geocoders import Nominatim
+ from geopy.geocoders import Nominatim # noqa: F401
  import subprocess
  import json
  import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup

  class dataprocessing:

@@ -391,7 +390,7 @@ class dataprocessing:
              # Divide each numeric value by the number of days in the month
              for col in df.columns:
                  if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-                     if divide == True:
+                     if divide is True:
                          daily_row[col] = row[col] / num_days
                      else:
                          daily_row[col] = row[col]
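One nuance of this fix: `divide is True` is an identity check against the boolean singleton, which is stricter than both `== True` and plain truthiness. A standalone illustration (values are hypothetical):

    divide = 1                # truthy, but not the literal True
    print(divide == True)     # True  (1 == True in Python)
    print(divide is True)     # False (identity check against the True singleton)
    print(bool(divide))       # True  (what `if divide:` would test)

So callers that previously passed truthy non-bool values for divide will now fall through to the else branch.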
@@ -678,7 +677,7 @@ class dataprocessing:

          return combined_df

-     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False,fill_missing_weekly_dates=False,week_commencing='W-MON'):
+     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
          """
          Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

@@ -701,53 +700,57 @@ class dataprocessing:
              pandas.DataFrame: The pivot table specified
          """

-         # Create the filtered df by applying the conditions
-         if filters_dict is None:
-             df_filtered = df
-         else:
+         # Validate inputs
+         if index_col not in df.columns:
+             raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+         if columns not in df.columns:
+             raise ValueError(f"columns '{columns}' not found in DataFrame.")
+         if values_col not in df.columns:
+             raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+         # Apply filters if provided
+         if filters_dict:
              df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-         # Ensure index column is in datetime format for proper sorting
-         df_filtered = df_filtered.copy()
-
-         # If datetime transformation is needed
+         else:
+             df_filtered = df.copy()
+
+         # Ensure index column is in datetime format if needed
          if datetime_trans_needed:
              df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
+
          # Create the pivot table
-         pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)
-
-         # Handling MultiIndex columns if present, making them a flat structure
-         if not reverse_header_order:
-             if isinstance(pivoted_df.columns, pd.MultiIndex):
-                 pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
-             else:
-                 pivoted_df.columns = pivoted_df.columns.map(str)
+         pivoted_df = df_filtered.pivot_table(
+             index=index_col,
+             columns=columns,
+             values=values_col,
+             aggfunc=aggfunc,
+             margins=margins,
+             margins_name=margins_name,
+         )
+
+         # Handle column headers
+         if isinstance(pivoted_df.columns, pd.MultiIndex):
+             pivoted_df.columns = [
+                 "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+                 for col in pivoted_df.columns.values
+             ]
          else:
-             if isinstance(pivoted_df.columns, pd.MultiIndex):
-                 # Reorder the MultiIndex columns
-                 pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
-             else:
-                 pivoted_df.columns = pivoted_df.columns.map(str)
-                 # Reverse the order for single index columns
-                 pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
-
-         # Reset the pivot before returning
-         pivoted_df = pivoted_df.reset_index()
-
-         # Sort by index column from oldest to newest
+             pivoted_df.columns = pivoted_df.columns.map(str)
+
+         # Reset the index
+         pivoted_df.reset_index(inplace=True)
+
+         # Handle sorting and formatting of index column
          if datetime_trans_needed:
-             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
-             pivoted_df = pivoted_df.sort_values(by=index_col)
-
-             # Convert index column back to a string in YYYY-MM-DD format for display purposes
-             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
-
-         # Fill in any NaNs
-         pivoted_df = pivoted_df.fillna(fill_value)
-
-         # If there is a need to fill in missing weeks
-         if fill_missing_weekly_dates == True:
+             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+             pivoted_df.sort_values(by=index_col, inplace=True)
+             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+         # Fill missing values
+         pivoted_df.fillna(fill_value, inplace=True)
+
+         # Fill missing weekly dates if specified
+         if fill_missing_weekly_dates:
              pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)

          return pivoted_df
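For orientation, a minimal usage sketch of the new signature; the DataFrame, column names, and the `from imsciences import dataprocessing` import path are assumptions, not shown in this diff:

    import pandas as pd
    from imsciences import dataprocessing  # assumed import path

    ims = dataprocessing()
    df = pd.DataFrame({
        "OBS": ["01/01/2024", "08/01/2024", "01/01/2024"],
        "Channel": ["TV", "TV", "Radio"],
        "Spend": [100.0, 250.0, 80.0],
    })

    # Dates are parsed day-first, spend is summed per week and channel, and the
    # index is rendered back to strings via the new date_format argument.
    wide = ims.pivot_table(df, index_col="OBS", columns="Channel",
                           values_col="Spend", date_format="%Y-%m-%d")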
@@ -983,7 +986,7 @@ class dataprocessing:

          return df

-     def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
+     def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
          """
          This function updates values in a specified column of the DataFrame based on a lookup dictionary.
          It first merges several columns into a new 'Merged' column, then uses this merged column to determine
@@ -1000,8 +1003,10 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The modified DataFrame with updated values in the specified column.
          """
+         # Create a merged column from specified columns
          df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)

+         # Replace values in the specified column based on the lookup
          def replace_values(x):
              if x[col] == replacement_rows:
                  merged_value = x['Merged']
@@ -1009,10 +1014,14 @@ class dataprocessing:
                      return replacement_lookup_dict[merged_value]
              return x[col]

+         # Apply replacement logic
          df[output_column_name] = df.apply(replace_values, axis=1)

+         # Drop the intermediate 'Merged' column
+         df.drop(columns=['Merged'], inplace=True)
+
          return df
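A small illustrative call, reusing the `ims` instance from the earlier sketch; lookup keys are the pipe-joined values of `cols_to_merge`, and all names here are hypothetical:

    df = pd.DataFrame({
        "Channel": ["Search", "Unknown"],
        "Region": ["UK", "FR"],
        "Product": ["A", "B"],
    })

    # Rows where Channel == "Unknown" are re-labelled via their "Region|Product" key;
    # the intermediate 'Merged' column is now dropped before returning.
    out = ims.keyword_lookup_replacement(df, "Channel", "Unknown",
                                         ["Region", "Product"], {"FR|B": "Display"},
                                         output_column_name="Channel Updated")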
-
+
      def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
          """
          Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
@@ -1049,35 +1058,38 @@ class dataprocessing:

          return df_final

-     def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
+     def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
          """
          Changes a dataframe from wide to long format.

          Args:
              df (pandas.DataFrame): The DataFrame containing the data.
-             value_cols (list of str or str if only one): list of column names which are to be transformed from several columns into one.
-             variable_col_name (str, optional): Name of new variables column, which contains the names of the columns which have been stacked into one. Defaults to 'Stacked'.
-             value_col_name (str, optional): Name of the new value column which contains all the data from the stacked columns. Defaults to 'Value'.
+             value_cols (list of str or str if only one): List of column names to transform from several columns into one.
+             variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+             value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.

          Returns:
-             pandas.DataFrame:: Returns dataframe transformed from long to wide.
-
+             pandas.DataFrame: DataFrame transformed from wide to long format.
+
          Raises:
-             ValueError: If number of column names to be depivoted is less than 2, then this function is not neccesary.
+             ValueError: If the number of columns to depivot is less than 2.
          """
-
-         # Check length of value cols is greater than 1
+         # Check length of value_cols is greater than 1
          if len(value_cols) < 2:
              raise ValueError("Number of inputs in list must be greater than 1")
-
+
          # Find the columns that are not to be depivoted into one column
-         id_vars = list(set(df.columns.tolist()) - set(value_cols))
-
+         id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
+
          # Melt all columns chosen into one column
-         df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
+         df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+         # Sort column order to match expected output
+         ordered_columns = id_vars + [variable_col_name, value_col_name]
+         df_final = df_final[ordered_columns]
+
          return df_final
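A quick before/after sketch of the reshape with made-up columns, which also shows the effect of the new order-preserving id_vars:

    wide = pd.DataFrame({
        "OBS": ["2024-01-01", "2024-01-08"],
        "TV": [100, 250],
        "Radio": [80, 60],
    })

    # TV and Radio stack into 'Stacked'/'Value'; with the list comprehension,
    # 'OBS' keeps its original position instead of the arbitrary set() order.
    long = ims.convert_df_wide_2_long(wide, ["TV", "Radio"])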
-
+
      def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
          """
          Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe
@@ -1102,18 +1114,24 @@ class dataprocessing:
          Returns:
              pandas.DataFrame: Dataframe with manual changes added
          """
+
          # Raise type error if more than one col is supported
          if isinstance(col_to_change, list):
              raise TypeError("Col to change must be specified as a string, not a list")
-
+
          # Raises value error if input is invalid for change_in_existing_df_col
          if change_in_existing_df_col not in ["Yes", "No"]:
              raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
-
+
          # Raises value error if input is invalid for add_notes_col
          if add_notes not in ["Yes", "No"]:
              raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")

+         # Validate filters_dict format
+         for col, cond in filters_dict.items():
+             if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+                 raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
          # Create the filtered df by applying the conditions
          df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

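Under the new validation every `filters_dict` value must be an 'operator value' string with at least one space; the exact operator grammar lives in `filter_df_on_multiple_conditions`, which is not shown in this diff, so the condition strings below are assumptions:

    filters = {"Channel": "== TV", "Spend": ">= 100"}   # passes the format check
    bad = {"Channel": "TV"}                             # would now raise ValueError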
@@ -1122,7 +1140,7 @@ class dataprocessing:
          if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
              df = df.copy()
              df[new_col_to_change_name] = df[col_to_change]
-
+
          # Update the new cell in the chosen column
          df.loc[df_filtered.index, col_to_update] = new_value

@@ -1146,32 +1164,32 @@ class dataprocessing:

      def format_numbers_with_commas(self, df, decimal_length_chosen=2):
          """
-         Converts data in numerical format into numbers with commas and a chosen decimal place length
+         Converts data in numerical format into numbers with commas and a chosen decimal place length.

          Args:
              df (pandas.DataFrame): The DataFrame containing the data.
-             decimal_length_chosen (int, optional): _description_. Defaults to 2.
-
+             decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
          Returns:
-             pandas.DataFrame: The dataframe with the chosen updated format
+             pandas.DataFrame: The DataFrame with the chosen updated format.
          """
          def format_number_with_commas(x, decimal_length=decimal_length_chosen):
-             if isinstance(x, (int, float)):
+             if pd.isna(x): # Preserve None/NaN values
+                 return pd.NA # Explicitly normalize to pd.NA
+             elif isinstance(x, (int, float)):
                  if decimal_length is not None:
-                     format_str = "{:,.{}f}".format(x, decimal_length)
-                     formatted_number = format_str.format(x)
+                     format_str = f"{{:,.{decimal_length}f}}"
+                     return format_str.format(x)
                  else:
-                     formatted_number = "{:,}".format(x)
-                 return formatted_number
+                     return f"{x:,}"
              else:
                  return x # Return unchanged if not a number

-
-         # Apply the function across several columns using applymap()
-         formatted_df = df.applymap(format_number_with_commas)
+         # Apply formatting column by column
+         formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)

          return formatted_df
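The old body formatted twice: "{:,.{}f}".format(x, decimal_length) already interpolates x, so the second .format(x) was a no-op on a literal string. The rewrite builds the format spec first and applies it once, and missing values now survive as pd.NA. A hypothetical round-trip:

    nums = pd.DataFrame({"Spend": [1234567.891, None]})
    out = ims.format_numbers_with_commas(nums)
    # "1,234,567.89" for the number; the missing value comes back as pd.NA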
-
+
      def filter_df_on_multiple_conditions(self, df, filters_dict):
          """
          Filter a dataframe based on mulitple conditions
@@ -1269,7 +1287,6 @@ class dataprocessing:
          """

          #This line removes zero values from given column
-
          return data_frame.loc[~(data_frame[column_to_filter] ==0)]

      def upgrade_outdated_packages(self):
@@ -1392,10 +1409,10 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
          """
-
+
          # If there is no date column
          if date_col is None:
-             df = df.applymap(lambda x: 1 if x > dummy_threshold else 0)
+             df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
              if add_total_dummy_col != 'No':
                  # Find max value of rows

@@ -1403,8 +1420,10 @@ class dataprocessing:

          # If there is a date column
          else:
-             # Create dummies
-             df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].applymap(lambda x: 1 if x > dummy_threshold else 0)
+             # Create dummies for all columns except the date column
+             df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
+                 lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+             )

              if add_total_dummy_col != 'No':
                  # Find max value of rows
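This hunk, like the matching ones in `format_numbers_with_commas` and the removed formatting block in `compare_overlap`, drops `DataFrame.applymap`, which pandas deprecated in 2.1 in favour of `DataFrame.map`. The diff opts for the `apply` + `Series.map` spelling, which is equivalent and also runs on older pandas:

    # Deprecated: elementwise over the whole frame
    dummies = df.applymap(lambda x: 1 if x > dummy_threshold else 0)

    # Replacement: map each column (a Series) elementwise
    dummies = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))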
@@ -1427,7 +1446,6 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
          """
-
          if new_column is not None:
              # Create a new column for replacements
              df[new_column] = df[column]
@@ -1435,15 +1453,15 @@ class dataprocessing:
          else:
              # Modify the existing column
              temp_column = column
-
-         # Apply substring replacements
-         for old, new in replacements.items():
-             df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
-
+
          # Optionally convert to lowercase
          if to_lower:
              df[temp_column] = df[temp_column].str.lower()
-
+
+         # Apply substring replacements
+         for old, new in replacements.items():
+             df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
+
          return df
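The reorder is behavioural, not cosmetic: lowercasing now runs before the substring replacements, so replacement keys only need to match lowercased text. An illustration (the enclosing method's name is not visible in this hunk, so `replace_substrings` is an assumed name):

    df = pd.DataFrame({"Channel": ["PaidSearch"]})
    # With to_lower=True the value is "paidsearch" when replacements run,
    # so a lowercase key matches; under the old order it would have missed.
    out = ims.replace_substrings(df, "Channel", {"paidsearch": "paid search"}, to_lower=True)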

      def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
@@ -1458,11 +1476,11 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The DataFrame with an added total column.
          """
-         # If exclude_col is provided, drop that column before summing
-         if exclude_col:
-             df[total_col_name] = df.drop(columns=[exclude_col]).sum(axis=1)
+         if exclude_col and exclude_col in df.columns:
+             # Ensure the column to exclude exists before dropping
+             df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
          else:
-             # Sum across all columns if exclude_col is not provided
+             # Sum across all columns if no column is specified to exclude
              df[total_col_name] = df.sum(axis=1)

          return df
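A one-line sketch with hypothetical columns; excluding a column that does not exist now falls through to the all-columns sum instead of raising inside `drop`:

    df = pd.DataFrame({"OBS": [1, 2], "TV": [100, 250], "Radio": [80, 60]})
    df = ims.add_total_column(df, exclude_col="OBS")   # Total = TV + Radio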
@@ -1502,7 +1520,7 @@ class dataprocessing:
          df[new_col_name] = df[column_name].apply(categorize_text)
          return df

-     def compare_overlap(self,df1, df2, date_col):
+     def compare_overlap(self, df1, df2, date_col):
          """
          Compare overlapping periods between two DataFrames and provide a summary of total differences.

@@ -1517,64 +1535,70 @@ class dataprocessing:
          # Ensure date columns are in datetime format
          df1[date_col] = pd.to_datetime(df1[date_col])
          df2[date_col] = pd.to_datetime(df2[date_col])
-
+
          # Determine the overlap period
          start_date = max(df1[date_col].min(), df2[date_col].min())
          end_date = min(df1[date_col].max(), df2[date_col].max())
-
-         # Filter dataframes to the overlapping period
+
+         # Filter DataFrames to the overlapping period
          df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
          df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
-
-         # Merge the dataframes on the date column to align data for comparison
+
+         # Merge the DataFrames on the date column
          merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
-
-         # Get the common columns between the two DataFrames, excluding the date column
+
+         # Get common columns, excluding the date column
          common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
-
-         # Initialize a list to collect total differences for each column
+
+         # Create a DataFrame for differences
+         diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+
          total_diff_list = []
-
-         # Create a DataFrame for the differences
-         diff_df = pd.DataFrame({date_col: merged_df[date_col]}) # Initialize diff_df with the date column
-
          for col in common_cols:
-             # Calculate the difference for each row
              diff_col = f'diff_{col}'
-             diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
-
-             # Calculate the total difference for the column and add it to the list
+             diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
+
+             # Sum differences for the column
              total_diff = diff_df[diff_col].sum()
              total_diff_list.append({'Column': col, 'Total Difference': total_diff})
-
-         # Create a DataFrame for the summary of total differences
+
+         # Create summary DataFrame
          total_diff_df = pd.DataFrame(total_diff_list)
-
-         # Apply formatting to the numerical columns
-         float_format = "{:,.2f}".format # Format to 2 decimal places with comma as thousand separator
-         diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
-         total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
-
+
          return diff_df, total_diff_df
-
-     def week_commencing_2_week_commencing_conversion(self,df,date_col,week_commencing='sun'):
+
+     def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
          """
-         Convert week commencing column in a DataFrame to the start of another day specified.
+         Convert a DataFrame's date column so that each date is mapped back
+         to the 'week_commencing' day of the *current ISO week*.

          Args:
-             df (pandas.DataFrame): The DataFrame containing the date-based data.
-             date_col (str): The name of the date column in the DataFrame.
-             week_commencing (str, optional): The day of the week that the week starts on ('sun' for Sunday, 'mon' for Monday, etc.). Defaults to 'sun'.
+             df (pandas.DataFrame): The DataFrame with date-based data.
+             date_col (str): The name of the date column.
+             week_commencing (str): The desired start of the week.
+                 ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
+                 Uses ISO day numbering (Mon=1, ..., Sun=7).

          Returns:
-             pandas.DataFrame: The original DataFrame with an additional column indicating the start of the week.
+             pandas.DataFrame: Original DataFrame with an extra column
+                 'week_start_<week_commencing>' containing the
+                 start-of-week date for each row.
          """
-         # Week commencing dictionary
-         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-         df['week_start_'+ week_commencing] = df[date_col].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+         # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
+         iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
+
+         target_day = iso_day_dict[week_commencing]
+
+         def map_to_week_start(date_val):
+             delta = (date_val.isoweekday() - target_day) % 7
+             return date_val - pd.Timedelta(days=delta)
+
+         # Apply the transformation
+         new_col = f"week_start_{week_commencing}"
+         df[new_col] = df[date_col].apply(map_to_week_start)

          return df
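The modular arithmetic is the core of the rewrite: `(isoweekday - target_day) % 7` is the number of days to step back to reach the chosen week start. A worked check on hypothetical dates:

    dates = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-10", "2024-01-14"])})  # Wed, Sun
    out = ims.week_commencing_2_week_commencing_conversion_isoweekday(dates, "OBS", week_commencing="mon")
    # Wed 2024-01-10: (3 - 1) % 7 = 2 days back -> 2024-01-08 (Monday)
    # Sun 2024-01-14: (7 - 1) % 7 = 6 days back -> 2024-01-08 (Monday)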
-
+
      def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
          """
          Plot various types of charts using Plotly.
@@ -1738,10 +1762,6 @@ class dataprocessing:
              fig = self.plot_two(df1, col1, df2, col2, date_column, same_axis=same_axis)
              figs.append(fig)

-         # Show all the figures
-         for fig in figs:
-             fig.show()
-
          return figs

      ########################################################################################################################################
@@ -1799,15 +1819,10 @@ class datapull:
          print(" - Description: Fetch and process historical weather data for the specified country.")
          print(" - Usage: pull_weather(week_commencing, country)")
          print(" - Example: pull_weather('mon', 'GBR')")
-
-         print("\n8. pull_covid_data")
-         print(" - Description: Get covid pandemic data for the country of interest.")
-         print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
-         print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")

      ############################################################### MACRO ##########################################################################

-     def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
+     def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
          '''
          Parameters
          ----------
@@ -1816,7 +1831,7 @@ class datapull:

          series_id_list : list[str]
              provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
-             ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]
+             ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]

          Returns
          ----------
@@ -1868,68 +1883,81 @@ class datapull:

          return fred_df_final

-     def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
+     def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
          """
          Fetch and process Bank of England interest rate data.

          Args:
-             week_commencing (str): The starting day of the week for aggregation.
-                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-                 Default is "sun".
-             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
+             week_commencing (str): The starting day of the week for aggregation.
+                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                 Default is "mon".
+             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
              delay (int): Delay in seconds between retry attempts. Default is 5.

          Returns:
-             pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
-                 The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+             pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                 The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                  and 'macro_boe_intr_rate' contains the average interest rate for the week.
          """
          # Week commencing dictionary
          day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-         # Function to fetch the data with retries
-         def fetch_data_with_retries(url, max_retries, delay):
-             for attempt in range(max_retries):
-                 try:
-                     html_table = pd.read_html(url)[0]
-                     return html_table
-                 except Exception as e:
-                     print(f"Attempt {attempt + 1} failed: {e}")
-                     if attempt < max_retries - 1:
-                         time.sleep(delay)
-                     else:
-                         raise
-
-         # Import HTML data from Bank of England rate
+
+         # URL of the Bank of England data page
          url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
-         html_table = fetch_data_with_retries(url, max_retries, delay)
-
-         df = pd.DataFrame(html_table)
+
+         # Retry logic for HTTP request
+         for attempt in range(max_retries):
+             try:
+                 # Set up headers to mimic a browser request
+                 headers = {
+                     "User-Agent": (
+                         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                         "Chrome/91.0.4472.124 Safari/537.36"
+                     )
+                 }
+                 response = requests.get(url, headers=headers)
+                 response.raise_for_status() # Raise an exception for HTTP errors
+                 break
+             except requests.exceptions.RequestException as e:
+                 print(f"Attempt {attempt + 1} failed: {e}")
+                 if attempt < max_retries - 1:
+                     time.sleep(delay)
+                 else:
+                     raise
+
+         # Parse the HTML page
+         soup = BeautifulSoup(response.content, "html.parser")
+
+         # Find the table on the page
+         table = soup.find("table") # Locate the first table
+         table_html = str(table) # Convert table to string
+         df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
+
+         # Rename and clean up columns
          df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
-
-         # Change date column to datetime and find the corresponding week to the date
          df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
-         df.sort_values("OBS", axis=0, inplace=True)
-
-         # Create a daily date range and find the week commencing for that day
-         date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
+         df.sort_values("OBS", inplace=True)
+
+         # Create a daily date range
+         date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
          df_daily = pd.DataFrame(date_range, columns=["OBS"])
-
+
          # Adjust each date to the specified week commencing day
-         df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-
-         # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
-         df_final = df_daily.merge(df, on='OBS', how="left")
-         df_final["macro_boe_intr_rate"].ffill(inplace=True)
-
-         # Group by the week start date and get the mean of the interest rates for each week
-         df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
-
-         df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
-         df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
-
-         return df_final
+         df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+             lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+         )
+
+         # Merge and forward-fill missing rates
+         df_daily = df_daily.merge(df, on="OBS", how="left")
+         df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
+
+         # Group by week commencing and calculate the average rate
+         df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
+         df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
+         df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)

+         return df_final
+
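A hedged usage sketch; live results depend on the Bank of England page keeping its current table layout, and the `datapull()` instantiation is an assumption:

    ims_pull = datapull()
    rates = ims_pull.pull_boe_data(week_commencing="mon")
    # 'OBS' holds week-commencing dates as dd/mm/yyyy strings;
    # 'macro_boe_intr_rate' holds each week's mean Bank Rate.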
      def pull_ons_data(self, series_list, week_commencing):
          """
          Fetch and process time series data from the ONS API.
@@ -2108,7 +2136,7 @@ class datapull:
                      break

              # get data for the next variable if url doesn't exist
-             if url_test == False:
+             if url_test is False:
                  continue

              root = ET.fromstring(data_response.content)
@@ -2173,7 +2201,7 @@ class datapull:

          return oecd_df_final

-     def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
+     def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
          """
          Fetch Google Mobility data for the specified country.

@@ -2193,7 +2221,7 @@ class datapull:

          # Load the CSV file into a pandas DataFrame
          csv_data = StringIO(response.text)
-         df = pd.read_csv(csv_data)
+         df = pd.read_csv(csv_data, low_memory=False)

          # Filter the DataFrame for the specified country
          country_df = df[df['country_region'] == country]
@@ -2339,10 +2367,10 @@ class datapull:

      def pull_weather(self, week_commencing, country) -> pd.DataFrame:
          import pandas as pd
-         import urllib.request
+         import urllib.request # noqa: F811
          from datetime import datetime
          import requests
-         from geopy.geocoders import Nominatim
+         from geopy.geocoders import Nominatim # noqa: F811

          # Week commencing dictionary
          day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
@@ -2938,37 +2966,4 @@ class datapull:

          final_weather = ims_proc.rename_cols(merged_df, 'seas_')

-         return final_weather
-
-     def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
-         """
-         Get covid pandemic data for the country of interest.
-
-         Args:
-             folder_path (str): A string containing the local location of the OneDrive folder.
-                 Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
-                 The file location within the MasterDrive of the worldwide covid data is:
-                 MasterDrive/Central Database/Covid/oxford-government-response.csv
-             country (str): A string containing the country of interest (E.g: "GB", "FR")
-             week_commencing (str): The starting day of the week for aggregation.
-                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-
-         Returns:
-             pd.DataFrame: A DataFrame containing seasonality and public holiday dummies for the country of interest.
-                 The 'OBS' column contains the week commencing dates.
-         """
-
-         df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
-
-         country_df = df[df['location_key']==country]
-         country_df.rename(columns={'date': 'OBS'}, inplace=True)
-         country_df.drop('location_key', axis=1, inplace=True)
-
-         agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
-
-         covid_df = ims_proc.rename_cols(agg_df, 'covid_')
-
-         covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
-
-         return covid_df
-
+         return final_weather