imsciences 0.6.3.0__py3-none-any.whl → 0.6.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +2 -1
- imsciences/datafunctions.py +225 -230
- imsciences/unittesting.py +1335 -0
- imsciences-0.6.3.2.dist-info/METADATA +383 -0
- {imsciences-0.6.3.0.dist-info → imsciences-0.6.3.2.dist-info}/RECORD +8 -7
- {imsciences-0.6.3.0.dist-info → imsciences-0.6.3.2.dist-info}/WHEEL +1 -1
- imsciences-0.6.3.0.dist-info/METADATA +0 -24
- {imsciences-0.6.3.0.dist-info → imsciences-0.6.3.2.dist-info}/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.6.3.0.dist-info → imsciences-0.6.3.2.dist-info}/top_level.txt +0 -0
imsciences/datafunctions.py
CHANGED
@@ -6,20 +6,19 @@ import plotly.graph_objs as go
 import numpy as np
 import datetime
 import re
-import pandas as pd
 from fredapi import Fred
 import time
-from datetime import datetime,timedelta
-from cif import cif
+from datetime import datetime, timedelta # noqa: F811
 from io import StringIO
 import urllib
-import requests_cache
-import urllib.request
+import requests_cache # noqa: F401
+import urllib.request # noqa: F401
 import requests
-from geopy.geocoders import Nominatim
+from geopy.geocoders import Nominatim # noqa: F401
 import subprocess
 import json
 import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
 
 class dataprocessing:
 
@@ -391,7 +390,7 @@ class dataprocessing:
             # Divide each numeric value by the number of days in the month
             for col in df.columns:
                 if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-                    if divide
+                    if divide is True:
                         daily_row[col] = row[col] / num_days
                     else:
                         daily_row[col] = row[col]
@@ -678,7 +677,7 @@ class dataprocessing:
 
         return combined_df
 
-    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc=
+    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
         """
         Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
 
@@ -701,53 +700,57 @@ class dataprocessing:
             pandas.DataFrame: The pivot table specified
         """
 
-        #
-        if
-
-
+        # Validate inputs
+        if index_col not in df.columns:
+            raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+        if columns not in df.columns:
+            raise ValueError(f"columns '{columns}' not found in DataFrame.")
+        if values_col not in df.columns:
+            raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+        # Apply filters if provided
+        if filters_dict:
             df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-
-
-
-        # If datetime transformation is needed
+        else:
+            df_filtered = df.copy()
+
+        # Ensure index column is in datetime format if needed
         if datetime_trans_needed:
             df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
+
         # Create the pivot table
-        pivoted_df = df_filtered.pivot_table(
-
-
-
-
-
-
-
+        pivoted_df = df_filtered.pivot_table(
+            index=index_col,
+            columns=columns,
+            values=values_col,
+            aggfunc=aggfunc,
+            margins=margins,
+            margins_name=margins_name,
+        )
+
+        # Handle column headers
+        if isinstance(pivoted_df.columns, pd.MultiIndex):
+            pivoted_df.columns = [
+                "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+                for col in pivoted_df.columns.values
+            ]
         else:
-
-
-
-
-
-
-            pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
-
-        # Reset the pivot before returning
-        pivoted_df = pivoted_df.reset_index()
-
-        # Sort by index column from oldest to newest
+            pivoted_df.columns = pivoted_df.columns.map(str)
+
+        # Reset the index
+        pivoted_df.reset_index(inplace=True)
+
+        # Handle sorting and formatting of index column
         if datetime_trans_needed:
-            pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col])
-            pivoted_df
-
-
-
-
-
-
-
-        # If there is a need to fill in missing weeks
-        if fill_missing_weekly_dates == True:
+            pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+            pivoted_df.sort_values(by=index_col, inplace=True)
+            pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+        # Fill missing values
+        pivoted_df.fillna(fill_value, inplace=True)
+
+        # Fill missing weekly dates if specified
+        if fill_missing_weekly_dates:
             pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
 
         return pivoted_df
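
For orientation, a minimal usage sketch of the reworked `pivot_table` above; the sample frame, column names, and import path are illustrative assumptions, not part of this diff:

```python
import pandas as pd
from imsciences.datafunctions import dataprocessing  # import path inferred from this file

dp = dataprocessing()

# Hypothetical long-format data: one row per date/channel pair
df = pd.DataFrame({
    "obs": ["01/01/2024", "01/01/2024", "08/01/2024"],
    "channel": ["tv", "radio", "tv"],
    "spend": [100.0, 50.0, 80.0],
})

# Pivot to one row per date and one column per channel, summing spend
pivoted = dp.pivot_table(
    df,
    index_col="obs",
    columns="channel",
    values_col="spend",
    aggfunc="sum",
    date_format="%Y-%m-%d",
)
print(pivoted)  # columns: obs, radio, tv; missing combinations filled with fill_value (0)
```
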
@@ -983,7 +986,7 @@ class dataprocessing:
 
         return df
 
-    def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
+    def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
         """
         This function updates values in a specified column of the DataFrame based on a lookup dictionary.
         It first merges several columns into a new 'Merged' column, then uses this merged column to determine
@@ -1000,8 +1003,10 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The modified DataFrame with updated values in the specified column.
         """
+        # Create a merged column from specified columns
         df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
 
+        # Replace values in the specified column based on the lookup
         def replace_values(x):
             if x[col] == replacement_rows:
                 merged_value = x['Merged']
@@ -1009,10 +1014,14 @@ class dataprocessing:
                     return replacement_lookup_dict[merged_value]
             return x[col]
 
+        # Apply replacement logic
         df[output_column_name] = df.apply(replace_values, axis=1)
 
+        # Drop the intermediate 'Merged' column
+        df.drop(columns=['Merged'], inplace=True)
+
         return df
-
+
     def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
         """
         Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
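
A short, hypothetical call to `keyword_lookup_replacement` as documented above; the data and lookup values are made up to show the '|'-joined key:

```python
import pandas as pd
from imsciences.datafunctions import dataprocessing  # import path inferred from this file

dp = dataprocessing()
df = pd.DataFrame({
    "channel": ["unknown", "tv"],
    "region": ["uk", "us"],
    "product": ["a", "b"],
})

# Rows where channel == "unknown" are re-mapped via the merged "region|product" key
updated = dp.keyword_lookup_replacement(
    df,
    col="channel",
    replacement_rows="unknown",
    cols_to_merge=["region", "product"],
    replacement_lookup_dict={"uk|a": "search"},
    output_column_name="channel_updated",
)
# channel_updated -> ["search", "tv"]; the helper 'Merged' column is dropped afterwards
```
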
@@ -1049,35 +1058,38 @@ class dataprocessing:
 
         return df_final
 
-    def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
+    def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
         """
         Changes a dataframe from wide to long format.
 
         Args:
             df (pandas.DataFrame): The DataFrame containing the data.
-            value_cols (list of str or str if only one):
-            variable_col_name (str, optional): Name of new
-            value_col_name (str, optional): Name of the new value column
+            value_cols (list of str or str if only one): List of column names to transform from several columns into one.
+            variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+            value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.
 
         Returns:
-            pandas.DataFrame
-
+            pandas.DataFrame: DataFrame transformed from wide to long format.
+
         Raises:
-            ValueError: If number of
+            ValueError: If the number of columns to depivot is less than 2.
         """
-
-        # Check length of value cols is greater than 1
+        # Check length of value_cols is greater than 1
         if len(value_cols) < 2:
             raise ValueError("Number of inputs in list must be greater than 1")
-
+
         # Find the columns that are not to be depivoted into one column
-        id_vars =
-
+        id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
+
         # Melt all columns chosen into one column
-        df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
+        df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+        # Sort column order to match expected output
+        ordered_columns = id_vars + [variable_col_name, value_col_name]
+        df_final = df_final[ordered_columns]
+
         return df_final
-
+
     def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
         """
         Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe
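
A small illustrative call to the tidied `convert_df_wide_2_long` (column names are assumptions):

```python
import pandas as pd
from imsciences.datafunctions import dataprocessing  # import path inferred from this file

dp = dataprocessing()
wide = pd.DataFrame({
    "obs": ["2024-01-01", "2024-01-08"],
    "tv": [100, 80],
    "radio": [50, 40],
})

# Stack the two spend columns into Channel/Spend pairs; 'obs' is kept as an id column
long = dp.convert_df_wide_2_long(wide, value_cols=["tv", "radio"],
                                 variable_col_name="Channel", value_col_name="Spend")
# Resulting columns: obs, Channel, Spend (4 rows)
```
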
@@ -1102,18 +1114,24 @@ class dataprocessing:
         Returns:
             pandas.DataFrame: Dataframe with manual changes added
         """
+
         # Raise type error if more than one col is supported
         if isinstance(col_to_change, list):
             raise TypeError("Col to change must be specified as a string, not a list")
-
+
         # Raises value error if input is invalid for change_in_existing_df_col
         if change_in_existing_df_col not in ["Yes", "No"]:
             raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
-
+
         # Raises value error if input is invalid for add_notes_col
         if add_notes not in ["Yes", "No"]:
             raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
 
+        # Validate filters_dict format
+        for col, cond in filters_dict.items():
+            if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+                raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
         # Create the filtered df by applying the conditions
         df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
 
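
The new guard only checks the shape of each condition string (it must be a string that splits into an operator token plus a value); the actual filtering grammar lives in `filter_df_on_multiple_conditions`, which this diff does not show. A sketch of what passes and fails that shape check:

```python
def passes_shape_check(cond) -> bool:
    # Mirrors the added validation: must be a string of the form "<operator> <value>"
    return isinstance(cond, str) and len(cond.split(maxsplit=1)) >= 2

assert passes_shape_check(">= 100")
assert passes_shape_check("== tv")
assert not passes_shape_check("tv")   # no space-separated value -> ValueError in manually_edit_data
assert not passes_shape_check(100)    # not a string -> ValueError in manually_edit_data
```
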
@@ -1122,7 +1140,7 @@ class dataprocessing:
         if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
             df = df.copy()
             df[new_col_to_change_name] = df[col_to_change]
-
+
         # Update the new cell in the chosen column
         df.loc[df_filtered.index, col_to_update] = new_value
 
@@ -1146,32 +1164,32 @@ class dataprocessing:
 
     def format_numbers_with_commas(self, df, decimal_length_chosen=2):
         """
-        Converts data in numerical format into numbers with commas and a chosen decimal place length
+        Converts data in numerical format into numbers with commas and a chosen decimal place length.
 
         Args:
             df (pandas.DataFrame): The DataFrame containing the data.
-            decimal_length_chosen (int, optional):
-
+            decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
         Returns:
-            pandas.DataFrame: The
+            pandas.DataFrame: The DataFrame with the chosen updated format.
         """
         def format_number_with_commas(x, decimal_length=decimal_length_chosen):
-            if
+            if pd.isna(x): # Preserve None/NaN values
+                return pd.NA # Explicitly normalize to pd.NA
+            elif isinstance(x, (int, float)):
                 if decimal_length is not None:
-                    format_str = "{:,.{}f}"
-
+                    format_str = f"{{:,.{decimal_length}f}}"
+                    return format_str.format(x)
                 else:
-
-                    return formatted_number
+                    return f"{x:,}"
             else:
                 return x # Return unchanged if not a number
 
-
-
-        formatted_df = df.applymap(format_number_with_commas)
+        # Apply formatting column by column
+        formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)
 
         return formatted_df
-
+
     def filter_df_on_multiple_conditions(self, df, filters_dict):
         """
         Filter a dataframe based on mulitple conditions
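
A quick sketch of the updated `format_numbers_with_commas` (sample values are illustrative); per the new code, missing values are normalized to `pd.NA` rather than formatted:

```python
import pandas as pd
from imsciences.datafunctions import dataprocessing  # import path inferred from this file

dp = dataprocessing()
df = pd.DataFrame({"spend": [1234.5678, 99.1], "clicks": [1000000, 2500]})

formatted = dp.format_numbers_with_commas(df, decimal_length_chosen=2)
# spend  -> "1,234.57", "99.10"
# clicks -> "1,000,000.00", "2,500.00"
```
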
@@ -1269,7 +1287,6 @@ class dataprocessing:
         """
 
         #This line removes zero values from given column
-
         return data_frame.loc[~(data_frame[column_to_filter] ==0)]
 
     def upgrade_outdated_packages(self):
@@ -1392,10 +1409,10 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
         """
-
+
         # If there is no date column
         if date_col is None:
-            df = df.
+            df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
 
             if add_total_dummy_col != 'No':
                 # Find max value of rows
@@ -1403,8 +1420,10 @@ class dataprocessing:
 
         # If there is a date column
         else:
-            # Create dummies
-            df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].
+            # Create dummies for all columns except the date column
+            df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
+                lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+            )
 
             if add_total_dummy_col != 'No':
                 # Find max value of rows
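
The dummy transformation added above maps each value to 1 when it exceeds `dummy_threshold`, otherwise 0; a standalone one-liner showing the same expression on a plain frame (names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"promo_a": [0, 5, 12], "promo_b": [3, 0, 0]})
dummy_threshold = 0

dummies = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
# promo_a -> [0, 1, 1]; promo_b -> [1, 0, 0]
```
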
@@ -1427,7 +1446,6 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
         """
-
         if new_column is not None:
             # Create a new column for replacements
             df[new_column] = df[column]
@@ -1435,15 +1453,15 @@ class dataprocessing:
         else:
             # Modify the existing column
             temp_column = column
-
-        # Apply substring replacements
-        for old, new in replacements.items():
-            df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
-
+
         # Optionally convert to lowercase
         if to_lower:
             df[temp_column] = df[temp_column].str.lower()
-
+
+        # Apply substring replacements
+        for old, new in replacements.items():
+            df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
+
         return df
 
     def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
@@ -1458,11 +1476,11 @@ class dataprocessing:
         Returns:
             pd.DataFrame: The DataFrame with an added total column.
         """
-
-
-            df[total_col_name] = df.drop(columns=[exclude_col]).sum(axis=1)
+        if exclude_col and exclude_col in df.columns:
+            # Ensure the column to exclude exists before dropping
+            df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
         else:
-            # Sum across all columns if
+            # Sum across all columns if no column is specified to exclude
             df[total_col_name] = df.sum(axis=1)
 
         return df
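
An illustrative call to `add_total_column` with the new exclude-column guard (data is made up):

```python
import pandas as pd
from imsciences.datafunctions import dataprocessing  # import path inferred from this file

dp = dataprocessing()
df = pd.DataFrame({"obs": ["2024-01-01"], "tv": [100], "radio": [50]})

# 'obs' is left out of the row-wise sum, so Total == 150
with_total = dp.add_total_column(df, exclude_col="obs")
```
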
@@ -1502,7 +1520,7 @@ class dataprocessing:
         df[new_col_name] = df[column_name].apply(categorize_text)
         return df
 
-    def compare_overlap(self,df1, df2, date_col):
+    def compare_overlap(self, df1, df2, date_col):
         """
         Compare overlapping periods between two DataFrames and provide a summary of total differences.
 
@@ -1517,64 +1535,70 @@ class dataprocessing:
         # Ensure date columns are in datetime format
         df1[date_col] = pd.to_datetime(df1[date_col])
         df2[date_col] = pd.to_datetime(df2[date_col])
-
+
         # Determine the overlap period
         start_date = max(df1[date_col].min(), df2[date_col].min())
         end_date = min(df1[date_col].max(), df2[date_col].max())
-
-        # Filter
+
+        # Filter DataFrames to the overlapping period
         df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
         df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
-
-        # Merge the
+
+        # Merge the DataFrames on the date column
         merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
-
-        # Get
+
+        # Get common columns, excluding the date column
         common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
-
-        #
+
+        # Create a DataFrame for differences
+        diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+
         total_diff_list = []
-
-        # Create a DataFrame for the differences
-        diff_df = pd.DataFrame({date_col: merged_df[date_col]}) # Initialize diff_df with the date column
-
         for col in common_cols:
-            # Calculate the difference for each row
             diff_col = f'diff_{col}'
-            diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
-
-            #
+            diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
+
+            # Sum differences for the column
             total_diff = diff_df[diff_col].sum()
             total_diff_list.append({'Column': col, 'Total Difference': total_diff})
-
-        # Create
+
+        # Create summary DataFrame
         total_diff_df = pd.DataFrame(total_diff_list)
-
-        # Apply formatting to the numerical columns
-        float_format = "{:,.2f}".format # Format to 2 decimal places with comma as thousand separator
-        diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
-        total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
-
+
         return diff_df, total_diff_df
-
-    def
+
+    def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
         """
-        Convert
+        Convert a DataFrame's date column so that each date is mapped back
+        to the 'week_commencing' day of the *current ISO week*.
 
         Args:
-            df (pandas.DataFrame): The DataFrame
-            date_col (str): The name of the date column
-            week_commencing (str
+            df (pandas.DataFrame): The DataFrame with date-based data.
+            date_col (str): The name of the date column.
+            week_commencing (str): The desired start of the week.
+                ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
+                Uses ISO day numbering (Mon=1, ..., Sun=7).
 
         Returns:
-            pandas.DataFrame:
+            pandas.DataFrame: Original DataFrame with an extra column
+                'week_start_<week_commencing>' containing the
+                start-of-week date for each row.
         """
-        #
-
-
+        # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
+        iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
+
+        target_day = iso_day_dict[week_commencing]
+
+        def map_to_week_start(date_val):
+            delta = (date_val.isoweekday() - target_day) % 7
+            return date_val - pd.Timedelta(days=delta)
+
+        # Apply the transformation
+        new_col = f"week_start_{week_commencing}"
+        df[new_col] = df[date_col].apply(map_to_week_start)
 
         return df
-
+
     def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
         """
         Plot various types of charts using Plotly.
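
A minimal sketch of the new ISO week-commencing helper (dates chosen to show a mid-week day and a Sunday both mapping back to Monday):

```python
import pandas as pd
from imsciences.datafunctions import dataprocessing  # import path inferred from this file

dp = dataprocessing()
df = pd.DataFrame({"obs": pd.to_datetime(["2024-01-03", "2024-01-07"])})  # Wed, Sun

out = dp.week_commencing_2_week_commencing_conversion_isoweekday(df, "obs", week_commencing="mon")
# Both rows get 2024-01-01 (the Monday of their ISO week) in the new 'week_start_mon' column
```
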
@@ -1738,10 +1762,6 @@ class dataprocessing:
             fig = self.plot_two(df1, col1, df2, col2, date_column, same_axis=same_axis)
             figs.append(fig)
 
-        # Show all the figures
-        for fig in figs:
-            fig.show()
-
         return figs
 
 ########################################################################################################################################
@@ -1799,15 +1819,10 @@ class datapull:
         print(" - Description: Fetch and process historical weather data for the specified country.")
         print(" - Usage: pull_weather(week_commencing, country)")
         print(" - Example: pull_weather('mon', 'GBR')")
-
-        print("\n8. pull_covid_data")
-        print(" - Description: Get covid pandemic data for the country of interest.")
-        print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
-        print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")
 
     ############################################################### MACRO ##########################################################################
 
-    def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"
+    def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
         '''
         Parameters
         ----------
@@ -1816,7 +1831,7 @@ class datapull:
 
         series_id_list : list[str]
             provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
-            ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"
+            ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
 
         Returns
         ----------
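
A hedged call sketch for `pull_fred_data` based on the signature above; it fetches live FRED series, so network access and whatever API-key handling the package does internally are assumed:

```python
from imsciences.datafunctions import datapull  # import path inferred from this file

pull = datapull()
fred_df = pull.pull_fred_data(week_commencing="mon", series_id_list=["GPDIC1", "GCEC1"])
# Returns the requested FRED series aggregated to the chosen week-commencing day
```
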
@@ -1868,68 +1883,81 @@ class datapull:
 
         return fred_df_final
 
-    def pull_boe_data(self, week_commencing="mon", max_retries=
+    def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
         """
         Fetch and process Bank of England interest rate data.
 
         Args:
-            week_commencing (str): The starting day of the week for aggregation.
-                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-                Default is "
-            max_retries (int): Maximum number of retries to fetch data in case of failure. Default is
+            week_commencing (str): The starting day of the week for aggregation.
+                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                Default is "mon".
+            max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
             delay (int): Delay in seconds between retry attempts. Default is 5.
 
         Returns:
-            pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
-                The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+            pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                 and 'macro_boe_intr_rate' contains the average interest rate for the week.
         """
         # Week commencing dictionary
         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-        #
-        def fetch_data_with_retries(url, max_retries, delay):
-            for attempt in range(max_retries):
-                try:
-                    html_table = pd.read_html(url)[0]
-                    return html_table
-                except Exception as e:
-                    print(f"Attempt {attempt + 1} failed: {e}")
-                    if attempt < max_retries - 1:
-                        time.sleep(delay)
-                    else:
-                        raise
-
-        # Import HTML data from Bank of England rate
+
+        # URL of the Bank of England data page
         url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
-
-
-
+
+        # Retry logic for HTTP request
+        for attempt in range(max_retries):
+            try:
+                # Set up headers to mimic a browser request
+                headers = {
+                    "User-Agent": (
+                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/91.0.4472.124 Safari/537.36"
+                    )
+                }
+                response = requests.get(url, headers=headers)
+                response.raise_for_status() # Raise an exception for HTTP errors
+                break
+            except requests.exceptions.RequestException as e:
+                print(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(delay)
+                else:
+                    raise
+
+        # Parse the HTML page
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Find the table on the page
+        table = soup.find("table") # Locate the first table
+        table_html = str(table) # Convert table to string
+        df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
+
+        # Rename and clean up columns
         df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
-
-        # Change date column to datetime and find the corresponding week to the date
         df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
-        df.sort_values("OBS",
-
-        # Create a daily date range
-        date_range = pd.date_range(df["OBS"].
+        df.sort_values("OBS", inplace=True)
+
+        # Create a daily date range
+        date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
         df_daily = pd.DataFrame(date_range, columns=["OBS"])
-
+
         # Adjust each date to the specified week commencing day
-        df_daily[
-
-
-
-
-
-
-
-
-        df_final
-        df_final.
-
-        return df_final
+        df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+            lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+        )
+
+        # Merge and forward-fill missing rates
+        df_daily = df_daily.merge(df, on="OBS", how="left")
+        df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
+
+        # Group by week commencing and calculate the average rate
+        df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
+        df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
+        df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
 
+        return df_final
+
     def pull_ons_data(self, series_list, week_commencing):
         """
         Fetch and process time series data from the ONS API.
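
A hedged usage sketch of the rewritten `pull_boe_data`; it performs a live HTTP request to the Bank of England page, so results depend on that site being reachable:

```python
from imsciences.datafunctions import datapull  # import path inferred from this file

pull = datapull()
boe = pull.pull_boe_data(week_commencing="mon")
# 'OBS' holds week-commencing dates as 'dd/mm/yyyy';
# 'macro_boe_intr_rate' is the average Bank Rate for each week
```
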
@@ -2108,7 +2136,7 @@ class datapull:
                     break
 
             # get data for the next variable if url doesn't exist
-            if url_test
+            if url_test is False:
                 continue
 
             root = ET.fromstring(data_response.content)
@@ -2173,7 +2201,7 @@ class datapull:
 
         return oecd_df_final
 
-    def get_google_mobility_data(self, country
+    def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
         """
         Fetch Google Mobility data for the specified country.
 
@@ -2193,7 +2221,7 @@ class datapull:
 
         # Load the CSV file into a pandas DataFrame
         csv_data = StringIO(response.text)
-        df = pd.read_csv(csv_data)
+        df = pd.read_csv(csv_data, low_memory=False)
 
         # Filter the DataFrame for the specified country
         country_df = df[df['country_region'] == country]
@@ -2339,10 +2367,10 @@ class datapull:
 
     def pull_weather(self, week_commencing, country) -> pd.DataFrame:
         import pandas as pd
-        import urllib.request
+        import urllib.request # noqa: F811
         from datetime import datetime
         import requests
-        from geopy.geocoders import Nominatim
+        from geopy.geocoders import Nominatim # noqa: F811
 
         # Week commencing dictionary
         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
@@ -2938,37 +2966,4 @@ class datapull:
 
         final_weather = ims_proc.rename_cols(merged_df, 'seas_')
 
-        return final_weather
-
-    def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
-        """
-        Get covid pandemic data for the country of interest.
-
-        Args:
-            folder_path (str): A string containing the local location of the OneDrive folder.
-                Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
-                The file location within the MasterDrive of the worldwide covid data is:
-                MasterDrive/Central Database/Covid/oxford-government-response.csv
-            country (str): A string containing the country of interest (E.g: "GB", "FR")
-            week_commencing (str): The starting day of the week for aggregation.
-                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-
-        Returns:
-            pd.DataFrame: A DataFrame containing seasonality and public holiday dummies for the country of interest.
-                The 'OBS' column contains the week commencing dates.
-        """
-
-        df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
-
-        country_df = df[df['location_key']==country]
-        country_df.rename(columns={'date': 'OBS'}, inplace=True)
-        country_df.drop('location_key', axis=1, inplace=True)
-
-        agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
-
-        covid_df = ims_proc.rename_cols(agg_df, 'covid_')
-
-        covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
-
-        return covid_df
-
+        return final_weather