imsciences 0.6.3.1-py3-none-any.whl → 0.6.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,20 +6,19 @@ import plotly.graph_objs as go
  import numpy as np
  import datetime
  import re
- import pandas as pd
  from fredapi import Fred
  import time
- from datetime import datetime,timedelta
- from cif import cif
+ from datetime import datetime, timedelta # noqa: F811
  from io import StringIO
  import urllib
- import requests_cache
- import urllib.request
+ import requests_cache # noqa: F401
+ import urllib.request # noqa: F401
  import requests
- from geopy.geocoders import Nominatim
+ from geopy.geocoders import Nominatim # noqa: F401
  import subprocess
  import json
  import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup

  class dataprocessing:

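Note: the added `# noqa` markers match how these imports behave. F401 ("module imported but unused") keeps `requests_cache` and `urllib.request` importable, presumably for side effects or downstream use, and F811 ("redefinition of unused name") acknowledges that `from datetime import datetime, timedelta` deliberately shadows the earlier `import datetime`. A minimal sketch of the shadowing F811 flags:

    import datetime                            # binds the module
    from datetime import datetime, timedelta  # rebinds 'datetime' to the class (flake8 F811)

    print(type(datetime))  # <class 'type'> -- the class now wins over the module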
@@ -391,7 +390,7 @@ class dataprocessing:
              # Divide each numeric value by the number of days in the month
              for col in df.columns:
                  if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-                     if divide == True:
+                     if divide is True:
                          daily_row[col] = row[col] / num_days
                      else:
                          daily_row[col] = row[col]
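Note: `divide is True` is stricter than the old `divide == True`; identity only matches the `True` singleton, so truthy non-bool arguments now fall through to the else branch:

    divide = 1
    print(divide == True)  # True  -> old code divided
    print(divide is True)  # False -> new code copies the value unchanged

If that tightening is unintended, plain `if divide:` is the usual spelling.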
@@ -678,7 +677,7 @@ class dataprocessing:

          return combined_df

-     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False,fill_missing_weekly_dates=False,week_commencing='W-MON'):
+     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
          """
          Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

@@ -701,53 +700,57 @@ class dataprocessing:
              pandas.DataFrame: The pivot table specified
          """

-         # Create the filtered df by applying the conditions
-         if filters_dict is None:
-             df_filtered = df
-         else:
+         # Validate inputs
+         if index_col not in df.columns:
+             raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+         if columns not in df.columns:
+             raise ValueError(f"columns '{columns}' not found in DataFrame.")
+         if values_col not in df.columns:
+             raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+         # Apply filters if provided
+         if filters_dict:
              df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-         # Ensure index column is in datetime format for proper sorting
-         df_filtered = df_filtered.copy()
-
-         # If datetime transformation is needed
+         else:
+             df_filtered = df.copy()
+
+         # Ensure index column is in datetime format if needed
          if datetime_trans_needed:
              df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
+
          # Create the pivot table
-         pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)
-
-         # Handling MultiIndex columns if present, making them a flat structure
-         if not reverse_header_order:
-             if isinstance(pivoted_df.columns, pd.MultiIndex):
-                 pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
-             else:
-                 pivoted_df.columns = pivoted_df.columns.map(str)
+         pivoted_df = df_filtered.pivot_table(
+             index=index_col,
+             columns=columns,
+             values=values_col,
+             aggfunc=aggfunc,
+             margins=margins,
+             margins_name=margins_name,
+         )
+
+         # Handle column headers
+         if isinstance(pivoted_df.columns, pd.MultiIndex):
+             pivoted_df.columns = [
+                 "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+                 for col in pivoted_df.columns.values
+             ]
          else:
-             if isinstance(pivoted_df.columns, pd.MultiIndex):
-                 # Reorder the MultiIndex columns
-                 pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
-             else:
-                 pivoted_df.columns = pivoted_df.columns.map(str)
-                 # Reverse the order for single index columns
-                 pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
-
-         # Reset the pivot before returning
-         pivoted_df = pivoted_df.reset_index()
-
-         # Sort by index column from oldest to newest
+             pivoted_df.columns = pivoted_df.columns.map(str)
+
+         # Reset the index
+         pivoted_df.reset_index(inplace=True)
+
+         # Handle sorting and formatting of index column
          if datetime_trans_needed:
-             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
-             pivoted_df = pivoted_df.sort_values(by=index_col)
-
-             # Convert index column back to a string in YYYY-MM-DD format for display purposes
-             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
-
-         # Fill in any NaNs
-         pivoted_df = pivoted_df.fillna(fill_value)
-
-         # If there is a need to fill in missing weeks
-         if fill_missing_weekly_dates == True:
+             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+             pivoted_df.sort_values(by=index_col, inplace=True)
+             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+         # Fill missing values
+         pivoted_df.fillna(fill_value, inplace=True)
+
+         # Fill missing weekly dates if specified
+         if fill_missing_weekly_dates:
              pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)

          return pivoted_df
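Note: a hedged sketch of the reworked signature (the DataFrame and column names are hypothetical), showing the new `date_format` parameter:

    dp = dataprocessing()
    wide = dp.pivot_table(
        df,
        index_col="OBS",
        columns="Channel",
        values_col="Spend",
        date_format="%d/%m/%Y",  # new in 0.6.3.2
    )

One caution on the new header flattening: `reversed(map(str, col))` raises `TypeError: argument to reversed() must be a sequence`, because map objects do not support `reversed()`. With `reverse_header_order=True` on MultiIndex columns, something like `"_".join(reversed([str(c) for c in col]))` would be needed.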
@@ -983,7 +986,7 @@ class dataprocessing:

          return df

-     def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
+     def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
          """
          This function updates values in a specified column of the DataFrame based on a lookup dictionary.
          It first merges several columns into a new 'Merged' column, then uses this merged column to determine
@@ -1000,8 +1003,10 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The modified DataFrame with updated values in the specified column.
          """
+         # Create a merged column from specified columns
          df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)

+         # Replace values in the specified column based on the lookup
          def replace_values(x):
              if x[col] == replacement_rows:
                  merged_value = x['Merged']
@@ -1009,10 +1014,14 @@ class dataprocessing:
                      return replacement_lookup_dict[merged_value]
              return x[col]

+         # Apply replacement logic
          df[output_column_name] = df.apply(replace_values, axis=1)

+         # Drop the intermediate 'Merged' column
+         df.drop(columns=['Merged'], inplace=True)
+
          return df
-
+
      def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
          """
          Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
@@ -1049,35 +1058,38 @@ class dataprocessing:

          return df_final

-     def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
+     def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
          """
          Changes a dataframe from wide to long format.

          Args:
              df (pandas.DataFrame): The DataFrame containing the data.
-             value_cols (list of str or str if only one): list of column names which are to be transformed from several columns into one.
-             variable_col_name (str, optional): Name of new variables column, which contains the names of the columns which have been stacked into one. Defaults to 'Stacked'.
-             value_col_name (str, optional): Name of the new value column which contains all the data from the stacked columns. Defaults to 'Value'.
+             value_cols (list of str or str if only one): List of column names to transform from several columns into one.
+             variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+             value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.

          Returns:
-             pandas.DataFrame:: Returns dataframe transformed from long to wide.
-
+             pandas.DataFrame: DataFrame transformed from wide to long format.
+
          Raises:
-             ValueError: If number of column names to be depivoted is less than 2, then this function is not neccesary.
+             ValueError: If the number of columns to depivot is less than 2.
          """
-
-         # Check length of value cols is greater than 1
+         # Check length of value_cols is greater than 1
          if len(value_cols) < 2:
              raise ValueError("Number of inputs in list must be greater than 1")
-
+
          # Find the columns that are not to be depivoted into one column
-         id_vars = list(set(df.columns.tolist()) - set(value_cols))
-
+         id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
+
          # Melt all columns chosen into one column
-         df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
+         df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+         # Sort column order to match expected output
+         ordered_columns = id_vars + [variable_col_name, value_col_name]
+         df_final = df_final[ordered_columns]
+
          return df_final
-
+
      def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
          """
          Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe
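Note: replacing the set difference with a list comprehension makes `id_vars`, and therefore the output column order, deterministic. A small sketch with hypothetical data:

    df = pd.DataFrame({"OBS": ["2024-01-01"], "TV": [100], "Radio": [40]})
    long_df = dataprocessing().convert_df_wide_2_long(df, ["TV", "Radio"])
    # Columns come out as ['OBS', 'Stacked', 'Value'], following DataFrame order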
@@ -1102,18 +1114,24 @@ class dataprocessing:
          Returns:
              pandas.DataFrame: Dataframe with manual changes added
          """
+
          # Raise type error if more than one col is supported
          if isinstance(col_to_change, list):
              raise TypeError("Col to change must be specified as a string, not a list")
-
+
          # Raises value error if input is invalid for change_in_existing_df_col
          if change_in_existing_df_col not in ["Yes", "No"]:
              raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
-
+
          # Raises value error if input is invalid for add_notes_col
          if add_notes not in ["Yes", "No"]:
              raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")

+         # Validate filters_dict format
+         for col, cond in filters_dict.items():
+             if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+                 raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
          # Create the filtered df by applying the conditions
          df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

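Note: the new guard assumes each filter condition is a string of the form 'operator value' (at least two whitespace-separated tokens); the exact operator grammar is defined by `filter_df_on_multiple_conditions`, which this diff does not show. A hypothetical example that passes validation:

    filters_dict = {"Spend": ">= 100", "Channel": "== 'TV'"}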
@@ -1122,7 +1140,7 @@ class dataprocessing:
          if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
              df = df.copy()
              df[new_col_to_change_name] = df[col_to_change]
-
+
          # Update the new cell in the chosen column
          df.loc[df_filtered.index, col_to_update] = new_value

@@ -1146,32 +1164,32 @@ class dataprocessing:

      def format_numbers_with_commas(self, df, decimal_length_chosen=2):
          """
-         Converts data in numerical format into numbers with commas and a chosen decimal place length
+         Converts data in numerical format into numbers with commas and a chosen decimal place length.

          Args:
              df (pandas.DataFrame): The DataFrame containing the data.
-             decimal_length_chosen (int, optional): _description_. Defaults to 2.
-
+             decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
          Returns:
-             pandas.DataFrame: The dataframe with the chosen updated format
+             pandas.DataFrame: The DataFrame with the chosen updated format.
          """
          def format_number_with_commas(x, decimal_length=decimal_length_chosen):
-             if isinstance(x, (int, float)):
+             if pd.isna(x): # Preserve None/NaN values
+                 return pd.NA # Explicitly normalize to pd.NA
+             elif isinstance(x, (int, float)):
                  if decimal_length is not None:
-                     format_str = "{:,.{}f}".format(x, decimal_length)
-                     formatted_number = format_str.format(x)
+                     format_str = f"{{:,.{decimal_length}f}}"
+                     return format_str.format(x)
                  else:
-                     formatted_number = "{:,}".format(x)
-                 return formatted_number
+                     return f"{x:,}"
              else:
                  return x # Return unchanged if not a number

-
-         # Apply the function across several columns using applymap()
-         formatted_df = df.applymap(format_number_with_commas)
+         # Apply formatting column by column
+         formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)

          return formatted_df
-
+
      def filter_df_on_multiple_conditions(self, df, filters_dict):
          """
          Filter a dataframe based on mulitple conditions
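Note: the switch from `df.applymap(...)` to per-column `col.map(...)` tracks pandas 2.1, where `DataFrame.applymap` was deprecated in favour of `DataFrame.map`. On pandas >= 2.1 the same thing can be written directly as:

    formatted_df = df.map(format_number_with_commas)

The trailing `.fillna(value=pd.NA)` looks redundant, since the mapper already returns `pd.NA` for missing values.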
@@ -1269,7 +1287,6 @@ class dataprocessing:
          """

          #This line removes zero values from given column
-
          return data_frame.loc[~(data_frame[column_to_filter] ==0)]

      def upgrade_outdated_packages(self):
@@ -1392,10 +1409,10 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
          """
-
+
          # If there is no date column
          if date_col is None:
-             df = df.applymap(lambda x: 1 if x > dummy_threshold else 0)
+             df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))

              if add_total_dummy_col != 'No':
                  # Find max value of rows
@@ -1403,8 +1420,10 @@ class dataprocessing:

          # If there is a date column
          else:
-             # Create dummies
-             df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].applymap(lambda x: 1 if x > dummy_threshold else 0)
+             # Create dummies for all columns except the date column
+             df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
+                 lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+             )

              if add_total_dummy_col != 'No':
                  # Find max value of rows
@@ -1427,7 +1446,6 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
          """
-
          if new_column is not None:
              # Create a new column for replacements
              df[new_column] = df[column]
@@ -1435,15 +1453,15 @@ class dataprocessing:
          else:
              # Modify the existing column
              temp_column = column
-
-         # Apply substring replacements
-         for old, new in replacements.items():
-             df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
-
+
          # Optionally convert to lowercase
          if to_lower:
              df[temp_column] = df[temp_column].str.lower()
-
+
+         # Apply substring replacements
+         for old, new in replacements.items():
+             df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
+
          return df

      def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
@@ -1458,11 +1476,11 @@ class dataprocessing:
          Returns:
              pd.DataFrame: The DataFrame with an added total column.
          """
-         # If exclude_col is provided, drop that column before summing
-         if exclude_col:
-             df[total_col_name] = df.drop(columns=[exclude_col]).sum(axis=1)
+         if exclude_col and exclude_col in df.columns:
+             # Ensure the column to exclude exists before dropping
+             df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
          else:
-             # Sum across all columns if exclude_col is not provided
+             # Sum across all columns if no column is specified to exclude
              df[total_col_name] = df.sum(axis=1)

          return df
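Note: the membership check and `errors='ignore'` overlap; once `exclude_col in df.columns` holds, the drop cannot fail. The visible behaviour change is for a missing column name, which now falls through to summing everything instead of raising `KeyError`:

    df = pd.DataFrame({"a": [1], "b": [2]})
    dataprocessing().add_total_column(df, exclude_col="missing")  # Total = a + b, no KeyError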
@@ -1502,7 +1520,7 @@ class dataprocessing:
          df[new_col_name] = df[column_name].apply(categorize_text)
          return df

-     def compare_overlap(self,df1, df2, date_col):
+     def compare_overlap(self, df1, df2, date_col):
          """
          Compare overlapping periods between two DataFrames and provide a summary of total differences.

@@ -1517,64 +1535,70 @@ class dataprocessing:
          # Ensure date columns are in datetime format
          df1[date_col] = pd.to_datetime(df1[date_col])
          df2[date_col] = pd.to_datetime(df2[date_col])
-
+
          # Determine the overlap period
          start_date = max(df1[date_col].min(), df2[date_col].min())
          end_date = min(df1[date_col].max(), df2[date_col].max())
-
-         # Filter dataframes to the overlapping period
+
+         # Filter DataFrames to the overlapping period
          df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
          df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
-
-         # Merge the dataframes on the date column to align data for comparison
+
+         # Merge the DataFrames on the date column
          merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
-
-         # Get the common columns between the two DataFrames, excluding the date column
+
+         # Get common columns, excluding the date column
          common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
-
-         # Initialize a list to collect total differences for each column
+
+         # Create a DataFrame for differences
+         diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+
          total_diff_list = []
-
-         # Create a DataFrame for the differences
-         diff_df = pd.DataFrame({date_col: merged_df[date_col]}) # Initialize diff_df with the date column
-
          for col in common_cols:
-             # Calculate the difference for each row
              diff_col = f'diff_{col}'
-             diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
-
-             # Calculate the total difference for the column and add it to the list
+             diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
+
+             # Sum differences for the column
              total_diff = diff_df[diff_col].sum()
              total_diff_list.append({'Column': col, 'Total Difference': total_diff})
-
-         # Create a DataFrame for the summary of total differences
+
+         # Create summary DataFrame
          total_diff_df = pd.DataFrame(total_diff_list)
-
-         # Apply formatting to the numerical columns
-         float_format = "{:,.2f}".format # Format to 2 decimal places with comma as thousand separator
-         diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
-         total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
-
+
          return diff_df, total_diff_df
-
-     def week_commencing_2_week_commencing_conversion(self,df,date_col,week_commencing='sun'):
+
+     def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
          """
-         Convert week commencing column in a DataFrame to the start of another day specified.
+         Convert a DataFrame's date column so that each date is mapped back
+         to the 'week_commencing' day of the *current ISO week*.

          Args:
-             df (pandas.DataFrame): The DataFrame containing the date-based data.
-             date_col (str): The name of the date column in the DataFrame.
-             week_commencing (str, optional): The day of the week that the week starts on ('sun' for Sunday, 'mon' for Monday, etc.). Defaults to 'sun'.
+             df (pandas.DataFrame): The DataFrame with date-based data.
+             date_col (str): The name of the date column.
+             week_commencing (str): The desired start of the week.
+                 ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
+                 Uses ISO day numbering (Mon=1, ..., Sun=7).

          Returns:
-             pandas.DataFrame: The original DataFrame with an additional column indicating the start of the week.
+             pandas.DataFrame: Original DataFrame with an extra column
+                 'week_start_<week_commencing>' containing the
+                 start-of-week date for each row.
          """
-         # Week commencing dictionary
-         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-         df['week_start_'+ week_commencing] = df[date_col].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+         # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
+         iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
+
+         target_day = iso_day_dict[week_commencing]
+
+         def map_to_week_start(date_val):
+             delta = (date_val.isoweekday() - target_day) % 7
+             return date_val - pd.Timedelta(days=delta)
+
+         # Apply the transformation
+         new_col = f"week_start_{week_commencing}"
+         df[new_col] = df[date_col].apply(map_to_week_start)

          return df
-
+
      def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
          """
          Plot various types of charts using Plotly.
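Note: moving from `weekday()` (Mon=0) to `isoweekday()` (Mon=1) leaves the week-start arithmetic unchanged, since the offsets cancel inside the modulo; the observable changes are the method rename and the 'mon' default. A quick sketch with hypothetical dates:

    df = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-03", "2024-01-07"])})  # Wed, Sun
    out = dataprocessing().week_commencing_2_week_commencing_conversion_isoweekday(df, "OBS")
    # out["week_start_mon"] is 2024-01-01 for both rows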
@@ -1795,15 +1819,10 @@ class datapull:
          print(" - Description: Fetch and process historical weather data for the specified country.")
          print(" - Usage: pull_weather(week_commencing, country)")
          print(" - Example: pull_weather('mon', 'GBR')")
-
-         print("\n8. pull_covid_data")
-         print(" - Description: Get covid pandemic data for the country of interest.")
-         print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
-         print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")

      ############################################################### MACRO ##########################################################################

-     def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
+     def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
          '''
          Parameters
          ----------
@@ -1812,7 +1831,7 @@ class datapull:

          series_id_list : list[str]
              provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
-             ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]
+             ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]

          Returns
          ----------
@@ -1864,68 +1883,81 @@ class datapull:

          return fred_df_final

-     def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
+     def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
          """
          Fetch and process Bank of England interest rate data.

          Args:
-             week_commencing (str): The starting day of the week for aggregation.
-                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-                 Default is "sun".
-             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
+             week_commencing (str): The starting day of the week for aggregation.
+                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                 Default is "mon".
+             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
              delay (int): Delay in seconds between retry attempts. Default is 5.

          Returns:
-             pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
-                 The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+             pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                 The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                  and 'macro_boe_intr_rate' contains the average interest rate for the week.
          """
          # Week commencing dictionary
          day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-         # Function to fetch the data with retries
-         def fetch_data_with_retries(url, max_retries, delay):
-             for attempt in range(max_retries):
-                 try:
-                     html_table = pd.read_html(url)[0]
-                     return html_table
-                 except Exception as e:
-                     print(f"Attempt {attempt + 1} failed: {e}")
-                     if attempt < max_retries - 1:
-                         time.sleep(delay)
-                     else:
-                         raise
-
-         # Import HTML data from Bank of England rate
+
+         # URL of the Bank of England data page
          url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
-         html_table = fetch_data_with_retries(url, max_retries, delay)
-
-         df = pd.DataFrame(html_table)
+
+         # Retry logic for HTTP request
+         for attempt in range(max_retries):
+             try:
+                 # Set up headers to mimic a browser request
+                 headers = {
+                     "User-Agent": (
+                         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                         "Chrome/91.0.4472.124 Safari/537.36"
+                     )
+                 }
+                 response = requests.get(url, headers=headers)
+                 response.raise_for_status() # Raise an exception for HTTP errors
+                 break
+             except requests.exceptions.RequestException as e:
+                 print(f"Attempt {attempt + 1} failed: {e}")
+                 if attempt < max_retries - 1:
+                     time.sleep(delay)
+                 else:
+                     raise
+
+         # Parse the HTML page
+         soup = BeautifulSoup(response.content, "html.parser")
+
+         # Find the table on the page
+         table = soup.find("table") # Locate the first table
+         table_html = str(table) # Convert table to string
+         df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
+
+         # Rename and clean up columns
          df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
-
-         # Change date column to datetime and find the corresponding week to the date
          df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
-         df.sort_values("OBS", axis=0, inplace=True)
-
-         # Create a daily date range and find the week commencing for that day
-         date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
+         df.sort_values("OBS", inplace=True)
+
+         # Create a daily date range
+         date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
          df_daily = pd.DataFrame(date_range, columns=["OBS"])
-
+
          # Adjust each date to the specified week commencing day
-         df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-
-         # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
-         df_final = df_daily.merge(df, on='OBS', how="left")
-         df_final["macro_boe_intr_rate"].ffill(inplace=True)
-
-         # Group by the week start date and get the mean of the interest rates for each week
-         df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
-
-         df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
-         df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
-
-         return df_final
+         df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+             lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+         )
+
+         # Merge and forward-fill missing rates
+         df_daily = df_daily.merge(df, on="OBS", how="left")
+         df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
+
+         # Group by week commencing and calculate the average rate
+         df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
+         df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
+         df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)

+         return df_final
+
      def pull_ons_data(self, series_list, week_commencing):
          """
          Fetch and process time series data from the ONS API.
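Note: wrapping the extracted table in `StringIO` matches pandas 2.1+, where passing literal HTML strings to `pd.read_html` is deprecated in favour of file-like input, and `raise_for_status()` now fails fast on HTTP errors rather than retrying inside the parser. The parsing pattern in isolation, assuming a page with at least one <table>:

    from io import StringIO
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests

    resp = requests.get("https://example.com/rates")  # hypothetical URL
    soup = BeautifulSoup(resp.content, "html.parser")
    df = pd.read_html(StringIO(str(soup.find("table"))))[0]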
@@ -2104,7 +2136,7 @@ class datapull:
                  break

          # get data for the next variable if url doesn't exist
-         if url_test == False:
+         if url_test is False:
              continue

          root = ET.fromstring(data_response.content)
@@ -2169,7 +2201,7 @@ class datapull:

          return oecd_df_final

-     def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
+     def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
          """
          Fetch Google Mobility data for the specified country.

@@ -2189,7 +2221,7 @@ class datapull:

          # Load the CSV file into a pandas DataFrame
          csv_data = StringIO(response.text)
-         df = pd.read_csv(csv_data)
+         df = pd.read_csv(csv_data, low_memory=False)

          # Filter the DataFrame for the specified country
          country_df = df[df['country_region'] == country]
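Note: `low_memory=False` makes pandas read the whole CSV in one pass, giving consistent dtype inference across the large Global Mobility Report export and silencing the mixed-dtype `DtypeWarning`. A stricter alternative would be to pin the offending dtypes explicitly, e.g.:

    df = pd.read_csv(csv_data, dtype={"sub_region_2": "string"})  # hypothetical column pin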
@@ -2335,10 +2367,10 @@ class datapull:

      def pull_weather(self, week_commencing, country) -> pd.DataFrame:
          import pandas as pd
-         import urllib.request
+         import urllib.request # noqa: F811
          from datetime import datetime
          import requests
-         from geopy.geocoders import Nominatim
+         from geopy.geocoders import Nominatim # noqa: F811

          # Week commencing dictionary
          day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
@@ -2934,37 +2966,4 @@ class datapull:

          final_weather = ims_proc.rename_cols(merged_df, 'seas_')

-         return final_weather
-
-     def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
-         """
-         Get covid pandemic data for the country of interest.
-
-         Args:
-             folder_path (str): A string containing the local location of the OneDrive folder.
-                 Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
-                 The file location within the MasterDrive of the worldwide covid data is:
-                 MasterDrive/Central Database/Covid/oxford-government-response.csv
-             country (str): A string containing the country of interest (E.g: "GB", "FR")
-             week_commencing (str): The starting day of the week for aggregation.
-                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-
-         Returns:
-             pd.DataFrame: A DataFrame containing seasonality and public holiday dummies for the country of interest.
-                 The 'OBS' column contains the week commencing dates.
-         """
-
-         df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
-
-         country_df = df[df['location_key']==country]
-         country_df.rename(columns={'date': 'OBS'}, inplace=True)
-         country_df.drop('location_key', axis=1, inplace=True)
-
-         agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
-
-         covid_df = ims_proc.rename_cols(agg_df, 'covid_')
-
-         covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
-
-         return covid_df
-
+         return final_weather