imsciences 0.6.3.1__py3-none-any.whl → 0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,22 +4,18 @@ import os
  import plotly.express as px
  import plotly.graph_objs as go
  import numpy as np
- import datetime
  import re
- import pandas as pd
  from fredapi import Fred
  import time
- from datetime import datetime,timedelta
- from cif import cif
+ from datetime import datetime, timedelta
  from io import StringIO
- import urllib
- import requests_cache
- import urllib.request
  import requests
- from geopy.geocoders import Nominatim
  import subprocess
  import json
  import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup
+ import yfinance as yf
+ import holidays

  class dataprocessing:

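The import changes track the feature changes below: `cif`, `requests_cache`, the bare `urllib` imports and the module-level `geopy` import are dropped, while `bs4`, `yfinance` and `holidays` arrive to support the rewritten `pull_boe_data` and the new `pull_seasonality` and `pull_yfinance` methods. A quick way to confirm the new third-party dependencies resolve in your environment (a sketch; the module names are inferred from the import statements above):

    import importlib.util

    for module in ("bs4", "yfinance", "holidays"):
        found = importlib.util.find_spec(module) is not None
        print(f"{module}: {'ok' if found else 'missing'}")
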
@@ -391,7 +387,7 @@ class dataprocessing:
  # Divide each numeric value by the number of days in the month
  for col in df.columns:
      if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-         if divide == True:
+         if divide is True:
              daily_row[col] = row[col] / num_days
          else:
              daily_row[col] = row[col]
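One behavioral nuance of this lint-driven change: `divide == True` is satisfied by any value that compares equal to `True` (for example `1` or `numpy.True_`), whereas `divide is True` only matches the bool literal, so a caller passing `divide=1` now falls through to the `else` branch. Illustration:

    divide = 1               # truthy and equal to True, but not the literal True
    print(divide == True)    # True  -> 0.6.3.1 divided by num_days
    print(divide is True)    # False -> 0.8 copies the value unchanged
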
@@ -678,7 +674,7 @@ class dataprocessing:

      return combined_df

- def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False,fill_missing_weekly_dates=False,week_commencing='W-MON'):
+ def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
      """
      Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

@@ -701,53 +697,57 @@ class dataprocessing:
      pandas.DataFrame: The pivot table specified
      """

-     # Create the filtered df by applying the conditions
-     if filters_dict is None:
-         df_filtered = df
-     else:
+     # Validate inputs
+     if index_col not in df.columns:
+         raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+     if columns not in df.columns:
+         raise ValueError(f"columns '{columns}' not found in DataFrame.")
+     if values_col not in df.columns:
+         raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+     # Apply filters if provided
+     if filters_dict:
          df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-     # Ensure index column is in datetime format for proper sorting
-     df_filtered = df_filtered.copy()
-
-     # If datetime transformation is needed
+     else:
+         df_filtered = df.copy()
+
+     # Ensure index column is in datetime format if needed
      if datetime_trans_needed:
          df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
+
      # Create the pivot table
-     pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)
-
-     # Handling MultiIndex columns if present, making them a flat structure
-     if not reverse_header_order:
-         if isinstance(pivoted_df.columns, pd.MultiIndex):
-             pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
-         else:
-             pivoted_df.columns = pivoted_df.columns.map(str)
+     pivoted_df = df_filtered.pivot_table(
+         index=index_col,
+         columns=columns,
+         values=values_col,
+         aggfunc=aggfunc,
+         margins=margins,
+         margins_name=margins_name,
+     )
+
+     # Handle column headers
+     if isinstance(pivoted_df.columns, pd.MultiIndex):
+         pivoted_df.columns = [
+             "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+             for col in pivoted_df.columns.values
+         ]
      else:
-         if isinstance(pivoted_df.columns, pd.MultiIndex):
-             # Reorder the MultiIndex columns
-             pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
-         else:
-             pivoted_df.columns = pivoted_df.columns.map(str)
-             # Reverse the order for single index columns
-             pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
-
-     # Reset the pivot before returning
-     pivoted_df = pivoted_df.reset_index()
-
-     # Sort by index column from oldest to newest
+         pivoted_df.columns = pivoted_df.columns.map(str)
+
+     # Reset the index
+     pivoted_df.reset_index(inplace=True)
+
+     # Handle sorting and formatting of index column
      if datetime_trans_needed:
-         pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col])  # Ensure sorting works correctly
-         pivoted_df = pivoted_df.sort_values(by=index_col)
-
-         # Convert index column back to a string in YYYY-MM-DD format for display purposes
-         pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
-
-     # Fill in any NaNs
-     pivoted_df = pivoted_df.fillna(fill_value)
-
-     # If there is a need to fill in missing weeks
-     if fill_missing_weekly_dates == True:
+         pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+         pivoted_df.sort_values(by=index_col, inplace=True)
+         pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+     # Fill missing values
+     pivoted_df.fillna(fill_value, inplace=True)
+
+     # Fill missing weekly dates if specified
+     if fill_missing_weekly_dates:
          pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)

      return pivoted_df
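A minimal sketch of the new signature (the DataFrame and column names are invented for illustration; `dataprocessing` is instantiated the same way the module itself does further down with `ims_proc = dataprocessing()`):

    import pandas as pd

    dp = dataprocessing()
    df = pd.DataFrame({
        "OBS": ["01/01/2024", "01/01/2024", "08/01/2024"],
        "channel": ["tv", "radio", "tv"],
        "spend": [100.0, 50.0, 80.0],
    })

    # date_format is new in 0.8; 0.6.3.1 always rendered '%Y-%m-%d'
    pivot = dp.pivot_table(df, index_col="OBS", columns="channel",
                           values_col="spend", date_format="%d/%m/%Y")

One caveat in the new header handling: with `reverse_header_order=True` on a MultiIndex, `reversed(map(str, col))` raises a TypeError because map objects are not reversible, so that path appears to need `reversed(list(map(str, col)))`.
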
@@ -983,7 +983,7 @@ class dataprocessing:

      return df

- def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
+ def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
      """
      This function updates values in a specified column of the DataFrame based on a lookup dictionary.
      It first merges several columns into a new 'Merged' column, then uses this merged column to determine
@@ -1000,8 +1000,10 @@ class dataprocessing:
      Returns:
          pd.DataFrame: The modified DataFrame with updated values in the specified column.
      """
+     # Create a merged column from specified columns
      df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)

+     # Replace values in the specified column based on the lookup
      def replace_values(x):
          if x[col] == replacement_rows:
              merged_value = x['Merged']
@@ -1009,10 +1011,14 @@ class dataprocessing:
                  return replacement_lookup_dict[merged_value]
          return x[col]

+     # Apply replacement logic
      df[output_column_name] = df.apply(replace_values, axis=1)

+     # Drop the intermediate 'Merged' column
+     df.drop(columns=['Merged'], inplace=True)
+
      return df
-
+
  def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
      """
      Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
@@ -1049,35 +1055,38 @@ class dataprocessing:

      return df_final

- def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
+ def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
      """
      Changes a dataframe from wide to long format.

      Args:
          df (pandas.DataFrame): The DataFrame containing the data.
-         value_cols (list of str or str if only one): list of column names which are to be transformed from several columns into one.
-         variable_col_name (str, optional): Name of new variables column, which contains the names of the columns which have been stacked into one. Defaults to 'Stacked'.
-         value_col_name (str, optional): Name of the new value column which contains all the data from the stacked columns. Defaults to 'Value'.
+         value_cols (list of str or str if only one): List of column names to transform from several columns into one.
+         variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+         value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.

      Returns:
-         pandas.DataFrame:: Returns dataframe transformed from long to wide.
-
+         pandas.DataFrame: DataFrame transformed from wide to long format.
+
      Raises:
-         ValueError: If number of column names to be depivoted is less than 2, then this function is not neccesary.
+         ValueError: If the number of columns to depivot is less than 2.
      """
-
-     # Check length of value cols is greater than 1
+     # Check length of value_cols is greater than 1
      if len(value_cols) < 2:
          raise ValueError("Number of inputs in list must be greater than 1")
-
+
      # Find the columns that are not to be depivoted into one column
-     id_vars = list(set(df.columns.tolist()) - set(value_cols))
-
+     id_vars = [col for col in df.columns if col not in value_cols]  # Preserve column order in the DataFrame
+
      # Melt all columns chosen into one column
-     df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
+     df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+     # Sort column order to match expected output
+     ordered_columns = id_vars + [variable_col_name, value_col_name]
+     df_final = df_final[ordered_columns]
+
      return df_final
-
+
  def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
      """
      Allows the capability to manually update any cell in a dataframe by applying filters and choosing a column to edit
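The switch from a set to a list comprehension for `id_vars` is more than style: sets do not preserve order, so 0.6.3.1 could emit the identifier columns in a different order from run to run, while 0.8 keeps the DataFrame's own column order and then pins the final layout explicitly. A small sketch (reusing the `dp` instance from the earlier example):

    import pandas as pd

    df = pd.DataFrame({"OBS": ["2024-01-01"], "tv": [100], "radio": [50]})
    long_df = dp.convert_df_wide_2_long(df, value_cols=["tv", "radio"])
    print(long_df.columns.tolist())   # ['OBS', 'Stacked', 'Value'], deterministically
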
@@ -1102,18 +1111,24 @@ class dataprocessing:
      Returns:
          pandas.DataFrame: Dataframe with manual changes added
      """
+
      # Raise type error if more than one col is supplied
      if isinstance(col_to_change, list):
          raise TypeError("Col to change must be specified as a string, not a list")
-
+
      # Raises value error if input is invalid for change_in_existing_df_col
      if change_in_existing_df_col not in ["Yes", "No"]:
          raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
-
+
      # Raises value error if input is invalid for add_notes_col
      if add_notes not in ["Yes", "No"]:
          raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")

+     # Validate filters_dict format
+     for col, cond in filters_dict.items():
+         if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+             raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
      # Create the filtered df by applying the conditions
      df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
@@ -1122,7 +1137,7 @@ class dataprocessing:
      if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
          df = df.copy()
          df[new_col_to_change_name] = df[col_to_change]
-
+
      # Update the new cell in the chosen column
      df.loc[df_filtered.index, col_to_update] = new_value

@@ -1146,32 +1161,32 @@ class dataprocessing:

  def format_numbers_with_commas(self, df, decimal_length_chosen=2):
      """
-     Converts data in numerical format into numbers with commas and a chosen decimal place length
+     Converts data in numerical format into numbers with commas and a chosen decimal place length.

      Args:
          df (pandas.DataFrame): The DataFrame containing the data.
-         decimal_length_chosen (int, optional): _description_. Defaults to 2.
-
+         decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
      Returns:
-         pandas.DataFrame: The dataframe with the chosen updated format
+         pandas.DataFrame: The DataFrame with the chosen updated format.
      """
      def format_number_with_commas(x, decimal_length=decimal_length_chosen):
-         if isinstance(x, (int, float)):
+         if pd.isna(x):  # Preserve None/NaN values
+             return pd.NA  # Explicitly normalize to pd.NA
+         elif isinstance(x, (int, float)):
              if decimal_length is not None:
-                 format_str = "{:,.{}f}".format(x, decimal_length)
-                 formatted_number = format_str.format(x)
+                 format_str = f"{{:,.{decimal_length}f}}"
+                 return format_str.format(x)
              else:
-                 formatted_number = "{:,}".format(x)
-                 return formatted_number
+                 return f"{x:,}"
          else:
              return x  # Return unchanged if not a number

-
-     # Apply the function across several columns using applymap()
-     formatted_df = df.applymap(format_number_with_commas)
+     # Apply formatting column by column
+     formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)

      return formatted_df
-
+
  def filter_df_on_multiple_conditions(self, df, filters_dict):
      """
      Filter a dataframe based on multiple conditions
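The move from `df.applymap(...)` to per-column `Series.map` tracks pandas, which deprecated `DataFrame.applymap` in 2.1. The two spellings are element-wise equivalent; a sketch:

    import pandas as pd

    df = pd.DataFrame({"a": [1234.5, None]})
    out = df.apply(lambda col: col.map(lambda x: x if pd.isna(x) else f"{x:,.2f}"))
    print(out)   # '1,234.50', with the missing value preserved
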
@@ -1269,7 +1284,6 @@ class dataprocessing:
      """

      #This line removes zero values from given column
-
      return data_frame.loc[~(data_frame[column_to_filter] ==0)]

  def upgrade_outdated_packages(self):
@@ -1392,10 +1406,10 @@ class dataprocessing:
      Returns:
          pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
      """
-
+
      # If there is no date column
      if date_col is None:
-         df = df.applymap(lambda x: 1 if x > dummy_threshold else 0)
+         df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))

          if add_total_dummy_col != 'No':
              # Find max value of rows
@@ -1403,8 +1417,10 @@ class dataprocessing:

      # If there is a date column
      else:
-         # Create dummies
-         df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].applymap(lambda x: 1 if x > dummy_threshold else 0)
+         # Create dummies for all columns except the date column
+         df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
+             lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+         )

          if add_total_dummy_col != 'No':
              # Find max value of rows
@@ -1427,7 +1443,6 @@ class dataprocessing:
      Returns:
          pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
      """
-
      if new_column is not None:
          # Create a new column for replacements
          df[new_column] = df[column]
@@ -1435,15 +1450,15 @@ class dataprocessing:
      else:
          # Modify the existing column
          temp_column = column
-
-     # Apply substring replacements
-     for old, new in replacements.items():
-         df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
-
+
      # Optionally convert to lowercase
      if to_lower:
          df[temp_column] = df[temp_column].str.lower()
-
+
+     # Apply substring replacements
+     for old, new in replacements.items():
+         df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
+
      return df

  def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
  def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
@@ -1458,11 +1473,11 @@ class dataprocessing:
1458
1473
  Returns:
1459
1474
  pd.DataFrame: The DataFrame with an added total column.
1460
1475
  """
1461
- # If exclude_col is provided, drop that column before summing
1462
- if exclude_col:
1463
- df[total_col_name] = df.drop(columns=[exclude_col]).sum(axis=1)
1476
+ if exclude_col and exclude_col in df.columns:
1477
+ # Ensure the column to exclude exists before dropping
1478
+ df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
1464
1479
  else:
1465
- # Sum across all columns if exclude_col is not provided
1480
+ # Sum across all columns if no column is specified to exclude
1466
1481
  df[total_col_name] = df.sum(axis=1)
1467
1482
 
1468
1483
  return df
@@ -1502,7 +1517,7 @@ class dataprocessing:
      df[new_col_name] = df[column_name].apply(categorize_text)
      return df

- def compare_overlap(self,df1, df2, date_col):
+ def compare_overlap(self, df1, df2, date_col):
      """
      Compare overlapping periods between two DataFrames and provide a summary of total differences.

@@ -1517,64 +1532,70 @@ class dataprocessing:
      # Ensure date columns are in datetime format
      df1[date_col] = pd.to_datetime(df1[date_col])
      df2[date_col] = pd.to_datetime(df2[date_col])
-
+
      # Determine the overlap period
      start_date = max(df1[date_col].min(), df2[date_col].min())
      end_date = min(df1[date_col].max(), df2[date_col].max())
-
-     # Filter dataframes to the overlapping period
+
+     # Filter DataFrames to the overlapping period
      df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
      df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
-
-     # Merge the dataframes on the date column to align data for comparison
+
+     # Merge the DataFrames on the date column
      merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
-
-     # Get the common columns between the two DataFrames, excluding the date column
+
+     # Get common columns, excluding the date column
      common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
-
-     # Initialize a list to collect total differences for each column
+
+     # Create a DataFrame for differences
+     diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+
      total_diff_list = []
-
-     # Create a DataFrame for the differences
-     diff_df = pd.DataFrame({date_col: merged_df[date_col]})  # Initialize diff_df with the date column
-
      for col in common_cols:
-         # Calculate the difference for each row
          diff_col = f'diff_{col}'
-         diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
-
-         # Calculate the total difference for the column and add it to the list
+         diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']  # Corrected subtraction order
+
+         # Sum differences for the column
          total_diff = diff_df[diff_col].sum()
          total_diff_list.append({'Column': col, 'Total Difference': total_diff})
-
-     # Create a DataFrame for the summary of total differences
+
+     # Create summary DataFrame
      total_diff_df = pd.DataFrame(total_diff_list)
-
-     # Apply formatting to the numerical columns
-     float_format = "{:,.2f}".format  # Format to 2 decimal places with comma as thousand separator
-     diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
-     total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
-
+
      return diff_df, total_diff_df
-
- def week_commencing_2_week_commencing_conversion(self,df,date_col,week_commencing='sun'):
+
+ def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
      """
-     Convert week commencing column in a DataFrame to the start of another day specified.
+     Convert a DataFrame's date column so that each date is mapped back
+     to the 'week_commencing' day of the *current ISO week*.

      Args:
-         df (pandas.DataFrame): The DataFrame containing the date-based data.
-         date_col (str): The name of the date column in the DataFrame.
-         week_commencing (str, optional): The day of the week that the week starts on ('sun' for Sunday, 'mon' for Monday, etc.). Defaults to 'sun'.
+         df (pandas.DataFrame): The DataFrame with date-based data.
+         date_col (str): The name of the date column.
+         week_commencing (str): The desired start of the week.
+                                ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
+                                Uses ISO day numbering (Mon=1, ..., Sun=7).

      Returns:
-         pandas.DataFrame: The original DataFrame with an additional column indicating the start of the week.
+         pandas.DataFrame: Original DataFrame with an extra column
+                           'week_start_<week_commencing>' containing the
+                           start-of-week date for each row.
      """
-     # Week commencing dictionary
-     day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-     df['week_start_'+ week_commencing] = df[date_col].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+     # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
+     iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
+
+     target_day = iso_day_dict[week_commencing]
+
+     def map_to_week_start(date_val):
+         delta = (date_val.isoweekday() - target_day) % 7
+         return date_val - pd.Timedelta(days=delta)
+
+     # Apply the transformation
+     new_col = f"week_start_{week_commencing}"
+     df[new_col] = df[date_col].apply(map_to_week_start)

      return df
-
+
  def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
      """
      Plot various types of charts using Plotly.
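The ISO mapping makes the week-start arithmetic explicit. For example, mapping Saturday 2024-01-06 back to the Monday of its ISO week:

    import pandas as pd

    d = pd.Timestamp("2024-01-06")           # Saturday, isoweekday() == 6
    target = 1                               # 'mon' in ISO numbering
    delta = (d.isoweekday() - target) % 7    # (6 - 1) % 7 == 5
    print(d - pd.Timedelta(days=delta))      # 2024-01-01, that week's Monday
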
@@ -1743,17 +1764,6 @@ class dataprocessing:
  ########################################################################################################################################
  ########################################################################################################################################

-
-
-
-
-
-
-
-
-
-
-
  ims_proc = dataprocessing()

  class datapull:
@@ -1764,46 +1774,46 @@ class datapull:
      print("\n1. pull_fred_data")
      print(" - Description: Get data from FRED by using series id tokens.")
      print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
-     print(" - Example: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")
+     print(" - Example: pull_fred_data('mon', ['GPDIC1'])")

      print("\n2. pull_boe_data")
      print(" - Description: Fetch and process Bank of England interest rate data.")
      print(" - Usage: pull_boe_data(week_commencing)")
      print(" - Example: pull_boe_data('mon')")

-     print("\n3. pull_ons_data")
-     print(" - Description: Fetch and process time series data from the ONS API.")
-     print(" - Usage: pull_ons_data(series_list, week_commencing)")
-     print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
-
-     print("\n4. pull_oecd")
+     print("\n3. pull_oecd")
      print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
-     print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '1950-01-01')")
-     print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
+     print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')")
+     print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")

-     print("\n5. get_google_mobility_data")
+     print("\n4. get_google_mobility_data")
      print(" - Description: Fetch Google Mobility data for the specified country.")
      print(" - Usage: get_google_mobility_data(country, wc)")
      print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")

-     print("\n6. pull_combined_dummies")
+     print("\n5. pull_seasonality")
      print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
-     print(" - Usage: pull_combined_dummies(week_commencing)")
-     print(" - Example: pull_combined_dummies('mon')")
+     print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
+     print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")

-     print("\n7. pull_weather")
+     print("\n6. pull_weather")
      print(" - Description: Fetch and process historical weather data for the specified country.")
      print(" - Usage: pull_weather(week_commencing, country)")
      print(" - Example: pull_weather('mon', 'GBR')")
+
+     print("\n7. pull_macro_ons_uk")
+     print(" - Description: Fetch and process time series data from the Beta ONS API.")
+     print(" - Usage: pull_macro_ons_uk(cdid_list, week_start_day, sector)")
+     print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
+
+     print("\n8. pull_yfinance")
+     print(" - Description: Fetch and process stock data from Yahoo Finance.")
+     print(" - Usage: pull_yfinance(tickers, week_start_day)")
+     print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")

-     print("\n8. pull_covid_data")
-     print(" - Description: Get covid pandemic data for the country of interest.")
-     print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
-     print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")
-
  ############################################################### MACRO ##########################################################################

- def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
+ def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
      '''
      Parameters
      ----------
@@ -1812,16 +1822,12 @@ class datapull:

      series_id_list : list[str]
          provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
-         ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]
+         ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]

      Returns
      ----------
      pd.DataFrame
          Return a data frame with FRED data according to the series IDs provided
-
-     Example
-     ----------
-     pull_fred_data("mon", ["GCEC1", "SP500"])
      '''
      # Fred API
      fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
@@ -1864,169 +1870,82 @@ class datapull:

      return fred_df_final

- def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
+ def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
      """
      Fetch and process Bank of England interest rate data.

      Args:
-         week_commencing (str): The starting day of the week for aggregation.
-                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-                                Default is "sun".
-         max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
+         week_commencing (str): The starting day of the week for aggregation.
+                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                                Default is "mon".
+         max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
          delay (int): Delay in seconds between retry attempts. Default is 5.

      Returns:
-         pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
-                       The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+         pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                       The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                        and 'macro_boe_intr_rate' contains the average interest rate for the week.
      """
      # Week commencing dictionary
      day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-     # Function to fetch the data with retries
-     def fetch_data_with_retries(url, max_retries, delay):
-         for attempt in range(max_retries):
-             try:
-                 html_table = pd.read_html(url)[0]
-                 return html_table
-             except Exception as e:
-                 print(f"Attempt {attempt + 1} failed: {e}")
-                 if attempt < max_retries - 1:
-                     time.sleep(delay)
-                 else:
-                     raise
-
-     # Import HTML data from Bank of England rate
+
+     # URL of the Bank of England data page
      url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
-     html_table = fetch_data_with_retries(url, max_retries, delay)
-
-     df = pd.DataFrame(html_table)
+
+     # Retry logic for HTTP request
+     for attempt in range(max_retries):
+         try:
+             # Set up headers to mimic a browser request
+             headers = {
+                 "User-Agent": (
+                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                     "Chrome/91.0.4472.124 Safari/537.36"
+                 )
+             }
+             response = requests.get(url, headers=headers)
+             response.raise_for_status()  # Raise an exception for HTTP errors
+             break
+         except requests.exceptions.RequestException as e:
+             print(f"Attempt {attempt + 1} failed: {e}")
+             if attempt < max_retries - 1:
+                 time.sleep(delay)
+             else:
+                 raise
+
+     # Parse the HTML page
+     soup = BeautifulSoup(response.content, "html.parser")
+
+     # Find the table on the page
+     table = soup.find("table")  # Locate the first table
+     table_html = str(table)  # Convert table to string
+     df = pd.read_html(StringIO(table_html))[0]  # Use StringIO to wrap the table HTML
+
+     # Rename and clean up columns
      df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
-
-     # Change date column to datetime and find the corresponding week to the date
      df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
-     df.sort_values("OBS", axis=0, inplace=True)
-
-     # Create a daily date range and find the week commencing for that day
-     date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
-     df_daily = pd.DataFrame(date_range, columns=["OBS"])
-
-     # Adjust each date to the specified week commencing day
-     df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-
-     # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
-     df_final = df_daily.merge(df, on='OBS', how="left")
-     df_final["macro_boe_intr_rate"].ffill(inplace=True)
-
-     # Group by the week start date and get the mean of the interest rates for each week
-     df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
-
-     df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
-     df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
-
-     return df_final
+     df.sort_values("OBS", inplace=True)

- def pull_ons_data(self, series_list, week_commencing):
-     """
-     Fetch and process time series data from the ONS API.
+     # Create a daily date range
+     date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
+     df_daily = pd.DataFrame(date_range, columns=["OBS"])

-     Args:
-         series_list (list): A list of dictionaries where each dictionary represents a time series.
-                             Each dictionary should have the keys 'series_id' and 'dataset_id'.
-         week_commencing (str): The starting day of the week for aggregation.
-                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+     # Adjust each date to the specified week commencing day
+     df_daily["Week_Commencing"] = df_daily["OBS"].apply(
+         lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+     )

-     Returns:
-         pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
-                       commencing dates and other columns contain the aggregated time series values.
-     """
-
-     def parse_quarter(date_str):
-         """Parses a string in 'YYYY Q#' format into a datetime object."""
-         year, quarter = date_str.split(' ')
-         quarter_number = int(quarter[1])
-         month = (quarter_number - 1) * 3 + 1
-         return pd.Timestamp(f"{year}-{month:02d}-01")
+     # Merge and forward-fill missing rates
+     df_daily = df_daily.merge(df, on="OBS", how="left")
+     df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()

-     # Generate a date range from 1950-01-01 to today
-     date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
-     daily_df = pd.DataFrame(date_range, columns=['OBS'])
-
-     # Keep track of the renamed value columns
-     value_columns = []
+     # Group by week commencing and calculate the average rate
+     df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
+     df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
+     df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)

-     for series in series_list:
-         series_id = series['series_id']
-         dataset_id = series['dataset_id']
-
-         # Construct the URL for data
-         data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
-
-         # Make the request to the ONS API for data
-         data_response = requests.get(data_url)
-
-         # Check if the request was successful
-         if data_response.status_code != 200:
-             print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
-             continue
-
-         # Parse the JSON response for data
-         data = data_response.json()
-
-         # Attempt to extract the name of the time series from the data response
-         series_name = data.get('description', {}).get('title', 'Value')
-
-         # Determine the most granular time series data available
-         if 'months' in data and data['months']:
-             time_series_data = data['months']
-         elif 'quarters' in data and data['quarters']:
-             time_series_data = data['quarters']
-         elif 'years' in data and data['years']:
-             time_series_data = data['years']
-         else:
-             print("No time series data found in the response")
-             continue
-
-         # Create a DataFrame from the time series data
-         df = pd.DataFrame(time_series_data)
-
-         # Handle different frequencies in the data
-         if 'date' in df.columns:
-             if any(df['date'].str.contains('Q')):
-                 df['date'] = df['date'].apply(parse_quarter)
-             else:
-                 df['date'] = pd.to_datetime(df['date'])
-
-         df = df.rename(columns={'date': 'OBS', 'value': series_name})
-
-         # Rename the value column
-         new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
-         df = df.rename(columns={series_name: new_col_name})
-
-         # Track the renamed value column
-         value_columns.append(new_col_name)
-
-         # Merge the data based on the observation date
-         daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
-
-     # Ensure columns are numeric
-     for col in value_columns:
-         if col in daily_df.columns:
-             daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
-         else:
-             print(f"Column {col} not found in daily_df")
-
-     # Aggregate results by week
-     ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
-                                                        date_column="OBS",
-                                                        group_columns=[],
-                                                        sum_columns=value_columns,
-                                                        wc=week_commencing,
-                                                        aggregation="average")
-
-     return ons_df_final
+     return df_final

- def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
      """
      Fetch and process time series data from the OECD API.

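The net effect of the rewrite: the Bank Rate table is now fetched with `requests` plus a browser-style User-Agent, parsed via BeautifulSoup and `pd.read_html(StringIO(...))`, and the default retry count drops from 30 to 5; the old `pull_ons_data` method is removed in favour of `pull_macro_ons_uk` further down. Typical use is unchanged (requires network access):

    dpull = datapull()
    boe = dpull.pull_boe_data("mon")   # weekly average Bank Rate, 'OBS' as dd/mm/yyyy
    print(boe.head())
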
@@ -2104,7 +2023,7 @@ class datapull:
                  break

          # get data for the next variable if url doesn't exist
-         if url_test == False:
+         if url_test is False:
              continue

          root = ET.fromstring(data_response.content)
@@ -2169,7 +2088,7 @@ class datapull:

      return oecd_df_final

- def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
+ def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
      """
      Fetch Google Mobility data for the specified country.

@@ -2189,7 +2108,7 @@ class datapull:

      # Load the CSV file into a pandas DataFrame
      csv_data = StringIO(response.text)
-     df = pd.read_csv(csv_data)
+     df = pd.read_csv(csv_data, low_memory=False)

      # Filter the DataFrame for the specified country
      country_df = df[df['country_region'] == country]
@@ -2203,12 +2122,12 @@ class datapull:

  ############################################################### Seasonality ##########################################################################

- def pull_combined_dummies(self, week_commencing):
+ def pull_seasonality(self, week_commencing, start_date, countries):
      # Week commencing dictionary
      day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

-     # Create daily date range dataframe
-     date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
+     # Create daily date range dataframe starting from start_date
+     date_range = pd.date_range(start=pd.to_datetime(start_date), end=datetime.today(), freq="d")
      df_daily = pd.DataFrame(date_range, columns=["Date"])

      # Create weekly date range dataframe
@@ -2218,7 +2137,7 @@ class datapull:

      df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
      df_weekly_start.set_index("Date", inplace=True)
-
+
      # Create individual weekly dummies
      dummy_columns = {}
      for i in range(len(df_weekly_start)):
@@ -2228,84 +2147,59 @@ class datapull:

      df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
      df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-
-     # Create monthly dummies
+
+     # Add public holidays for each country and holiday type
+     for country in countries:
+         country_holidays = holidays.CountryHoliday(country, years=range(int(start_date[:4]), datetime.today().year + 1))
+         df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(lambda x: 1 if x in country_holidays else 0)
+
+         # Extract specific holidays
+         for date, name in country_holidays.items():
+             col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
+             if col_name not in df_daily.columns:
+                 df_daily[col_name] = 0
+             df_daily.loc[df_daily["Date"] == pd.Timestamp(date), col_name] = 1
+
+     # Map daily holidays to weekly aggregation
+     df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+     df_holidays = df_daily.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
+     df_holidays.set_index("Date", inplace=True)
+
+     # Create monthly dummies (separately from holidays)
      df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
-     df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
+     df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"], dtype=int)
      df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
      df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-
      df_monthly_dummies.set_index("Date", inplace=True)
-     df_monthly_dummies = df_monthly_dummies / 7
-
-     # Combine weekly and monthly dataframes
-     df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
-
+
+     # Divide only the monthly dummy columns by 7 (exclude holiday-related columns)
+     monthly_cols = [col for col in df_monthly_dummies.columns if not col.startswith("seas_holiday") and not col.startswith("seas_")]
+     df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
+
+     # Merge weekly dummies, monthly dummies, and holidays
+     df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)  # Combine weekly and monthly first
+     df_combined = pd.concat([df_combined, df_holidays], axis=1)  # Add holidays separately
+
+     # Drop duplicate columns if any exist (this ensures holidays are not duplicated)
+     df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
+
      # Create weekly dummies
      df_combined.reset_index(inplace=True)
      df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
-     df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
+     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)

      # Create yearly dummies
      df_combined["Year"] = df_combined["Date"].dt.year
-     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
+     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)

      # Add constant
      df_combined["Constant"] = 1

      # Add trend
      df_combined["Trend"] = df_combined.index + 1
-
-     # Set date as index
-     df_combined.set_index("Date", inplace=True)
-
-     # Create COVID lockdown dummies
-     lockdown_periods = [
-         # Lockdown 1
-         ("2020-03-23", "2020-05-24"),
-         # Lockdown 2
-         ("2020-11-05", "2020-12-02"),
-         # Lockdown 3
-         ("2021-01-04", "2021-03-08")
-     ]
-
-     df_covid = pd.DataFrame(date_range, columns=["Date"])
-     df_covid["national_lockdown"] = 0
-
-     for start, end in lockdown_periods:
-         df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
-
-     df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-     df_covid.drop("Date", axis=1, inplace=True)
-     df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
-     df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
-     df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)
-
-     df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
-     df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
-     df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
-
-     df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
-     df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)
-
-     df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
-     df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
-     df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)
-
-     df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
-     df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)

-     df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
-     df_final_covid.reset_index(inplace=True)
-     df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
-
      # Create seasonal indicators for the last day and last Friday of the month
-     min_date = '2019-12-29'
-     max_date = datetime.today().strftime('%Y-%m-%d')
-     date_range_seas = pd.date_range(start=min_date, end=max_date)
-
-     df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
-     df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
+     df_combined['seas_last_day_of_month'] = df_combined["Date"].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)

      def is_last_friday(date):
          last_day_of_month = date.to_period('M').to_timestamp('M')
@@ -2317,28 +2211,19 @@ class datapull:
          last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
          return 1 if date == last_friday else 0

-     df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)
+     df_combined['seas_last_friday_of_month'] = df_combined["Date"].apply(is_last_friday)

-     df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-     df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-     df_seas.set_index("Date", inplace=True)
-
-     # Combine all dataframes
-     df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
-     df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
-     df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
-
-     # Fill any NaN values with 0
-     df_final_combined.fillna(0, inplace=True)
+     # Rename Date to OBS
+     df_combined.rename(columns={"Date": "OBS"}, inplace=True)

-     return df_final_combined
+     return df_combined

  def pull_weather(self, week_commencing, country) -> pd.DataFrame:
      import pandas as pd
-     import urllib.request
+     import urllib.request  # noqa: F811
      from datetime import datetime
      import requests
-     from geopy.geocoders import Nominatim
+     from geopy.geocoders import Nominatim  # noqa: F811

      # Week commencing dictionary
      day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
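`pull_seasonality` replaces `pull_combined_dummies`: the hard-coded 2015 start and the UK COVID lockdown dummies are gone, and per-country public-holiday dummies from the `holidays` package take their place. A usage sketch (requires the `holidays` package; country codes follow its conventions):

    dpull = datapull()
    seas = dpull.pull_seasonality("mon", "2020-01-01", ["US", "GB"])
    print(seas.filter(like="seas_holiday").head())

Note that `monthly_cols` as written excludes every column starting with `seas_`, including the month dummies themselves, so the divide-by-7 step appears to be a no-op; the filter likely needs to target the `seas_<month>` columns explicitly.
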
@@ -2936,35 +2821,238 @@ class datapull:

      return final_weather

- def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
+ def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
      """
-     Get covid pandemic data for the country of interest.
+     Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+     aggregates it to weekly averages, and renames variables based on specified rules.

-     Args:
-         folder_path (str): A string containing the local location of the OneDrive folder.
-                            Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
-                            The file location within the MasterDrive of the worldwide covid data is:
-                            MasterDrive/Central Database/Covid/oxford-government-response.csv
-         country (str): A string containing the country of interest (E.g: "GB", "FR")
-         week_commencing (str): The starting day of the week for aggregation.
-                                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+     Parameters:
+         cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+         week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
+         sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+
+     Returns:
+         pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
+                       and all series as renamed columns.
+     """
+     # Define CDIDs for sectors and defaults
+     sector_cdids = {
+         "fast_food": ["L7TD", "L78Q", "DOAD"],
+         "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
+     }
+
+     default_cdids = sector_cdids["default"]
+     sector_specific_cdids = sector_cdids.get(sector, [])
+     standard_cdids = list(set(default_cdids + sector_specific_cdids))  # Avoid duplicates
+
+     # Combine standard CDIDs and additional CDIDs
+     if cdid_list is None:
+         cdid_list = []
+     cdid_list = list(set(standard_cdids + cdid_list))  # Avoid duplicates
+
+     base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
+     base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
+     combined_df = pd.DataFrame()
+
+     # Map week start day to pandas weekday convention
+     days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+     if week_start_day not in days_map:
+         raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+     week_start = days_map[week_start_day]
+
+     for cdid in cdid_list:
+         try:
+             # Search for the series
+             search_url = f"{base_search_url}{cdid}"
+             search_response = requests.get(search_url)
+             search_response.raise_for_status()
+             search_data = search_response.json()
+
+             items = search_data.get("items", [])
+             if not items:
+                 print(f"No data found for CDID: {cdid}")
+                 continue
+
+             # Extract series name and latest release URI
+             series_name = items[0].get("title", f"Series_{cdid}")
+             latest_date = max(
+                 datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                 for item in items if "release_date" in item
+             )
+             latest_uri = next(
+                 item["uri"] for item in items
+                 if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
+             )
+
+             # Fetch the dataset
+             data_url = f"{base_data_url}{latest_uri}"
+             data_response = requests.get(data_url)
+             data_response.raise_for_status()
+             data_json = data_response.json()
+
+             # Detect the frequency and process accordingly
+             if "months" in data_json and data_json["months"]:
+                 frequency_key = "months"
+             elif "quarters" in data_json and data_json["quarters"]:
+                 frequency_key = "quarters"
+             elif "years" in data_json and data_json["years"]:
+                 frequency_key = "years"
+             else:
+                 print(f"Unsupported frequency or no data for CDID: {cdid}")
+                 continue
+
+             # Prepare the DataFrame
+             df = pd.DataFrame(data_json[frequency_key])
+
+             # Parse the 'date' field based on frequency
+             if frequency_key == "months":
+                 df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+             elif frequency_key == "quarters":
+                 def parse_quarter(quarter_str):
+                     year, qtr = quarter_str.split(" Q")
+                     month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                     return datetime(int(year), month, 1)
+                 df["date"] = df["date"].apply(parse_quarter)
+             elif frequency_key == "years":
+                 df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+
+             df["value"] = pd.to_numeric(df["value"], errors="coerce")
+             df.rename(columns={"value": series_name}, inplace=True)
+
+             # Combine data
+             df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+             if combined_df.empty:
+                 combined_df = df
+             else:
+                 combined_df = pd.merge(combined_df, df, on="date", how="outer")
+
+         except requests.exceptions.RequestException as e:
+             print(f"Error fetching data for CDID {cdid}: {e}")
+         except (KeyError, ValueError) as e:
+             print(f"Error processing data for CDID {cdid}: {e}")
+
+     if not combined_df.empty:
+         min_date = combined_df["date"].min()
+         max_date = datetime.today()
+         date_range = pd.date_range(start=min_date, end=max_date, freq='D')
+         daily_df = pd.DataFrame(date_range, columns=['date'])
+         daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
+         daily_df = daily_df.ffill()
+
+         # Aggregate to weekly frequency
+         daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+         weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
+
+         def clean_column_name(name):
+             name = re.sub(r"\(.*?\)", "", name)
+             name = re.split(r":", name)[0]
+             name = re.sub(r"\d+", "", name)
+             name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+             name = re.sub(r"[^\w\s]", "", name)
+             name = name.replace(" ", "_")
+             name = re.sub(r"_+", "_", name)
+             name = name.rstrip("_")
+             return f"macro_{name.lower()}_uk"
+
+         weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
+         weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+
+         weekly_df = weekly_df.fillna(0)
+
+         return weekly_df
+     else:
+         print("No data available to process.")
+         return pd.DataFrame()
+
+ def pull_yfinance(self, tickers=None, week_start_day="mon"):
+     """
+     Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
+     aggregates it to weekly averages, and renames variables.
+
+     Parameters:
+         tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
+         week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').

      Returns:
-         pd.DataFrame: A DataFrame containing seasonality and public holiday dummies for the country of interest.
-                       The 'OBS' column contains the week commencing dates.
+         pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
+                       and aggregated stock data for the specified tickers, with NaN values filled with 0.
      """
+     # Define default tickers
+     default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]

-     df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
+     # Combine default tickers with additional ones
+     if tickers is None:
+         tickers = []
+     tickers = list(set(default_tickers + tickers))  # Ensure no duplicates
+
+     # Automatically set end_date to today
+     end_date = datetime.today().strftime("%Y-%m-%d")
+
+     # Mapping week start day to pandas weekday convention
+     days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+     if week_start_day not in days_map:
+         raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+     week_start = days_map[week_start_day]

-     country_df = df[df['location_key']==country]
-     country_df.rename(columns={'date': 'OBS'}, inplace=True)
-     country_df.drop('location_key', axis=1, inplace=True)
+     # Fetch data for all tickers without specifying a start date to get all available data
+     data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
+
+     # Process the data
+     combined_df = pd.DataFrame()
+     for ticker in tickers:
+         try:
+             # Extract the ticker's data
+             ticker_data = data[ticker] if len(tickers) > 1 else data
+             ticker_data = ticker_data.reset_index()
+
+             # Ensure necessary columns are present
+             if "Close" not in ticker_data.columns:
+                 raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
+
+             # Keep only relevant columns
+             ticker_data = ticker_data[["Date", "Close"]]
+             ticker_data.rename(columns={"Close": ticker}, inplace=True)

-     agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
+             # Merge data
+             if combined_df.empty:
+                 combined_df = ticker_data
+             else:
+                 combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")

-     covid_df = ims_proc.rename_cols(agg_df, 'covid_')
+         except KeyError:
+             print(f"Data for ticker {ticker} not available.")
+         except Exception as e:
+             print(f"Error processing ticker {ticker}: {e}")

-     covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
+     if not combined_df.empty:
+         # Convert to daily frequency
+         combined_df["Date"] = pd.to_datetime(combined_df["Date"])
+         combined_df.set_index("Date", inplace=True)

-     return covid_df
-
+         # Fill missing dates
+         min_date = combined_df.index.min()
+         max_date = combined_df.index.max()
+         daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
+         combined_df = combined_df.reindex(daily_index)
+         combined_df.index.name = "Date"
+         combined_df = combined_df.ffill()
+
+         # Aggregate to weekly frequency
+         combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
+         weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
+
+         # Fill NaN values with 0
+         weekly_df = weekly_df.fillna(0)
+
+         # Clean column names
+         def clean_column_name(name):
+             name = re.sub(r"[^\w\s]", "", name)
+             return f"macro_{name.lower()}"
+
+         weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
+
+         return weekly_df
+
+     else:
+         print("No data available to process.")
+         return pd.DataFrame()
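Usage sketches for the two new pulls (both need network access; the extra CDID and ticker are illustrative only):

    dpull = datapull()

    # Weekly ONS macro series: sector defaults plus one extra CDID
    ons = dpull.pull_macro_ons_uk(["HBOI"], "mon", "fast_food")

    # Weekly averages of the default indices plus one extra ticker
    stocks = dpull.pull_yfinance(["AAPL"], "mon")
    print(stocks.columns.tolist())   # 'OBS' plus macro_-prefixed ticker columns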