imsciences 0.6.3.2__py3-none-any.whl → 0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/__init__.py CHANGED
@@ -1,3 +1,2 @@
  from .datafunctions import dataprocessing
  from .datafunctions import datapull
- from .unittesting import TestDataProcessor
imsciences/datafunctions.py CHANGED
@@ -4,21 +4,18 @@ import os
  import plotly.express as px
  import plotly.graph_objs as go
  import numpy as np
- import datetime
  import re
  from fredapi import Fred
  import time
- from datetime import datetime, timedelta # noqa: F811
+ from datetime import datetime, timedelta
  from io import StringIO
- import urllib
- import requests_cache # noqa: F401
- import urllib.request # noqa: F401
  import requests
- from geopy.geocoders import Nominatim # noqa: F401
  import subprocess
  import json
  import xml.etree.ElementTree as ET
  from bs4 import BeautifulSoup
+ import yfinance as yf
+ import holidays

  class dataprocessing:

@@ -1767,17 +1764,6 @@ class dataprocessing:
  ########################################################################################################################################
  ########################################################################################################################################

-
-
-
-
-
-
-
-
-
-
-
  ims_proc = dataprocessing()

  class datapull:
@@ -1788,38 +1774,43 @@ class datapull:
  print("\n1. pull_fred_data")
  print(" - Description: Get data from FRED by using series id tokens.")
  print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
- print(" - Example: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")
+ print(" - Example: pull_fred_data('mon', ['GPDIC1'])")

  print("\n2. pull_boe_data")
  print(" - Description: Fetch and process Bank of England interest rate data.")
  print(" - Usage: pull_boe_data(week_commencing)")
  print(" - Example: pull_boe_data('mon')")

- print("\n3. pull_ons_data")
- print(" - Description: Fetch and process time series data from the ONS API.")
- print(" - Usage: pull_ons_data(series_list, week_commencing)")
- print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
-
- print("\n4. pull_oecd")
+ print("\n3. pull_oecd")
  print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
- print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '1950-01-01')")
- print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
+ print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')")
+ print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")

- print("\n5. get_google_mobility_data")
+ print("\n4. get_google_mobility_data")
  print(" - Description: Fetch Google Mobility data for the specified country.")
  print(" - Usage: get_google_mobility_data(country, wc)")
  print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")

- print("\n6. pull_combined_dummies")
+ print("\n5. pull_seasonality")
  print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
- print(" - Usage: pull_combined_dummies(week_commencing)")
- print(" - Example: pull_combined_dummies('mon')")
+ print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
+ print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")

- print("\n7. pull_weather")
+ print("\n6. pull_weather")
  print(" - Description: Fetch and process historical weather data for the specified country.")
  print(" - Usage: pull_weather(week_commencing, country)")
  print(" - Example: pull_weather('mon', 'GBR')")
-
+
+ print("\n7. pull_macro_ons_uk")
+ print(" - Description: Fetch and process time series data from the Beta ONS API.")
+ print(" - Usage: pull_macro_ons_uk(additional_list, week_commencing, sector)")
+ print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
+
+ print("\n8. pull_yfinance")
+ print(" - Description: Fetch and process time series data from Yahoo Finance.")
+ print(" - Usage: pull_yfinance(tickers, week_start_day)")
+ print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
+
  ############################################################### MACRO ##########################################################################

  def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
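The renumbered help text above is the quickest summary of the 0.8 datapull surface: pull_ons_data is gone, and pull_seasonality, pull_macro_ons_uk and pull_yfinance arrive alongside the renumbered methods. A rough usage sketch based only on the call signatures printed above (it assumes the package and its fredapi/holidays/yfinance dependencies are installed and that network access is available; it is not code from the package itself):

```python
# Illustrative only: signatures copied from the 0.8 help text above.
from imsciences import datapull

pull = datapull()

# FRED series, aggregated to weeks commencing Monday
fred_df = pull.pull_fred_data('mon', ['GPDIC1'])

# Seasonality dummies from 2020 onwards, including US and GB public holidays
seas_df = pull.pull_seasonality('mon', '2020-01-01', ['US', 'GB'])

# UK macro series from the Beta ONS API, on top of the 'fast_food' sector defaults
ons_df = pull.pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')
```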
@@ -1837,10 +1828,6 @@ class datapull:
  ----------
  pd.DataFrame
  Return a data frame with FRED data according to the series IDs provided
-
- Example
- ----------
- pull_fred_data("mon", ["GCEC1", "SP500"])
  '''
  # Fred API
  fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
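The Example block removed above simply repeated the usage line. The hunk also shows that pull_fred_data wraps the fredapi client with a key hard-coded in the package; for reference, the underlying fredapi call pattern looks roughly like this (a generic fredapi sketch with a placeholder key, not the package's internal code, which this hunk does not show in full):

```python
from fredapi import Fred

# Placeholder key: supply your own FRED API key.
fred = Fred(api_key="your-fred-api-key")

# get_series returns a pandas Series of observations indexed by date.
gross_investment = fred.get_series("GPDIC1")
print(gross_investment.tail())
```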
@@ -1958,107 +1945,7 @@ class datapull:

  return df_final

- def pull_ons_data(self, series_list, week_commencing):
- """
- Fetch and process time series data from the ONS API.
-
- Args:
- series_list (list): A list of dictionaries where each dictionary represents a time series.
- Each dictionary should have the keys 'series_id' and 'dataset_id'.
- week_commencing (str): The starting day of the week for aggregation.
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-
- Returns:
- pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
- commencing dates and other columns contain the aggregated time series values.
- """
-
- def parse_quarter(date_str):
- """Parses a string in 'YYYY Q#' format into a datetime object."""
- year, quarter = date_str.split(' ')
- quarter_number = int(quarter[1])
- month = (quarter_number - 1) * 3 + 1
- return pd.Timestamp(f"{year}-{month:02d}-01")
-
- # Generate a date range from 1950-01-01 to today
- date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
- daily_df = pd.DataFrame(date_range, columns=['OBS'])
-
- # Keep track of the renamed value columns
- value_columns = []
-
- for series in series_list:
- series_id = series['series_id']
- dataset_id = series['dataset_id']
-
- # Construct the URL for data
- data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
-
- # Make the request to the ONS API for data
- data_response = requests.get(data_url)
-
- # Check if the request was successful
- if data_response.status_code != 200:
- print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
- continue
-
- # Parse the JSON response for data
- data = data_response.json()
-
- # Attempt to extract the name of the time series from the data response
- series_name = data.get('description', {}).get('title', 'Value')
-
- # Determine the most granular time series data available
- if 'months' in data and data['months']:
- time_series_data = data['months']
- elif 'quarters' in data and data['quarters']:
- time_series_data = data['quarters']
- elif 'years' in data and data['years']:
- time_series_data = data['years']
- else:
- print("No time series data found in the response")
- continue
-
- # Create a DataFrame from the time series data
- df = pd.DataFrame(time_series_data)
-
- # Handle different frequencies in the data
- if 'date' in df.columns:
- if any(df['date'].str.contains('Q')):
- df['date'] = df['date'].apply(parse_quarter)
- else:
- df['date'] = pd.to_datetime(df['date'])
-
- df = df.rename(columns={'date': 'OBS', 'value': series_name})
-
- # Rename the value column
- new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
- df = df.rename(columns={series_name: new_col_name})
-
- # Track the renamed value column
- value_columns.append(new_col_name)
-
- # Merge the data based on the observation date
- daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
-
- # Ensure columns are numeric
- for col in value_columns:
- if col in daily_df.columns:
- daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
- else:
- print(f"Column {col} not found in daily_df")
-
- # Aggregate results by week
- ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
- date_column="OBS",
- group_columns=[],
- sum_columns=value_columns,
- wc=week_commencing,
- aggregation="average")
-
- return ons_df_final
-
- def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
  """
  Fetch and process time series data from the OECD API.

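Both the removed pull_ons_data above and the new pull_macro_ons_uk added later in this diff turn ONS quarterly labels such as '2023 Q3' into the first day of the quarter before resampling. A standalone sketch of that mapping (the function name here is illustrative, not part of the package):

```python
import pandas as pd

def parse_quarter(label: str) -> pd.Timestamp:
    # '2023 Q3' -> quarter 3 -> month (3 - 1) * 3 + 1 = 7 -> 2023-07-01
    year, quarter = label.split(' ')
    month = (int(quarter[1]) - 1) * 3 + 1
    return pd.Timestamp(f"{year}-{month:02d}-01")

assert parse_quarter('2023 Q3') == pd.Timestamp('2023-07-01')
assert parse_quarter('1998 Q1') == pd.Timestamp('1998-01-01')
```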
@@ -2235,12 +2122,12 @@ class datapull:

  ############################################################### Seasonality ##########################################################################

- def pull_combined_dummies(self, week_commencing):
+ def pull_seasonality(self, week_commencing, start_date, countries):
  # Week commencing dictionary
  day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

- # Create daily date range dataframe
- date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
+ # Create daily date range dataframe starting from start_date
+ date_range = pd.date_range(start=pd.to_datetime(start_date), end=datetime.today(), freq="d")
  df_daily = pd.DataFrame(date_range, columns=["Date"])

  # Create weekly date range dataframe
@@ -2250,7 +2137,7 @@ class datapull:

  df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
  df_weekly_start.set_index("Date", inplace=True)
-
+
  # Create individual weekly dummies
  dummy_columns = {}
  for i in range(len(df_weekly_start)):
@@ -2260,84 +2147,59 @@ class datapull:

  df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
  df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-
- # Create monthly dummies
+
+ # Add public holidays for each country and holiday type
+ for country in countries:
+ country_holidays = holidays.CountryHoliday(country, years=range(int(start_date[:4]), datetime.today().year + 1))
+ df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(lambda x: 1 if x in country_holidays else 0)
+
+ # Extract specific holidays
+ for date, name in country_holidays.items():
+ col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
+ if col_name not in df_daily.columns:
+ df_daily[col_name] = 0
+ df_daily.loc[df_daily["Date"] == pd.Timestamp(date), col_name] = 1
+
+ # Map daily holidays to weekly aggregation
+ df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+ df_holidays = df_daily.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
+ df_holidays.set_index("Date", inplace=True)
+
+ # Create monthly dummies (separately from holidays)
  df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
- df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
+ df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"], dtype=int)
  df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
  df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-
  df_monthly_dummies.set_index("Date", inplace=True)
- df_monthly_dummies = df_monthly_dummies / 7
-
- # Combine weekly and monthly dataframes
- df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
-
+
+ # Divide only the monthly dummy columns by 7 (exclude holiday-related columns)
+ monthly_cols = [col for col in df_monthly_dummies.columns if not col.startswith("seas_holiday") and not col.startswith("seas_")]
+ df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
+
+ # Merge weekly dummies, monthly dummies, and holidays
+ df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1) # Combine weekly and monthly first
+ df_combined = pd.concat([df_combined, df_holidays], axis=1) # Add holidays separately
+
+ # Drop duplicate columns if any exist (this ensures holidays are not duplicated)
+ df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
+
  # Create weekly dummies
  df_combined.reset_index(inplace=True)
  df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
- df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)

  # Create yearly dummies
  df_combined["Year"] = df_combined["Date"].dt.year
- df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)

  # Add constant
  df_combined["Constant"] = 1

  # Add trend
  df_combined["Trend"] = df_combined.index + 1
-
- # Set date as index
- df_combined.set_index("Date", inplace=True)
-
- # Create COVID lockdown dummies
- lockdown_periods = [
- # Lockdown 1
- ("2020-03-23", "2020-05-24"),
- # Lockdown 2
- ("2020-11-05", "2020-12-02"),
- # Lockdown 3
- ("2021-01-04", "2021-03-08")
- ]
-
- df_covid = pd.DataFrame(date_range, columns=["Date"])
- df_covid["national_lockdown"] = 0
-
- for start, end in lockdown_periods:
- df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
-
- df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
- df_covid.drop("Date", axis=1, inplace=True)
- df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
- df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
- df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)
-
- df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
- df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
- df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
-
- df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
- df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)

- df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
- df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
- df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)
-
- df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
- df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)
-
- df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
- df_final_covid.reset_index(inplace=True)
- df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
-
  # Create seasonal indicators for the last day and last Friday of the month
- min_date = '2019-12-29'
- max_date = datetime.today().strftime('%Y-%m-%d')
- date_range_seas = pd.date_range(start=min_date, end=max_date)
-
- df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
- df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
+ df_combined['seas_last_day_of_month'] = df_combined["Date"].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)

  def is_last_friday(date):
  last_day_of_month = date.to_period('M').to_timestamp('M')
@@ -2349,21 +2211,12 @@ class datapull:
  last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
  return 1 if date == last_friday else 0

- df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)
+ df_combined['seas_last_friday_of_month'] = df_combined["Date"].apply(is_last_friday)

- df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
- df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
- df_seas.set_index("Date", inplace=True)
+ # Rename Date to OBS
+ df_combined.rename(columns={"Date": "OBS"}, inplace=True)

- # Combine all dataframes
- df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
- df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
- df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
-
- # Fill any NaN values with 0
- df_final_combined.fillna(0, inplace=True)
-
- return df_final_combined
+ return df_combined

  def pull_weather(self, week_commencing, country) -> pd.DataFrame:
  import pandas as pd
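The rewritten pull_seasonality drops the hard-coded 2015 start and the UK lockdown dummies in favour of a caller-supplied start_date and per-country holiday flags, but it keeps the week-commencing rule used throughout these methods: every date is shifted back to the most recent occurrence of the chosen start day via (weekday - day_dict[wc]) % 7. A small worked sketch of that expression (the helper name is illustrative):

```python
import pandas as pd

day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

def week_commencing(date: pd.Timestamp, wc: str = "mon") -> pd.Timestamp:
    # Shift back to the most recent occurrence of the chosen week-start day.
    return date - pd.Timedelta(days=(date.weekday() - day_dict[wc]) % 7)

# Wednesday 2024-01-10 belongs to the week commencing Monday 2024-01-08...
assert week_commencing(pd.Timestamp("2024-01-10"), "mon") == pd.Timestamp("2024-01-08")
# ...and to the week commencing Sunday 2024-01-07 if weeks start on Sunday.
assert week_commencing(pd.Timestamp("2024-01-10"), "sun") == pd.Timestamp("2024-01-07")
```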
@@ -2966,4 +2819,240 @@ class datapull:

  final_weather = ims_proc.rename_cols(merged_df, 'seas_')

- return final_weather
+ return final_weather
+
+ def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
+ """
+ Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+ aggregates it to weekly averages, and renames variables based on specified rules.
+
+ Parameters:
+ cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+ week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
+ sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+
+ Returns:
+ pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
+ and all series as renamed columns.
+ """
+ # Define CDIDs for sectors and defaults
+ sector_cdids = {
+ "fast_food": ["L7TD", "L78Q", "DOAD"],
+ "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
+ }
+
+ default_cdids = sector_cdids["default"]
+ sector_specific_cdids = sector_cdids.get(sector, [])
+ standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
+
+ # Combine standard CDIDs and additional CDIDs
+ if cdid_list is None:
+ cdid_list = []
+ cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
+
+ base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
+ base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
+ combined_df = pd.DataFrame()
+
+ # Map week start day to pandas weekday convention
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ if week_start_day not in days_map:
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+ week_start = days_map[week_start_day]
+
+ for cdid in cdid_list:
+ try:
+ # Search for the series
+ search_url = f"{base_search_url}{cdid}"
+ search_response = requests.get(search_url)
+ search_response.raise_for_status()
+ search_data = search_response.json()
+
+ items = search_data.get("items", [])
+ if not items:
+ print(f"No data found for CDID: {cdid}")
+ continue
+
+ # Extract series name and latest release URI
+ series_name = items[0].get("title", f"Series_{cdid}")
+ latest_date = max(
+ datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+ for item in items if "release_date" in item
+ )
+ latest_uri = next(
+ item["uri"] for item in items
+ if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
+ )
+
+ # Fetch the dataset
+ data_url = f"{base_data_url}{latest_uri}"
+ data_response = requests.get(data_url)
+ data_response.raise_for_status()
+ data_json = data_response.json()
+
+ # Detect the frequency and process accordingly
+ if "months" in data_json and data_json["months"]:
+ frequency_key = "months"
+ elif "quarters" in data_json and data_json["quarters"]:
+ frequency_key = "quarters"
+ elif "years" in data_json and data_json["years"]:
+ frequency_key = "years"
+ else:
+ print(f"Unsupported frequency or no data for CDID: {cdid}")
+ continue
+
+ # Prepare the DataFrame
+ df = pd.DataFrame(data_json[frequency_key])
+
+ # Parse the 'date' field based on frequency
+ if frequency_key == "months":
+ df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+ elif frequency_key == "quarters":
+ def parse_quarter(quarter_str):
+ year, qtr = quarter_str.split(" Q")
+ month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+ return datetime(int(year), month, 1)
+ df["date"] = df["date"].apply(parse_quarter)
+ elif frequency_key == "years":
+ df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+
+ df["value"] = pd.to_numeric(df["value"], errors="coerce")
+ df.rename(columns={"value": series_name}, inplace=True)
+
+ # Combine data
+ df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+ if combined_df.empty:
+ combined_df = df
+ else:
+ combined_df = pd.merge(combined_df, df, on="date", how="outer")
+
+ except requests.exceptions.RequestException as e:
+ print(f"Error fetching data for CDID {cdid}: {e}")
+ except (KeyError, ValueError) as e:
+ print(f"Error processing data for CDID {cdid}: {e}")
+
+ if not combined_df.empty:
+ min_date = combined_df["date"].min()
+ max_date = datetime.today()
+ date_range = pd.date_range(start=min_date, end=max_date, freq='D')
+ daily_df = pd.DataFrame(date_range, columns=['date'])
+ daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
+ daily_df = daily_df.ffill()
+
+ # Aggregate to weekly frequency
+ daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+ weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
+
+ def clean_column_name(name):
+ name = re.sub(r"\(.*?\)", "", name)
+ name = re.split(r":", name)[0]
+ name = re.sub(r"\d+", "", name)
+ name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+ name = re.sub(r"[^\w\s]", "", name)
+ name = name.replace(" ", "_")
+ name = re.sub(r"_+", "_", name)
+ name = name.rstrip("_")
+ return f"macro_{name.lower()}_uk"
+
+ weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
+ weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+
+ weekly_df = weekly_df.fillna(0)
+
+ return weekly_df
+ else:
+ print("No data available to process.")
+ return pd.DataFrame()
+
+ def pull_yfinance(self, tickers=None, week_start_day="mon"):
+ """
+ Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
+ aggregates it to weekly averages, and renames variables.
+
+ Parameters:
+ tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
+ week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
+
+ Returns:
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
+ and aggregated stock data for the specified tickers, with NaN values filled with 0.
+ """
+ # Define default tickers
+ default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
+
+ # Combine default tickers with additional ones
+ if tickers is None:
+ tickers = []
+ tickers = list(set(default_tickers + tickers)) # Ensure no duplicates
+
+ # Automatically set end_date to today
+ end_date = datetime.today().strftime("%Y-%m-%d")
+
+ # Mapping week start day to pandas weekday convention
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ if week_start_day not in days_map:
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+ week_start = days_map[week_start_day]
+
+ # Fetch data for all tickers without specifying a start date to get all available data
+ data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
+
+ # Process the data
+ combined_df = pd.DataFrame()
+ for ticker in tickers:
+ try:
+ # Extract the ticker's data
+ ticker_data = data[ticker] if len(tickers) > 1 else data
+ ticker_data = ticker_data.reset_index()
+
+ # Ensure necessary columns are present
+ if "Close" not in ticker_data.columns:
+ raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
+
+ # Keep only relevant columns
+ ticker_data = ticker_data[["Date", "Close"]]
+ ticker_data.rename(columns={"Close": ticker}, inplace=True)
+
+ # Merge data
+ if combined_df.empty:
+ combined_df = ticker_data
+ else:
+ combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")
+
+ except KeyError:
+ print(f"Data for ticker {ticker} not available.")
+ except Exception as e:
+ print(f"Error processing ticker {ticker}: {e}")
+
+ if not combined_df.empty:
+ # Convert to daily frequency
+ combined_df["Date"] = pd.to_datetime(combined_df["Date"])
+ combined_df.set_index("Date", inplace=True)
+
+ # Fill missing dates
+ min_date = combined_df.index.min()
+ max_date = combined_df.index.max()
+ daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
+ combined_df = combined_df.reindex(daily_index)
+ combined_df.index.name = "Date"
+ combined_df = combined_df.ffill()
+
+ # Aggregate to weekly frequency
+ combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
+ weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
+
+ # Fill NaN values with 0
+ weekly_df = weekly_df.fillna(0)
+
+ # Clean column names
+ def clean_column_name(name):
+ name = re.sub(r"[^\w\s]", "", name)
+ return f"macro_{name.lower()}"
+
+ weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
+
+ return weekly_df
+
+ else:
+ print("No data available to process.")
+ return pd.DataFrame()
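Both methods added above return a weekly frame keyed on an 'OBS' week-commencing column, with values forward-filled to daily frequency, averaged by week, and renamed with a macro_ prefix. A rough sketch of combining them (illustrative; it needs live access to Yahoo Finance and the Beta ONS API, and the exact columns depend on what those services return):

```python
from imsciences import datapull

pull = datapull()

prices = pull.pull_yfinance(['^FTMC', '^IXIC'], 'mon')        # added to the default FTSE/GBP/S&P tickers
macro = pull.pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')  # ONS CDIDs plus the fast_food sector defaults

# Both frames share the weekly 'OBS' column, so they join directly.
combined = prices.merge(macro, on='OBS', how='inner')
print(combined.head())
```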
imsciences-0.6.3.2.dist-info/METADATA → imsciences-0.8.dist-info/METADATA CHANGED
@@ -1,10 +1,9 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.6.3.2
+ Version: 0.8
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
- License: MIT
  Keywords: python,data processing,apis
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
@@ -20,14 +19,34 @@ Requires-Dist: fredapi
  Requires-Dist: requests-cache
  Requires-Dist: geopy
  Requires-Dist: bs4
+ Requires-Dist: yfinance
+ Requires-Dist: holidays

  # IMS Package Documentation

- The IMS package is a python library for processing incoming data into a format that can be used for specifically for econometrics projects that use weekly timeseries data. IMS processing offers a variety of functions to manipulate and analyze data efficiently. Here are the functionalities provided by the package:
+ The **IMSciences package** is a Python library designed to process incoming data into a format tailored for econometrics projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data manipulation and analysis.
+
+ ---
+
+ ## Key Features
+ - Seamless data processing for econometrics workflows.
+ - Aggregation, filtering, and transformation of time series data.
+ - Integration with external data sources like FRED, Bank of England, ONS and OECD.
+
+ ---
+
+ ## Table of Contents
+
+ 1. [Data Processing](#data-processing)
+ 2. [Data Pulling](#data-pulling)
+ 3. [Installation](#installation)
+ 4. [Usage](#usage)
+ 5. [License](#license)
+
+ ---

  ## Data Processing

- # Function Descriptions and Usage Examples

  ## 1. `get_wd_levels`
  - **Description**: Get the working directory with the option of moving up parents.
@@ -333,51 +352,82 @@ The IMS package is a python library for processing incoming data into a format t

  ## Data Pulling

- ## 1. `pull_fred_data`
+ ## 1. pull_fred_data
  - **Description**: Fetch data from FRED using series ID tokens.
- - **Usage**: `pull_fred_data(week_commencing, series_id_list)`
- - **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`
+ - **Usage**: pull_fred_data(week_commencing, series_id_list)
+ - **Example**: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])

  ---

- ## 2. `pull_boe_data`
+ ## 2. pull_boe_data
  - **Description**: Fetch and process Bank of England interest rate data.
- - **Usage**: `pull_boe_data(week_commencing)`
- - **Example**: `pull_boe_data('mon')`
-
- ---
-
- ## 3. `pull_ons_data`
- - **Description**: Fetch and process time series data from the ONS API.
- - **Usage**: `pull_ons_data(series_list, week_commencing)`
- - **Example**: `pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')`
+ - **Usage**: pull_boe_data(week_commencing)
+ - **Example**: pull_boe_data('mon')

  ---

- ## 4. `pull_oecd`
+ ## 3. pull_oecd
  - **Description**: Fetch macroeconomic data from OECD for a specified country.
- - **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='1950-01-01')`
- - **Example**: `pull_oecd('GBR', 'mon', '1950-01-01')`
+ - **Usage**: pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')
+ - **Example**: pull_oecd('GBR', 'mon', '2000-01-01')

  ---

- ## 5. `get_google_mobility_data`
+ ## 4. get_google_mobility_data
  - **Description**: Fetch Google Mobility data for the specified country.
- - **Usage**: `get_google_mobility_data(country, wc)`
- - **Example**: `get_google_mobility_data('United Kingdom', 'mon')`
+ - **Usage**: get_google_mobility_data(country, wc)
+ - **Example**: get_google_mobility_data('United Kingdom', 'mon')

  ---

- ## 6. `pull_combined_dummies`
+ ## 5. pull_seasonality
  - **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
- - **Usage**: `pull_combined_dummies(week_commencing)`
- - **Example**: `pull_combined_dummies('mon')`
+ - **Usage**: pull_seasonality(week_commencing, start_date, countries)
+ - **Example**: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])

  ---

- ## 7. `pull_weather`
+ ## 6. pull_weather
  - **Description**: Fetch and process historical weather data for the specified country.
- - **Usage**: `pull_weather(week_commencing, country)`
- - **Example**: `pull_weather('mon', 'GBR')`
+ - **Usage**: pull_weather(week_commencing, country)
+ - **Example**: pull_weather('mon', 'GBR')
+
+ ---
+
+ ## 7. pull_macro_ons_uk
+ - **Description**: Fetch and process time series data from the Beta ONS API.
+ - **Usage**: pull_macro_ons_uk(additional_list, week_commencing, sector)
+ - **Example**: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')
+
+ ---
+
+ ## 8. pull_yfinance
+ - **Description**: Fetch and process time series data from Yahoo Finance.
+ - **Usage**: pull_yfinance(tickers, week_start_day)
+ - **Example**: pull_yfinance(['^FTMC', '^IXIC'], 'mon')
+
+ ## Installation
+
+ Install the IMS package via pip:
+
+ ```bash
+ pip install imsciences
+ ```
+
+ ---
+
+ ## Usage
+
+ ```python
+ from imsciences import *
+ ims = dataprocessing()
+ ims_pull = datapull()
+ ```
+
+ ---
+
+ ## License
+
+ This project is licensed under the MIT License.

  ---
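The Usage block in the new README only constructs the two entry points. A slightly fuller sketch of a typical weekly workflow, using the call signatures documented in the Data Pulling list above (illustrative only; the exact columns depend on the data returned):

```python
from imsciences import datapull

ims_pull = datapull()

# Weekly seasonality dummies (with Constant and Trend columns) plus market data,
# both keyed on the 'OBS' week-commencing column.
seas = ims_pull.pull_seasonality('mon', '2020-01-01', ['GB'])
fx = ims_pull.pull_yfinance(['GBPUSD=X'], 'mon')
model_base = seas.merge(fx, on='OBS', how='left')
```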
imsciences-0.6.3.2.dist-info/RECORD → imsciences-0.8.dist-info/RECORD CHANGED
@@ -1,17 +1,17 @@
  dataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  dataprocessing/data-processing-functions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
  dataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
- imsciences/__init__.py,sha256=0IwH7R_2N8vimJJo2DLzIG1hq9ddn8gB6ijlLrQemZs,122
+ imsciences/__init__.py,sha256=7CfK2dMjPnBBw6I4st-20MdMlLjZULviFVXF2eMD9NI,80
  imsciences/datafunctions-IMS-24Ltp-3.py,sha256=3Snv-0iE_03StmyjtT-riOU9f4v8TaJWLoyZLJp6l8Y,141406
- imsciences/datafunctions.py,sha256=lvvodU8dZ9IN_GS7FYMuft9ZsQkD2BMIGQxLiN8GY7c,151557
+ imsciences/datafunctions.py,sha256=KbZuvjJF-1gydPsb2qFlvpbVLwuG6y-lhLKt-wZ5JDI,156389
  imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
  imsciences/unittesting.py,sha256=d9H5HN8y7oof59hqN9mGqkjulExqFd93BEW-X8w_Id8,58142
  imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
  imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
- imsciences-0.6.3.2.dist-info/METADATA,sha256=k22-OJm6rdvDU7mubqDGW1K9Z-inek4VCQ4HdAw51cA,16981
- imsciences-0.6.3.2.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
- imsciences-0.6.3.2.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
- imsciences-0.6.3.2.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
- imsciences-0.6.3.2.dist-info/RECORD,,
+ imsciences-0.8.dist-info/METADATA,sha256=moylR64i_w4kk3TPPZMpFmAPc9f0A4xJgjAY-Zy-Tac,17845
+ imsciences-0.8.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
+ imsciences-0.8.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
+ imsciences-0.8.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
+ imsciences-0.8.dist-info/RECORD,,