imsciences 0.6.3.2__py3-none-any.whl → 0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/__init__.py CHANGED
@@ -1,3 +1,2 @@
  from .datafunctions import dataprocessing
  from .datafunctions import datapull
- from .unittesting import TestDataProcessor
imsciences/datafunctions.py CHANGED
@@ -4,21 +4,18 @@ import os
  import plotly.express as px
  import plotly.graph_objs as go
  import numpy as np
- import datetime
  import re
  from fredapi import Fred
  import time
- from datetime import datetime, timedelta # noqa: F811
+ from datetime import datetime, timedelta
  from io import StringIO
- import urllib
- import requests_cache # noqa: F401
- import urllib.request # noqa: F401
  import requests
- from geopy.geocoders import Nominatim # noqa: F401
  import subprocess
  import json
  import xml.etree.ElementTree as ET
  from bs4 import BeautifulSoup
+ import yfinance as yf
+ import holidays

  class dataprocessing:

@@ -1767,17 +1764,6 @@ class dataprocessing:
  ########################################################################################################################################
  ########################################################################################################################################

-
-
-
-
-
-
-
-
-
-
-
  ims_proc = dataprocessing()

  class datapull:
@@ -1788,38 +1774,43 @@ class datapull:
  print("\n1. pull_fred_data")
  print(" - Description: Get data from FRED by using series id tokens.")
  print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
- print(" - Example: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")
+ print(" - Example: pull_fred_data('mon', ['GPDIC1'])")

  print("\n2. pull_boe_data")
  print(" - Description: Fetch and process Bank of England interest rate data.")
  print(" - Usage: pull_boe_data(week_commencing)")
  print(" - Example: pull_boe_data('mon')")

- print("\n3. pull_ons_data")
- print(" - Description: Fetch and process time series data from the ONS API.")
- print(" - Usage: pull_ons_data(series_list, week_commencing)")
- print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
-
- print("\n4. pull_oecd")
+ print("\n3. pull_oecd")
  print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
- print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '1950-01-01')")
- print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
+ print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')")
+ print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")

- print("\n5. get_google_mobility_data")
+ print("\n4. get_google_mobility_data")
  print(" - Description: Fetch Google Mobility data for the specified country.")
  print(" - Usage: get_google_mobility_data(country, wc)")
  print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")

- print("\n6. pull_combined_dummies")
+ print("\n5. pull_seasonality")
  print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
- print(" - Usage: pull_combined_dummies(week_commencing)")
- print(" - Example: pull_combined_dummies('mon')")
+ print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
+ print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")

- print("\n7. pull_weather")
+ print("\n6. pull_weather")
  print(" - Description: Fetch and process historical weather data for the specified country.")
  print(" - Usage: pull_weather(week_commencing, country)")
  print(" - Example: pull_weather('mon', 'GBR')")
-
+
+ print("\n7. pull_macro_ons_uk")
+ print(" - Description: Fetch and process time series data from the Beta ONS API.")
+ print(" - Usage: pull_macro_ons_uk(additional_list, week_commencing, sector)")
+ print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
+
+ print("\n8. pull_yfinance")
+ print(" - Description: Fetch and process time series data from Yahoo Finance.")
+ print(" - Usage: pull_yfinance(tickers, week_start_day)")
+ print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
+
  ############################################################### MACRO ##########################################################################

  def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
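The renumbered help text above is the quickest summary of the 0.8 datapull surface: pull_ons_data is gone, and pull_seasonality, pull_macro_ons_uk and pull_yfinance arrive alongside the renumbered methods. A rough usage sketch based only on the call signatures printed above (it assumes the package and its fredapi/holidays/yfinance dependencies are installed and that network access is available; it is not code from the package itself):

```python
# Illustrative only: signatures copied from the 0.8 help text above.
from imsciences import datapull

pull = datapull()

# FRED series, aggregated to weeks commencing Monday
fred_df = pull.pull_fred_data('mon', ['GPDIC1'])

# Seasonality dummies from 2020 onwards, including US and GB public holidays
seas_df = pull.pull_seasonality('mon', '2020-01-01', ['US', 'GB'])

# UK macro series from the Beta ONS API, on top of the 'fast_food' sector defaults
ons_df = pull.pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')
```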
@@ -1837,10 +1828,6 @@ class datapull:
  ----------
  pd.DataFrame
  Return a data frame with FRED data according to the series IDs provided
-
- Example
- ----------
- pull_fred_data("mon", ["GCEC1", "SP500"])
  '''
  # Fred API
  fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
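The Example block removed above simply repeated the usage line. The hunk also shows that pull_fred_data wraps the fredapi client with a key hard-coded in the package; for reference, the underlying fredapi call pattern looks roughly like this (a generic fredapi sketch with a placeholder key, not the package's internal code, which this hunk does not show in full):

```python
from fredapi import Fred

# Placeholder key: supply your own FRED API key.
fred = Fred(api_key="your-fred-api-key")

# get_series returns a pandas Series of observations indexed by date.
gross_investment = fred.get_series("GPDIC1")
print(gross_investment.tail())
```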
@@ -1958,107 +1945,7 @@ class datapull:

  return df_final

- def pull_ons_data(self, series_list, week_commencing):
- """
- Fetch and process time series data from the ONS API.
-
- Args:
- series_list (list): A list of dictionaries where each dictionary represents a time series.
- Each dictionary should have the keys 'series_id' and 'dataset_id'.
- week_commencing (str): The starting day of the week for aggregation.
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
-
- Returns:
- pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
- commencing dates and other columns contain the aggregated time series values.
- """
-
- def parse_quarter(date_str):
- """Parses a string in 'YYYY Q#' format into a datetime object."""
- year, quarter = date_str.split(' ')
- quarter_number = int(quarter[1])
- month = (quarter_number - 1) * 3 + 1
- return pd.Timestamp(f"{year}-{month:02d}-01")
-
- # Generate a date range from 1950-01-01 to today
- date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
- daily_df = pd.DataFrame(date_range, columns=['OBS'])
-
- # Keep track of the renamed value columns
- value_columns = []
-
- for series in series_list:
- series_id = series['series_id']
- dataset_id = series['dataset_id']
-
- # Construct the URL for data
- data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
-
- # Make the request to the ONS API for data
- data_response = requests.get(data_url)
-
- # Check if the request was successful
- if data_response.status_code != 200:
- print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
- continue
-
- # Parse the JSON response for data
- data = data_response.json()
-
- # Attempt to extract the name of the time series from the data response
- series_name = data.get('description', {}).get('title', 'Value')
-
- # Determine the most granular time series data available
- if 'months' in data and data['months']:
- time_series_data = data['months']
- elif 'quarters' in data and data['quarters']:
- time_series_data = data['quarters']
- elif 'years' in data and data['years']:
- time_series_data = data['years']
- else:
- print("No time series data found in the response")
- continue
-
- # Create a DataFrame from the time series data
- df = pd.DataFrame(time_series_data)
-
- # Handle different frequencies in the data
- if 'date' in df.columns:
- if any(df['date'].str.contains('Q')):
- df['date'] = df['date'].apply(parse_quarter)
- else:
- df['date'] = pd.to_datetime(df['date'])
-
- df = df.rename(columns={'date': 'OBS', 'value': series_name})
-
- # Rename the value column
- new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
- df = df.rename(columns={series_name: new_col_name})
-
- # Track the renamed value column
- value_columns.append(new_col_name)
-
- # Merge the data based on the observation date
- daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
-
- # Ensure columns are numeric
- for col in value_columns:
- if col in daily_df.columns:
- daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
- else:
- print(f"Column {col} not found in daily_df")
-
- # Aggregate results by week
- ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
- date_column="OBS",
- group_columns=[],
- sum_columns=value_columns,
- wc=week_commencing,
- aggregation="average")
-
- return ons_df_final
-
- def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
  """
  Fetch and process time series data from the OECD API.

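Both the removed pull_ons_data above and the new pull_macro_ons_uk added later in this diff turn ONS quarterly labels such as '2023 Q3' into the first day of the quarter before resampling. A standalone sketch of that mapping (the function name here is illustrative, not part of the package):

```python
import pandas as pd

def parse_quarter(label: str) -> pd.Timestamp:
    # '2023 Q3' -> quarter 3 -> month (3 - 1) * 3 + 1 = 7 -> 2023-07-01
    year, quarter = label.split(' ')
    month = (int(quarter[1]) - 1) * 3 + 1
    return pd.Timestamp(f"{year}-{month:02d}-01")

assert parse_quarter('2023 Q3') == pd.Timestamp('2023-07-01')
assert parse_quarter('1998 Q1') == pd.Timestamp('1998-01-01')
```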
@@ -2235,12 +2122,12 @@ class datapull:

  ############################################################### Seasonality ##########################################################################

- def pull_combined_dummies(self, week_commencing):
+ def pull_seasonality(self, week_commencing, start_date, countries):
  # Week commencing dictionary
  day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

- # Create daily date range dataframe
- date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
+ # Create daily date range dataframe starting from start_date
+ date_range = pd.date_range(start=pd.to_datetime(start_date), end=datetime.today(), freq="d")
  df_daily = pd.DataFrame(date_range, columns=["Date"])

  # Create weekly date range dataframe
@@ -2250,7 +2137,7 @@ class datapull:

  df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
  df_weekly_start.set_index("Date", inplace=True)
-
+
  # Create individual weekly dummies
  dummy_columns = {}
  for i in range(len(df_weekly_start)):
@@ -2260,84 +2147,59 @@ class datapull:

  df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
  df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-
- # Create monthly dummies
+
+ # Add public holidays for each country and holiday type
+ for country in countries:
+ country_holidays = holidays.CountryHoliday(country, years=range(int(start_date[:4]), datetime.today().year + 1))
+ df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(lambda x: 1 if x in country_holidays else 0)
+
+ # Extract specific holidays
+ for date, name in country_holidays.items():
+ col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
+ if col_name not in df_daily.columns:
+ df_daily[col_name] = 0
+ df_daily.loc[df_daily["Date"] == pd.Timestamp(date), col_name] = 1
+
+ # Map daily holidays to weekly aggregation
+ df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+ df_holidays = df_daily.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
+ df_holidays.set_index("Date", inplace=True)
+
+ # Create monthly dummies (separately from holidays)
  df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
- df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
+ df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"], dtype=int)
  df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
  df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-
  df_monthly_dummies.set_index("Date", inplace=True)
- df_monthly_dummies = df_monthly_dummies / 7
-
- # Combine weekly and monthly dataframes
- df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
-
+
+ # Divide only the monthly dummy columns by 7 (exclude holiday-related columns)
+ monthly_cols = [col for col in df_monthly_dummies.columns if not col.startswith("seas_holiday") and not col.startswith("seas_")]
+ df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
+
+ # Merge weekly dummies, monthly dummies, and holidays
+ df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1) # Combine weekly and monthly first
+ df_combined = pd.concat([df_combined, df_holidays], axis=1) # Add holidays separately
+
+ # Drop duplicate columns if any exist (this ensures holidays are not duplicated)
+ df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
+
  # Create weekly dummies
  df_combined.reset_index(inplace=True)
  df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
- df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)

  # Create yearly dummies
  df_combined["Year"] = df_combined["Date"].dt.year
- df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)

  # Add constant
  df_combined["Constant"] = 1

  # Add trend
  df_combined["Trend"] = df_combined.index + 1
-
- # Set date as index
- df_combined.set_index("Date", inplace=True)
-
- # Create COVID lockdown dummies
- lockdown_periods = [
- # Lockdown 1
- ("2020-03-23", "2020-05-24"),
- # Lockdown 2
- ("2020-11-05", "2020-12-02"),
- # Lockdown 3
- ("2021-01-04", "2021-03-08")
- ]
-
- df_covid = pd.DataFrame(date_range, columns=["Date"])
- df_covid["national_lockdown"] = 0
-
- for start, end in lockdown_periods:
- df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
-
- df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
- df_covid.drop("Date", axis=1, inplace=True)
- df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
- df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
- df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)
-
- df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
- df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
- df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
-
- df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
- df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)

- df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
- df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
- df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)
-
- df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
- df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)
-
- df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
- df_final_covid.reset_index(inplace=True)
- df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
-
  # Create seasonal indicators for the last day and last Friday of the month
- min_date = '2019-12-29'
- max_date = datetime.today().strftime('%Y-%m-%d')
- date_range_seas = pd.date_range(start=min_date, end=max_date)
-
- df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
- df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
+ df_combined['seas_last_day_of_month'] = df_combined["Date"].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)

  def is_last_friday(date):
  last_day_of_month = date.to_period('M').to_timestamp('M')
@@ -2349,21 +2211,12 @@ class datapull:
  last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
  return 1 if date == last_friday else 0

- df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)
+ df_combined['seas_last_friday_of_month'] = df_combined["Date"].apply(is_last_friday)

- df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
- df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
- df_seas.set_index("Date", inplace=True)
+ # Rename Date to OBS
+ df_combined.rename(columns={"Date": "OBS"}, inplace=True)

- # Combine all dataframes
- df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
- df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
- df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
-
- # Fill any NaN values with 0
- df_final_combined.fillna(0, inplace=True)
-
- return df_final_combined
+ return df_combined

  def pull_weather(self, week_commencing, country) -> pd.DataFrame:
  import pandas as pd
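The rewritten pull_seasonality drops the hard-coded 2015 start and the UK lockdown dummies in favour of a caller-supplied start_date and per-country holiday flags, but it keeps the week-commencing rule used throughout these methods: every date is shifted back to the most recent occurrence of the chosen start day via (weekday - day_dict[wc]) % 7. A small worked sketch of that expression (the helper name is illustrative):

```python
import pandas as pd

day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

def week_commencing(date: pd.Timestamp, wc: str = "mon") -> pd.Timestamp:
    # Shift back to the most recent occurrence of the chosen week-start day.
    return date - pd.Timedelta(days=(date.weekday() - day_dict[wc]) % 7)

# Wednesday 2024-01-10 belongs to the week commencing Monday 2024-01-08...
assert week_commencing(pd.Timestamp("2024-01-10"), "mon") == pd.Timestamp("2024-01-08")
# ...and to the week commencing Sunday 2024-01-07 if weeks start on Sunday.
assert week_commencing(pd.Timestamp("2024-01-10"), "sun") == pd.Timestamp("2024-01-07")
```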
@@ -2966,4 +2819,240 @@ class datapull:

  final_weather = ims_proc.rename_cols(merged_df, 'seas_')

- return final_weather
+ return final_weather
+
+ def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
+ """
+ Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+ aggregates it to weekly averages, and renames variables based on specified rules.
+
+ Parameters:
+ cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+ week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
+ sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+
+ Returns:
+ pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
+ and all series as renamed columns.
+ """
+ # Define CDIDs for sectors and defaults
+ sector_cdids = {
+ "fast_food": ["L7TD", "L78Q", "DOAD"],
+ "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
+ }
+
+ default_cdids = sector_cdids["default"]
+ sector_specific_cdids = sector_cdids.get(sector, [])
+ standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
+
+ # Combine standard CDIDs and additional CDIDs
+ if cdid_list is None:
+ cdid_list = []
+ cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
+
+ base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
+ base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
+ combined_df = pd.DataFrame()
+
+ # Map week start day to pandas weekday convention
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ if week_start_day not in days_map:
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+ week_start = days_map[week_start_day]
+
+ for cdid in cdid_list:
+ try:
+ # Search for the series
+ search_url = f"{base_search_url}{cdid}"
+ search_response = requests.get(search_url)
+ search_response.raise_for_status()
+ search_data = search_response.json()
+
+ items = search_data.get("items", [])
+ if not items:
+ print(f"No data found for CDID: {cdid}")
+ continue
+
+ # Extract series name and latest release URI
+ series_name = items[0].get("title", f"Series_{cdid}")
+ latest_date = max(
+ datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+ for item in items if "release_date" in item
+ )
+ latest_uri = next(
+ item["uri"] for item in items
+ if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
+ )
+
+ # Fetch the dataset
+ data_url = f"{base_data_url}{latest_uri}"
+ data_response = requests.get(data_url)
+ data_response.raise_for_status()
+ data_json = data_response.json()
+
+ # Detect the frequency and process accordingly
+ if "months" in data_json and data_json["months"]:
+ frequency_key = "months"
+ elif "quarters" in data_json and data_json["quarters"]:
+ frequency_key = "quarters"
+ elif "years" in data_json and data_json["years"]:
+ frequency_key = "years"
+ else:
+ print(f"Unsupported frequency or no data for CDID: {cdid}")
+ continue
+
+ # Prepare the DataFrame
+ df = pd.DataFrame(data_json[frequency_key])
+
+ # Parse the 'date' field based on frequency
+ if frequency_key == "months":
+ df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+ elif frequency_key == "quarters":
+ def parse_quarter(quarter_str):
+ year, qtr = quarter_str.split(" Q")
+ month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+ return datetime(int(year), month, 1)
+ df["date"] = df["date"].apply(parse_quarter)
+ elif frequency_key == "years":
+ df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+
+ df["value"] = pd.to_numeric(df["value"], errors="coerce")
+ df.rename(columns={"value": series_name}, inplace=True)
+
+ # Combine data
+ df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+ if combined_df.empty:
+ combined_df = df
+ else:
+ combined_df = pd.merge(combined_df, df, on="date", how="outer")
+
+ except requests.exceptions.RequestException as e:
+ print(f"Error fetching data for CDID {cdid}: {e}")
+ except (KeyError, ValueError) as e:
+ print(f"Error processing data for CDID {cdid}: {e}")
+
+ if not combined_df.empty:
+ min_date = combined_df["date"].min()
+ max_date = datetime.today()
+ date_range = pd.date_range(start=min_date, end=max_date, freq='D')
+ daily_df = pd.DataFrame(date_range, columns=['date'])
+ daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
+ daily_df = daily_df.ffill()
+
+ # Aggregate to weekly frequency
+ daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+ weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
+
+ def clean_column_name(name):
+ name = re.sub(r"\(.*?\)", "", name)
+ name = re.split(r":", name)[0]
+ name = re.sub(r"\d+", "", name)
+ name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+ name = re.sub(r"[^\w\s]", "", name)
+ name = name.replace(" ", "_")
+ name = re.sub(r"_+", "_", name)
+ name = name.rstrip("_")
+ return f"macro_{name.lower()}_uk"
+
+ weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
+ weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+
+ weekly_df = weekly_df.fillna(0)
+
+ return weekly_df
+ else:
+ print("No data available to process.")
+ return pd.DataFrame()
+
+ def pull_yfinance(self, tickers=None, week_start_day="mon"):
+ """
+ Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
+ aggregates it to weekly averages, and renames variables.
+
+ Parameters:
+ tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
+ week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
+
+ Returns:
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
+ and aggregated stock data for the specified tickers, with NaN values filled with 0.
+ """
+ # Define default tickers
+ default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
+
+ # Combine default tickers with additional ones
+ if tickers is None:
+ tickers = []
+ tickers = list(set(default_tickers + tickers)) # Ensure no duplicates
+
+ # Automatically set end_date to today
+ end_date = datetime.today().strftime("%Y-%m-%d")
+
+ # Mapping week start day to pandas weekday convention
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ if week_start_day not in days_map:
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
+ week_start = days_map[week_start_day]
+
+ # Fetch data for all tickers without specifying a start date to get all available data
+ data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
+
+ # Process the data
+ combined_df = pd.DataFrame()
+ for ticker in tickers:
+ try:
+ # Extract the ticker's data
+ ticker_data = data[ticker] if len(tickers) > 1 else data
+ ticker_data = ticker_data.reset_index()
+
+ # Ensure necessary columns are present
+ if "Close" not in ticker_data.columns:
+ raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
+
+ # Keep only relevant columns
+ ticker_data = ticker_data[["Date", "Close"]]
+ ticker_data.rename(columns={"Close": ticker}, inplace=True)
+
+ # Merge data
+ if combined_df.empty:
+ combined_df = ticker_data
+ else:
+ combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")
+
+ except KeyError:
+ print(f"Data for ticker {ticker} not available.")
+ except Exception as e:
+ print(f"Error processing ticker {ticker}: {e}")
+
+ if not combined_df.empty:
+ # Convert to daily frequency
+ combined_df["Date"] = pd.to_datetime(combined_df["Date"])
+ combined_df.set_index("Date", inplace=True)
+
+ # Fill missing dates
+ min_date = combined_df.index.min()
+ max_date = combined_df.index.max()
+ daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
+ combined_df = combined_df.reindex(daily_index)
+ combined_df.index.name = "Date"
+ combined_df = combined_df.ffill()
+
+ # Aggregate to weekly frequency
+ combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
+ weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
+
+ # Fill NaN values with 0
+ weekly_df = weekly_df.fillna(0)
+
+ # Clean column names
+ def clean_column_name(name):
+ name = re.sub(r"[^\w\s]", "", name)
+ return f"macro_{name.lower()}"
+
+ weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
+
+ return weekly_df
+
+ else:
+ print("No data available to process.")
+ return pd.DataFrame()
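Both methods added above return a weekly frame keyed on an 'OBS' week-commencing column, with values forward-filled to daily frequency, averaged by week, and renamed with a macro_ prefix. A rough sketch of combining them (illustrative; it needs live access to Yahoo Finance and the Beta ONS API, and the exact columns depend on what those services return):

```python
from imsciences import datapull

pull = datapull()

prices = pull.pull_yfinance(['^FTMC', '^IXIC'], 'mon')        # added to the default FTSE/GBP/S&P tickers
macro = pull.pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')  # ONS CDIDs plus the fast_food sector defaults

# Both frames share the weekly 'OBS' column, so they join directly.
combined = prices.merge(macro, on='OBS', how='inner')
print(combined.head())
```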
imsciences-0.6.3.2.dist-info/METADATA → imsciences-0.8.dist-info/METADATA CHANGED
@@ -1,10 +1,9 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.6.3.2
+ Version: 0.8
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
- License: MIT
  Keywords: python,data processing,apis
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
@@ -20,14 +19,34 @@ Requires-Dist: fredapi
  Requires-Dist: requests-cache
  Requires-Dist: geopy
  Requires-Dist: bs4
+ Requires-Dist: yfinance
+ Requires-Dist: holidays

  # IMS Package Documentation

- The IMS package is a python library for processing incoming data into a format that can be used for specifically for econometrics projects that use weekly timeseries data. IMS processing offers a variety of functions to manipulate and analyze data efficiently. Here are the functionalities provided by the package:
+ The **IMSciences package** is a Python library designed to process incoming data into a format tailored for econometrics projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data manipulation and analysis.
+
+ ---
+
+ ## Key Features
+ - Seamless data processing for econometrics workflows.
+ - Aggregation, filtering, and transformation of time series data.
+ - Integration with external data sources like FRED, Bank of England, ONS and OECD.
+
+ ---
+
+ ## Table of Contents
+
+ 1. [Data Processing](#data-processing)
+ 2. [Data Pulling](#data-pulling)
+ 3. [Installation](#installation)
+ 4. [Usage](#usage)
+ 5. [License](#license)
+
+ ---

  ## Data Processing

- # Function Descriptions and Usage Examples

  ## 1. `get_wd_levels`
  - **Description**: Get the working directory with the option of moving up parents.
@@ -333,51 +352,82 @@ The IMS package is a python library for processing incoming data into a format t

  ## Data Pulling

- ## 1. `pull_fred_data`
+ ## 1. pull_fred_data
  - **Description**: Fetch data from FRED using series ID tokens.
- - **Usage**: `pull_fred_data(week_commencing, series_id_list)`
- - **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`
+ - **Usage**: pull_fred_data(week_commencing, series_id_list)
+ - **Example**: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])

  ---

- ## 2. `pull_boe_data`
+ ## 2. pull_boe_data
  - **Description**: Fetch and process Bank of England interest rate data.
- - **Usage**: `pull_boe_data(week_commencing)`
- - **Example**: `pull_boe_data('mon')`
-
- ---
-
- ## 3. `pull_ons_data`
- - **Description**: Fetch and process time series data from the ONS API.
- - **Usage**: `pull_ons_data(series_list, week_commencing)`
- - **Example**: `pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')`
+ - **Usage**: pull_boe_data(week_commencing)
+ - **Example**: pull_boe_data('mon')

  ---

- ## 4. `pull_oecd`
+ ## 3. pull_oecd
  - **Description**: Fetch macroeconomic data from OECD for a specified country.
- - **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='1950-01-01')`
- - **Example**: `pull_oecd('GBR', 'mon', '1950-01-01')`
+ - **Usage**: pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')
+ - **Example**: pull_oecd('GBR', 'mon', '2000-01-01')

  ---

- ## 5. `get_google_mobility_data`
+ ## 4. get_google_mobility_data
  - **Description**: Fetch Google Mobility data for the specified country.
- - **Usage**: `get_google_mobility_data(country, wc)`
- - **Example**: `get_google_mobility_data('United Kingdom', 'mon')`
+ - **Usage**: get_google_mobility_data(country, wc)
+ - **Example**: get_google_mobility_data('United Kingdom', 'mon')

  ---

- ## 6. `pull_combined_dummies`
+ ## 5. pull_seasonality
  - **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
- - **Usage**: `pull_combined_dummies(week_commencing)`
- - **Example**: `pull_combined_dummies('mon')`
+ - **Usage**: pull_seasonality(week_commencing, start_date, countries)
+ - **Example**: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])

  ---

- ## 7. `pull_weather`
+ ## 6. pull_weather
  - **Description**: Fetch and process historical weather data for the specified country.
- - **Usage**: `pull_weather(week_commencing, country)`
- - **Example**: `pull_weather('mon', 'GBR')`
+ - **Usage**: pull_weather(week_commencing, country)
+ - **Example**: pull_weather('mon', 'GBR')
+
+ ---
+
+ ## 7. pull_macro_ons_uk
+ - **Description**: Fetch and process time series data from the Beta ONS API.
+ - **Usage**: pull_macro_ons_uk(additional_list, week_commencing, sector)
+ - **Example**: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')
+
+ ---
+
+ ## 8. pull_yfinance
+ - **Description**: Fetch and process time series data from Yahoo Finance.
+ - **Usage**: pull_yfinance(tickers, week_start_day)
+ - **Example**: pull_yfinance(['^FTMC', '^IXIC'], 'mon')
+
+ ## Installation
+
+ Install the IMS package via pip:
+
+ ```bash
+ pip install imsciences
+ ```
+
+ ---
+
+ ## Usage
+
+ ```python
+ from imsciences import *
+ ims = dataprocessing()
+ ims_pull = datapull()
+ ```
+
+ ---
+
+ ## License
+
+ This project is licensed under the MIT License.

  ---
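The Usage block in the new README only constructs the two entry points. A slightly fuller sketch of a typical weekly workflow, using the call signatures documented in the Data Pulling list above (illustrative only; the exact columns depend on the data returned):

```python
from imsciences import datapull

ims_pull = datapull()

# Weekly seasonality dummies (with Constant and Trend columns) plus market data,
# both keyed on the 'OBS' week-commencing column.
seas = ims_pull.pull_seasonality('mon', '2020-01-01', ['GB'])
fx = ims_pull.pull_yfinance(['GBPUSD=X'], 'mon')
model_base = seas.merge(fx, on='OBS', how='left')
```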
imsciences-0.6.3.2.dist-info/RECORD → imsciences-0.8.dist-info/RECORD CHANGED
@@ -1,17 +1,17 @@
  dataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  dataprocessing/data-processing-functions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
  dataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
- imsciences/__init__.py,sha256=0IwH7R_2N8vimJJo2DLzIG1hq9ddn8gB6ijlLrQemZs,122
+ imsciences/__init__.py,sha256=7CfK2dMjPnBBw6I4st-20MdMlLjZULviFVXF2eMD9NI,80
  imsciences/datafunctions-IMS-24Ltp-3.py,sha256=3Snv-0iE_03StmyjtT-riOU9f4v8TaJWLoyZLJp6l8Y,141406
- imsciences/datafunctions.py,sha256=lvvodU8dZ9IN_GS7FYMuft9ZsQkD2BMIGQxLiN8GY7c,151557
+ imsciences/datafunctions.py,sha256=KbZuvjJF-1gydPsb2qFlvpbVLwuG6y-lhLKt-wZ5JDI,156389
  imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
  imsciences/unittesting.py,sha256=d9H5HN8y7oof59hqN9mGqkjulExqFd93BEW-X8w_Id8,58142
  imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
  imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
- imsciences-0.6.3.2.dist-info/METADATA,sha256=k22-OJm6rdvDU7mubqDGW1K9Z-inek4VCQ4HdAw51cA,16981
- imsciences-0.6.3.2.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
- imsciences-0.6.3.2.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
- imsciences-0.6.3.2.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
- imsciences-0.6.3.2.dist-info/RECORD,,
+ imsciences-0.8.dist-info/METADATA,sha256=moylR64i_w4kk3TPPZMpFmAPc9f0A4xJgjAY-Zy-Tac,17845
+ imsciences-0.8.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
+ imsciences-0.8.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
+ imsciences-0.8.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
+ imsciences-0.8.dist-info/RECORD,,