imsciences 0.6.3.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/__init__.py CHANGED
@@ -1,3 +1,2 @@
1
1
  from .datafunctions import dataprocessing
2
2
  from .datafunctions import datapull
3
- from .unittesting import TestDataProcessor
@@ -4,21 +4,18 @@ import os
4
4
  import plotly.express as px
5
5
  import plotly.graph_objs as go
6
6
  import numpy as np
7
- import datetime
8
7
  import re
9
8
  from fredapi import Fred
10
9
  import time
11
- from datetime import datetime, timedelta # noqa: F811
10
+ from datetime import datetime, timedelta
12
11
  from io import StringIO
13
- import urllib
14
- import requests_cache # noqa: F401
15
- import urllib.request # noqa: F401
16
12
  import requests
17
- from geopy.geocoders import Nominatim # noqa: F401
18
13
  import subprocess
19
14
  import json
20
15
  import xml.etree.ElementTree as ET
21
16
  from bs4 import BeautifulSoup
17
+ import yfinance as yf
18
+ import holidays
22
19
 
23
20
  class dataprocessing:
24
21
 
@@ -1767,17 +1764,6 @@ class dataprocessing:
1767
1764
  ########################################################################################################################################
1768
1765
  ########################################################################################################################################
1769
1766
 
1770
-
1771
-
1772
-
1773
-
1774
-
1775
-
1776
-
1777
-
1778
-
1779
-
1780
-
1781
1767
  ims_proc = dataprocessing()
1782
1768
 
1783
1769
  class datapull:
@@ -1788,38 +1774,43 @@ class datapull:
1788
1774
  print("\n1. pull_fred_data")
1789
1775
  print(" - Description: Get data from FRED by using series id tokens.")
1790
1776
  print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
1791
- print(" - Example: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")
1777
+ print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
1792
1778
 
1793
1779
  print("\n2. pull_boe_data")
1794
1780
  print(" - Description: Fetch and process Bank of England interest rate data.")
1795
1781
  print(" - Usage: pull_boe_data(week_commencing)")
1796
1782
  print(" - Example: pull_boe_data('mon')")
1797
1783
 
1798
- print("\n3. pull_ons_data")
1799
- print(" - Description: Fetch and process time series data from the ONS API.")
1800
- print(" - Usage: pull_ons_data(series_list, week_commencing)")
1801
- print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
1802
-
1803
- print("\n4. pull_oecd")
1784
+ print("\n3. pull_oecd")
1804
1785
  print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
1805
- print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '1950-01-01')")
1806
- print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
1786
+ print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')")
1787
+ print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
1807
1788
 
1808
- print("\n5. get_google_mobility_data")
1789
+ print("\n4. get_google_mobility_data")
1809
1790
  print(" - Description: Fetch Google Mobility data for the specified country.")
1810
1791
  print(" - Usage: get_google_mobility_data(country, wc)")
1811
1792
  print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
1812
1793
 
1813
- print("\n6. pull_combined_dummies")
1794
+ print("\n5. pull_seasonality")
1814
1795
  print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
1815
- print(" - Usage: pull_combined_dummies(week_commencing)")
1816
- print(" - Example: pull_combined_dummies('mon')")
1796
+ print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
1797
+ print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
1817
1798
 
1818
- print("\n7. pull_weather")
1799
+ print("\n6. pull_weather")
1819
1800
  print(" - Description: Fetch and process historical weather data for the specified country.")
1820
1801
  print(" - Usage: pull_weather(week_commencing, country)")
1821
1802
  print(" - Example: pull_weather('mon', 'GBR')")
1822
-
1803
+
1804
+ print("\n7. pull_macro_ons_uk")
1805
+ print(" - Description: Fetch and process time series data from the Beta ONS API.")
1806
+ print(" - Usage: pull_macro_ons_uk(aditional_list, week_commencing, sector)")
1807
+ print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
1808
+
1809
+ print("\n8. pull_yfinance")
1810
+ print(" - Description: Fetch and process time series data from the Beta ONS API.")
1811
+ print(" - Usage: pull_yfinance(tickers, week_start_day)")
1812
+ print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
1813
+
1823
1814
  ############################################################### MACRO ##########################################################################
1824
1815
 
1825
1816
  def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
@@ -1837,10 +1828,6 @@ class datapull:
1837
1828
  ----------
1838
1829
  pd.DataFrame
1839
1830
  Return a data frame with FRED data according to the series IDs provided
1840
-
1841
- Example
1842
- ----------
1843
- pull_fred_data("mon", ["GCEC1", "SP500"])
1844
1831
  '''
1845
1832
  # Fred API
1846
1833
  fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
@@ -1958,107 +1945,7 @@ class datapull:
1958
1945
 
1959
1946
  return df_final
1960
1947
 
1961
- def pull_ons_data(self, series_list, week_commencing):
1962
- """
1963
- Fetch and process time series data from the ONS API.
1964
-
1965
- Args:
1966
- series_list (list): A list of dictionaries where each dictionary represents a time series.
1967
- Each dictionary should have the keys 'series_id' and 'dataset_id'.
1968
- week_commencing (str): The starting day of the week for aggregation.
1969
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
1970
-
1971
- Returns:
1972
- pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
1973
- commencing dates and other columns contain the aggregated time series values.
1974
- """
1975
-
1976
- def parse_quarter(date_str):
1977
- """Parses a string in 'YYYY Q#' format into a datetime object."""
1978
- year, quarter = date_str.split(' ')
1979
- quarter_number = int(quarter[1])
1980
- month = (quarter_number - 1) * 3 + 1
1981
- return pd.Timestamp(f"{year}-{month:02d}-01")
1982
-
1983
- # Generate a date range from 1950-01-01 to today
1984
- date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
1985
- daily_df = pd.DataFrame(date_range, columns=['OBS'])
1986
-
1987
- # Keep track of the renamed value columns
1988
- value_columns = []
1989
-
1990
- for series in series_list:
1991
- series_id = series['series_id']
1992
- dataset_id = series['dataset_id']
1993
-
1994
- # Construct the URL for data
1995
- data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
1996
-
1997
- # Make the request to the ONS API for data
1998
- data_response = requests.get(data_url)
1999
-
2000
- # Check if the request was successful
2001
- if data_response.status_code != 200:
2002
- print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
2003
- continue
2004
-
2005
- # Parse the JSON response for data
2006
- data = data_response.json()
2007
-
2008
- # Attempt to extract the name of the time series from the data response
2009
- series_name = data.get('description', {}).get('title', 'Value')
2010
-
2011
- # Determine the most granular time series data available
2012
- if 'months' in data and data['months']:
2013
- time_series_data = data['months']
2014
- elif 'quarters' in data and data['quarters']:
2015
- time_series_data = data['quarters']
2016
- elif 'years' in data and data['years']:
2017
- time_series_data = data['years']
2018
- else:
2019
- print("No time series data found in the response")
2020
- continue
2021
-
2022
- # Create a DataFrame from the time series data
2023
- df = pd.DataFrame(time_series_data)
2024
-
2025
- # Handle different frequencies in the data
2026
- if 'date' in df.columns:
2027
- if any(df['date'].str.contains('Q')):
2028
- df['date'] = df['date'].apply(parse_quarter)
2029
- else:
2030
- df['date'] = pd.to_datetime(df['date'])
2031
-
2032
- df = df.rename(columns={'date': 'OBS', 'value': series_name})
2033
-
2034
- # Rename the value column
2035
- new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
2036
- df = df.rename(columns={series_name: new_col_name})
2037
-
2038
- # Track the renamed value column
2039
- value_columns.append(new_col_name)
2040
-
2041
- # Merge the data based on the observation date
2042
- daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
2043
-
2044
- # Ensure columns are numeric
2045
- for col in value_columns:
2046
- if col in daily_df.columns:
2047
- daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
2048
- else:
2049
- print(f"Column {col} not found in daily_df")
2050
-
2051
- # Aggregate results by week
2052
- ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
2053
- date_column="OBS",
2054
- group_columns=[],
2055
- sum_columns=value_columns,
2056
- wc=week_commencing,
2057
- aggregation="average")
2058
-
2059
- return ons_df_final
2060
-
2061
- def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
1948
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
2062
1949
  """
2063
1950
  Fetch and process time series data from the OECD API.
2064
1951
 
@@ -2235,135 +2122,160 @@ class datapull:
2235
2122
 
2236
2123
  ############################################################### Seasonality ##########################################################################
2237
2124
 
2238
- def pull_combined_dummies(self, week_commencing):
2125
+ def pull_seasonality(self, week_commencing, start_date, countries):
2239
2126
  # Week commencing dictionary
2240
2127
  day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
2241
2128
 
2242
- # Create daily date range dataframe
2243
- date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
2129
+ # Create daily date range dataframe starting from start_date
2130
+ date_range = pd.date_range(
2131
+ start=pd.to_datetime(start_date),
2132
+ end=datetime.today(),
2133
+ freq="D"
2134
+ )
2244
2135
  df_daily = pd.DataFrame(date_range, columns=["Date"])
2245
-
2246
- # Create weekly date range dataframe
2247
- df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2136
+
2137
+ # ------------------------------------------------
2138
+ # 1. Identify "week_start" for each daily row
2139
+ # ------------------------------------------------
2140
+ df_daily['week_start'] = df_daily["Date"].apply(
2141
+ lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
2142
+ )
2143
+
2144
+ # ------------------------------------------------
2145
+ # 2. Build a weekly index (df_weekly_start) with dummy columns
2146
+ # ------------------------------------------------
2248
2147
  df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
2249
2148
  df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
2250
2149
 
2150
+ # Set index to weekly "start of week"
2251
2151
  df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
2252
2152
  df_weekly_start.set_index("Date", inplace=True)
2253
-
2153
+
2254
2154
  # Create individual weekly dummies
2255
2155
  dummy_columns = {}
2256
2156
  for i in range(len(df_weekly_start)):
2257
2157
  col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
2258
2158
  dummy_columns[col_name] = [0] * len(df_weekly_start)
2259
2159
  dummy_columns[col_name][i] = 1
2260
-
2160
+
2261
2161
  df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
2262
2162
  df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
2263
2163
 
2264
- # Create monthly dummies
2265
- df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
2266
- df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
2267
- df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2268
- df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
2269
-
2270
- df_monthly_dummies.set_index("Date", inplace=True)
2271
- df_monthly_dummies = df_monthly_dummies / 7
2272
-
2273
- # Combine weekly and monthly dataframes
2274
- df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
2275
-
2276
- # Create weekly dummies
2277
- df_combined.reset_index(inplace=True)
2278
- df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
2279
- df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
2280
-
2281
- # Create yearly dummies
2282
- df_combined["Year"] = df_combined["Date"].dt.year
2283
- df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
2284
-
2285
- # Add constant
2286
- df_combined["Constant"] = 1
2287
-
2288
- # Add trend
2289
- df_combined["Trend"] = df_combined.index + 1
2290
-
2291
- # Set date as index
2292
- df_combined.set_index("Date", inplace=True)
2293
-
2294
- # Create COVID lockdown dummies
2295
- lockdown_periods = [
2296
- # Lockdown 1
2297
- ("2020-03-23", "2020-05-24"),
2298
- # Lockdown 2
2299
- ("2020-11-05", "2020-12-02"),
2300
- # Lockdown 3
2301
- ("2021-01-04", "2021-03-08")
2302
- ]
2303
-
2304
- df_covid = pd.DataFrame(date_range, columns=["Date"])
2305
- df_covid["national_lockdown"] = 0
2306
-
2307
- for start, end in lockdown_periods:
2308
- df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
2309
-
2310
- df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2311
- df_covid.drop("Date", axis=1, inplace=True)
2312
- df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
2313
- df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
2314
- df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)
2315
-
2316
- df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
2317
- df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
2318
- df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
2319
-
2320
- df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
2321
- df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)
2322
-
2323
- df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
2324
- df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
2325
- df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)
2326
-
2327
- df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
2328
- df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)
2164
+ # ------------------------------------------------
2165
+ # 3. Public holidays (daily) and specific holiday columns
2166
+ # ------------------------------------------------
2167
+ for country in countries:
2168
+ country_holidays = holidays.CountryHoliday(
2169
+ country,
2170
+ years=range(int(start_date[:4]), datetime.today().year + 1)
2171
+ )
2172
+ # Daily indicator: 1 if that date is a holiday
2173
+ df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
2174
+ lambda x: 1 if x in country_holidays else 0
2175
+ )
2176
+ # Create columns for specific holiday names
2177
+ for date_hol, name in country_holidays.items():
2178
+ col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
2179
+ if col_name not in df_daily.columns:
2180
+ df_daily[col_name] = 0
2181
+ df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
2182
+
2183
+ # ------------------------------------------------
2184
+ # 4. Add daily indicators for last day & last Friday of month
2185
+ # Then aggregate them to weekly level using .max()
2186
+ # ------------------------------------------------
2187
+ # Last day of month (daily)
2188
+ df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
2189
+ lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
2190
+ )
2329
2191
 
2330
- df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
2331
- df_final_covid.reset_index(inplace=True)
2332
- df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
2333
-
2334
- # Create seasonal indicators for the last day and last Friday of the month
2335
- min_date = '2019-12-29'
2336
- max_date = datetime.today().strftime('%Y-%m-%d')
2337
- date_range_seas = pd.date_range(start=min_date, end=max_date)
2338
-
2339
- df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
2340
- df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
2341
-
2192
+ # Last Friday of month (daily)
2342
2193
  def is_last_friday(date):
2343
- last_day_of_month = date.to_period('M').to_timestamp('M')
2194
+ # last day of the month
2195
+ last_day_of_month = date.to_period("M").to_timestamp("M")
2344
2196
  last_day_weekday = last_day_of_month.dayofweek
2197
+ # Determine how many days we go back from the last day to get Friday
2345
2198
  if last_day_weekday >= 4:
2346
2199
  days_to_subtract = last_day_weekday - 4
2347
2200
  else:
2348
2201
  days_to_subtract = last_day_weekday + 3
2349
2202
  last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
2350
2203
  return 1 if date == last_friday else 0
2204
+
2205
+ df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
2206
+
2207
+ # ------------------------------------------------
2208
+ # 5. Weekly aggregation for HOLIDAYS & monthly dummies
2209
+ # (Using .max() for holiday indicators so they become binary)
2210
+ # ------------------------------------------------
2211
+ # For monthly dummies, create a daily col "Month", then get_dummies
2212
+ df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
2213
+ df_monthly_dummies = pd.get_dummies(
2214
+ df_daily,
2215
+ prefix="seas",
2216
+ columns=["Month"],
2217
+ dtype=int
2218
+ )
2219
+ # Recalculate 'week_start' (already in df_daily, but just to be sure)
2220
+ df_monthly_dummies['week_start'] = df_daily['week_start']
2221
+
2222
+ # Group monthly dummies by .sum() or .mean()—often we average across the week
2223
+ df_monthly_dummies = (
2224
+ df_monthly_dummies
2225
+ .groupby('week_start')
2226
+ .sum(numeric_only=True) # sum the daily flags
2227
+ .reset_index()
2228
+ .rename(columns={'week_start': "Date"})
2229
+ .set_index("Date")
2230
+ )
2231
+ # Divide the monthly dummy columns by 7 to spread them across the week
2232
+ monthly_cols = [
2233
+ c for c in df_monthly_dummies.columns
2234
+ if c.startswith("seas_month_")
2235
+ ]
2236
+ df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
2237
+
2238
+ # Group holiday columns (and last-day-of-month columns) by .max() => binary
2239
+ df_holidays = (
2240
+ df_daily
2241
+ .groupby('week_start')
2242
+ .max(numeric_only=True) # use max => if any day=1, entire week=1
2243
+ .reset_index()
2244
+ .rename(columns={'week_start': "Date"})
2245
+ .set_index("Date")
2246
+ )
2351
2247
 
2352
- df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)
2248
+ # ------------------------------------------------
2249
+ # 6. Combine weekly start, monthly dummies, holiday flags
2250
+ # ------------------------------------------------
2251
+ df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
2252
+ df_combined = pd.concat([df_combined, df_holidays], axis=1)
2253
+ df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
2254
+
2255
+ # ------------------------------------------------
2256
+ # 7. Create weekly dummies for Week of Year & yearly dummies
2257
+ # ------------------------------------------------
2258
+ df_combined.reset_index(inplace=True)
2259
+ df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
2353
2260
 
2354
- df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2355
- df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
2356
- df_seas.set_index("Date", inplace=True)
2261
+ df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
2262
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)
2357
2263
 
2358
- # Combine all dataframes
2359
- df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
2360
- df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
2361
- df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
2362
-
2363
- # Fill any NaN values with 0
2364
- df_final_combined.fillna(0, inplace=True)
2264
+ df_combined["Year"] = df_combined["Date"].dt.year
2265
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
2266
+
2267
+ # ------------------------------------------------
2268
+ # 8. Add constant & trend
2269
+ # ------------------------------------------------
2270
+ df_combined["Constant"] = 1
2271
+ df_combined["Trend"] = df_combined.index + 1
2365
2272
 
2366
- return df_final_combined
2273
+ # ------------------------------------------------
2274
+ # 9. Rename Date -> OBS and return
2275
+ # ------------------------------------------------
2276
+ df_combined.rename(columns={"Date": "OBS"}, inplace=True)
2277
+
2278
+ return df_combined
2367
2279
 
2368
2280
  def pull_weather(self, week_commencing, country) -> pd.DataFrame:
2369
2281
  import pandas as pd
@@ -2966,4 +2878,240 @@ class datapull:
2966
2878
 
2967
2879
  final_weather = ims_proc.rename_cols(merged_df, 'seas_')
2968
2880
 
2969
- return final_weather
2881
+ return final_weather
2882
+
2883
+ def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
2884
+ """
2885
+ Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
2886
+ aggregates it to weekly averages, and renames variables based on specified rules.
2887
+
2888
+ Parameters:
2889
+ cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
2890
+ week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
2891
+ sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
2892
+
2893
+ Returns:
2894
+ pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
2895
+ and all series as renamed columns.
2896
+ """
2897
+ # Define CDIDs for sectors and defaults
2898
+ sector_cdids = {
2899
+ "fast_food": ["L7TD", "L78Q", "DOAD"],
2900
+ "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
2901
+ }
2902
+
2903
+ default_cdids = sector_cdids["default"]
2904
+ sector_specific_cdids = sector_cdids.get(sector, [])
2905
+ standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
2906
+
2907
+ # Combine standard CDIDs and additional CDIDs
2908
+ if cdid_list is None:
2909
+ cdid_list = []
2910
+ cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
2911
+
2912
+ base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
2913
+ base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
2914
+ combined_df = pd.DataFrame()
2915
+
2916
+ # Map week start day to pandas weekday convention
2917
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
2918
+ if week_start_day not in days_map:
2919
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
2920
+ week_start = days_map[week_start_day]
2921
+
2922
+ for cdid in cdid_list:
2923
+ try:
2924
+ # Search for the series
2925
+ search_url = f"{base_search_url}{cdid}"
2926
+ search_response = requests.get(search_url)
2927
+ search_response.raise_for_status()
2928
+ search_data = search_response.json()
2929
+
2930
+ items = search_data.get("items", [])
2931
+ if not items:
2932
+ print(f"No data found for CDID: {cdid}")
2933
+ continue
2934
+
2935
+ # Extract series name and latest release URI
2936
+ series_name = items[0].get("title", f"Series_{cdid}")
2937
+ latest_date = max(
2938
+ datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
2939
+ for item in items if "release_date" in item
2940
+ )
2941
+ latest_uri = next(
2942
+ item["uri"] for item in items
2943
+ if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
2944
+ )
2945
+
2946
+ # Fetch the dataset
2947
+ data_url = f"{base_data_url}{latest_uri}"
2948
+ data_response = requests.get(data_url)
2949
+ data_response.raise_for_status()
2950
+ data_json = data_response.json()
2951
+
2952
+ # Detect the frequency and process accordingly
2953
+ if "months" in data_json and data_json["months"]:
2954
+ frequency_key = "months"
2955
+ elif "quarters" in data_json and data_json["quarters"]:
2956
+ frequency_key = "quarters"
2957
+ elif "years" in data_json and data_json["years"]:
2958
+ frequency_key = "years"
2959
+ else:
2960
+ print(f"Unsupported frequency or no data for CDID: {cdid}")
2961
+ continue
2962
+
2963
+ # Prepare the DataFrame
2964
+ df = pd.DataFrame(data_json[frequency_key])
2965
+
2966
+ # Parse the 'date' field based on frequency
2967
+ if frequency_key == "months":
2968
+ df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
2969
+ elif frequency_key == "quarters":
2970
+ def parse_quarter(quarter_str):
2971
+ year, qtr = quarter_str.split(" Q")
2972
+ month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
2973
+ return datetime(int(year), month, 1)
2974
+ df["date"] = df["date"].apply(parse_quarter)
2975
+ elif frequency_key == "years":
2976
+ df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
2977
+
2978
+ df["value"] = pd.to_numeric(df["value"], errors="coerce")
2979
+ df.rename(columns={"value": series_name}, inplace=True)
2980
+
2981
+ # Combine data
2982
+ df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
2983
+ if combined_df.empty:
2984
+ combined_df = df
2985
+ else:
2986
+ combined_df = pd.merge(combined_df, df, on="date", how="outer")
2987
+
2988
+ except requests.exceptions.RequestException as e:
2989
+ print(f"Error fetching data for CDID {cdid}: {e}")
2990
+ except (KeyError, ValueError) as e:
2991
+ print(f"Error processing data for CDID {cdid}: {e}")
2992
+
2993
+ if not combined_df.empty:
2994
+ min_date = combined_df["date"].min()
2995
+ max_date = datetime.today()
2996
+ date_range = pd.date_range(start=min_date, end=max_date, freq='D')
2997
+ daily_df = pd.DataFrame(date_range, columns=['date'])
2998
+ daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
2999
+ daily_df = daily_df.ffill()
3000
+
3001
+ # Aggregate to weekly frequency
3002
+ daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
3003
+ weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
3004
+
3005
+ def clean_column_name(name):
3006
+ name = re.sub(r"\(.*?\)", "", name)
3007
+ name = re.split(r":", name)[0]
3008
+ name = re.sub(r"\d+", "", name)
3009
+ name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
3010
+ name = re.sub(r"[^\w\s]", "", name)
3011
+ name = name.replace(" ", "_")
3012
+ name = re.sub(r"_+", "_", name)
3013
+ name = name.rstrip("_")
3014
+ return f"macro_{name.lower()}_uk"
3015
+
3016
+ weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
3017
+ weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
3018
+
3019
+ weekly_df = weekly_df.fillna(0)
3020
+
3021
+ return weekly_df
3022
+ else:
3023
+ print("No data available to process.")
3024
+ return pd.DataFrame()
3025
+
3026
+ def pull_yfinance(self, tickers=None, week_start_day="mon"):
3027
+ """
3028
+ Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
3029
+ aggregates it to weekly averages, and renames variables.
3030
+
3031
+ Parameters:
3032
+ tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
3033
+ week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
3034
+
3035
+ Returns:
3036
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
3037
+ and aggregated stock data for the specified tickers, with NaN values filled with 0.
3038
+ """
3039
+ # Define default tickers
3040
+ default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
3041
+
3042
+ # Combine default tickers with additional ones
3043
+ if tickers is None:
3044
+ tickers = []
3045
+ tickers = list(set(default_tickers + tickers)) # Ensure no duplicates
3046
+
3047
+ # Automatically set end_date to today
3048
+ end_date = datetime.today().strftime("%Y-%m-%d")
3049
+
3050
+ # Mapping week start day to pandas weekday convention
3051
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
3052
+ if week_start_day not in days_map:
3053
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
3054
+ week_start = days_map[week_start_day]
3055
+
3056
+ # Fetch data for all tickers without specifying a start date to get all available data
3057
+ data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
3058
+
3059
+ # Process the data
3060
+ combined_df = pd.DataFrame()
3061
+ for ticker in tickers:
3062
+ try:
3063
+ # Extract the ticker's data
3064
+ ticker_data = data[ticker] if len(tickers) > 1 else data
3065
+ ticker_data = ticker_data.reset_index()
3066
+
3067
+ # Ensure necessary columns are present
3068
+ if "Close" not in ticker_data.columns:
3069
+ raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
3070
+
3071
+ # Keep only relevant columns
3072
+ ticker_data = ticker_data[["Date", "Close"]]
3073
+ ticker_data.rename(columns={"Close": ticker}, inplace=True)
3074
+
3075
+ # Merge data
3076
+ if combined_df.empty:
3077
+ combined_df = ticker_data
3078
+ else:
3079
+ combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")
3080
+
3081
+ except KeyError:
3082
+ print(f"Data for ticker {ticker} not available.")
3083
+ except Exception as e:
3084
+ print(f"Error processing ticker {ticker}: {e}")
3085
+
3086
+ if not combined_df.empty:
3087
+ # Convert to daily frequency
3088
+ combined_df["Date"] = pd.to_datetime(combined_df["Date"])
3089
+ combined_df.set_index("Date", inplace=True)
3090
+
3091
+ # Fill missing dates
3092
+ min_date = combined_df.index.min()
3093
+ max_date = combined_df.index.max()
3094
+ daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
3095
+ combined_df = combined_df.reindex(daily_index)
3096
+ combined_df.index.name = "Date"
3097
+ combined_df = combined_df.ffill()
3098
+
3099
+ # Aggregate to weekly frequency
3100
+ combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
3101
+ weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
3102
+
3103
+ # Fill NaN values with 0
3104
+ weekly_df = weekly_df.fillna(0)
3105
+
3106
+ # Clean column names
3107
+ def clean_column_name(name):
3108
+ name = re.sub(r"[^\w\s]", "", name)
3109
+ return f"macro_{name.lower()}"
3110
+
3111
+ weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
3112
+
3113
+ return weekly_df
3114
+
3115
+ else:
3116
+ print("No data available to process.")
3117
+ return pd.DataFrame()
@@ -1,10 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 0.6.3.2
3
+ Version: 0.8.1
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
7
- License: MIT
8
7
  Keywords: python,data processing,apis
9
8
  Classifier: Development Status :: 3 - Alpha
10
9
  Classifier: Intended Audience :: Developers
@@ -20,93 +19,113 @@ Requires-Dist: fredapi
20
19
  Requires-Dist: requests-cache
21
20
  Requires-Dist: geopy
22
21
  Requires-Dist: bs4
22
+ Requires-Dist: yfinance
23
+ Requires-Dist: holidays
23
24
 
24
25
  # IMS Package Documentation
25
26
 
26
- The IMS package is a python library for processing incoming data into a format that can be used for specifically for econometrics projects that use weekly timeseries data. IMS processing offers a variety of functions to manipulate and analyze data efficiently. Here are the functionalities provided by the package:
27
+ The **IMSciences package** is a Python library designed to process incoming data into a format tailored for econometrics projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data manipulation and analysis.
27
28
 
28
- ## Data Processing
29
+ ---
30
+
31
+ ## Key Features
32
+ - Seamless data processing for econometrics workflows.
33
+ - Aggregation, filtering, and transformation of time series data.
34
+ - Integration with external data sources such as FRED, Bank of England, ONS, OECD, and Yahoo Finance.
35
+
36
+ ---
37
+
38
+ Table of Contents
39
+ =================
29
40
 
30
- # Function Descriptions and Usage Examples
41
+ 1. [Data Processing](#data-processing)
42
+ 2. [Data Pulling](#data-pulling)
43
+ 3. [Installation](#installation)
44
+ 4. [Usage](#usage)
45
+ 5. [License](#license)
46
+
47
+ ---
31
48
 
32
- ## 1. `get_wd_levels`
49
+ ## Data Processing
50
+
51
+ ## 1. get_wd_levels
33
52
  - **Description**: Get the working directory with the option of moving up parents.
34
53
  - **Usage**: `get_wd_levels(levels)`
35
54
  - **Example**: `get_wd_levels(0)`
36
55
 
37
56
  ---
38
57
 
39
- ## 2. `remove_rows`
58
+ ## 2. remove_rows
40
59
  - **Description**: Removes a specified number of rows from a pandas DataFrame.
41
60
  - **Usage**: `remove_rows(data_frame, num_rows_to_remove)`
42
61
  - **Example**: `remove_rows(df, 2)`
43
62
 
44
63
  ---
45
64
 
46
- ## 3. `aggregate_daily_to_wc_long`
65
+ ## 3. aggregate_daily_to_wc_long
47
66
  - **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
48
67
  - **Usage**: `aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')`
49
68
  - **Example**: `aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')`
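For a concrete picture of the call above, here is a minimal sketch on a toy daily dataset (the column names and values are hypothetical; the functions are called as methods on a `dataprocessing` instance, as in the Usage section below):

```python
import pandas as pd
from imsciences import dataprocessing

ims = dataprocessing()

# Toy daily data: two platforms, 14 days each of cost and clicks
daily = pd.DataFrame({
    "date": list(pd.date_range("2024-01-01", periods=14, freq="D")) * 2,
    "platform": ["facebook"] * 14 + ["google"] * 14,
    "cost": range(28),
    "clicks": range(28),
})

# Aggregate to Monday-commencing weeks, summing cost and clicks per platform
weekly = ims.aggregate_daily_to_wc_long(daily, "date", ["platform"], ["cost", "clicks"], "mon")
print(weekly.head())
```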
50
69
 
51
70
  ---
52
71
 
53
- ## 4. `convert_monthly_to_daily`
72
+ ## 4. convert_monthly_to_daily
54
73
  - **Description**: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.
55
74
  - **Usage**: `convert_monthly_to_daily(df, date_column, divide)`
56
75
  - **Example**: `convert_monthly_to_daily(df, 'date')`
57
76
 
58
77
  ---
59
78
 
60
- ## 5. `plot_two`
79
+ ## 5. plot_two
61
80
  - **Description**: Plots specified columns from two different DataFrames using a shared date column. Useful for comparing data.
62
81
  - **Usage**: `plot_two(df1, col1, df2, col2, date_column, same_axis=True)`
63
82
  - **Example**: `plot_two(df1, 'cost', df2, 'cost', 'obs', True)`
64
83
 
65
84
  ---
66
85
 
67
- ## 6. `remove_nan_rows`
86
+ ## 6. remove_nan_rows
68
87
  - **Description**: Removes rows from a DataFrame where the specified column has NaN values.
69
88
  - **Usage**: `remove_nan_rows(df, col_to_remove_rows)`
70
89
  - **Example**: `remove_nan_rows(df, 'date')`
71
90
 
72
91
  ---
73
92
 
74
- ## 7. `filter_rows`
93
+ ## 7. filter_rows
75
94
  - **Description**: Filters the DataFrame based on whether the values in a specified column are in a provided list.
76
95
  - **Usage**: `filter_rows(df, col_to_filter, list_of_filters)`
77
96
  - **Example**: `filter_rows(df, 'country', ['UK', 'IE'])`
78
97
 
79
98
  ---
80
99
 
81
- ## 8. `plot_one`
100
+ ## 8. plot_one
82
101
  - **Description**: Plots a specified column from a DataFrame.
83
102
  - **Usage**: `plot_one(df1, col1, date_column)`
84
103
  - **Example**: `plot_one(df, 'Spend', 'OBS')`
85
104
 
86
105
  ---
87
106
 
88
- ## 9. `week_of_year_mapping`
107
+ ## 9. week_of_year_mapping
89
108
  - **Description**: Converts a week column in `yyyy-Www` or `yyyy-ww` format to week commencing date.
90
109
  - **Usage**: `week_of_year_mapping(df, week_col, start_day_str)`
91
110
  - **Example**: `week_of_year_mapping(df, 'week', 'mon')`
92
111
 
93
112
  ---
94
113
 
95
- ## 10. `exclude_rows`
114
+ ## 10. exclude_rows
96
115
  - **Description**: Removes rows from a DataFrame based on whether the values in a specified column are not in a provided list.
97
116
  - **Usage**: `exclude_rows(df, col_to_filter, list_of_filters)`
98
117
  - **Example**: `exclude_rows(df, 'week', ['2022-W20', '2022-W21'])`
99
118
 
100
119
  ---
101
120
 
102
- ## 11. `rename_cols`
121
+ ## 11. rename_cols
103
122
  - **Description**: Renames columns in a pandas DataFrame.
104
123
  - **Usage**: `rename_cols(df, name)`
105
124
  - **Example**: `rename_cols(df, 'ame_facebook')`
106
125
 
107
126
  ---
108
127
 
109
- ## 12. `merge_new_and_old`
128
+ ## 12. merge_new_and_old
110
129
  - **Description**: Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
111
130
  - Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.
112
131
  - **Usage**: `merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')`
@@ -114,21 +133,21 @@ The IMS package is a python library for processing incoming data into a format t
114
133
 
115
134
  ---
116
135
 
117
- ## 13. `merge_dataframes_on_date`
136
+ ## 13. merge_dataframes_on_date
118
137
  - **Description**: Merge a list of DataFrames on a common column.
119
138
  - **Usage**: `merge_dataframes_on_date(dataframes, common_column='OBS', merge_how='outer')`
120
139
  - **Example**: `merge_dataframes_on_date([df1, df2, df3], common_column='OBS', merge_how='outer')`
121
140
 
122
141
  ---
123
142
 
124
- ## 14. `merge_and_update_dfs`
143
+ ## 14. merge_and_update_dfs
125
144
  - **Description**: Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available, and returns a dataframe sorted by the key column.
126
145
  - **Usage**: `merge_and_update_dfs(df1, df2, key_column)`
127
146
  - **Example**: `merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')`
128
147
 
129
148
  ---
130
149
 
131
- ## 15. `convert_us_to_uk_dates`
150
+ ## 15. convert_us_to_uk_dates
132
151
  - **Description**: Convert a DataFrame column with mixed date formats to datetime.
133
152
  - **Usage**: `convert_us_to_uk_dates(df, date_col)`
134
153
  - **Example**: `convert_us_to_uk_dates(df, 'date')`
@@ -142,189 +161,189 @@ The IMS package is a python library for processing incoming data into a format t
142
161
 
143
162
  ---
144
163
 
145
- ## 17. `pivot_table`
164
+ ## 17. pivot_table
146
165
  - **Description**: Dynamically pivots a DataFrame based on specified columns.
147
166
  - **Usage**: `pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')`
148
167
  - **Example**: `pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1', 'OBS': ' >= datetime(2019,9,9)', 'Metric Short Names': ' == spd'}, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=True, fill_missing_weekly_dates=True, week_commencing='W-MON')`
149
168
 
150
169
  ---
151
170
 
152
- ## 18. `apply_lookup_table_for_columns`
171
+ ## 18. apply_lookup_table_for_columns
153
172
  - **Description**: Equivalent of XLOOKUP in Excel. Allows mapping of a dictionary of substrings within a column.
154
173
  - **Usage**: `apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')`
155
174
  - **Example**: `apply_lookup_table_for_columns(df, col_names, {'spend': 'spd', 'clicks': 'clk'}, if_not_in_dict='Other', new_column_name='Metrics Short')`
156
175
 
157
176
  ---
158
177
 
159
- ## 19. `aggregate_daily_to_wc_wide`
178
+ ## 19. aggregate_daily_to_wc_wide
160
179
  - **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
161
180
  - **Usage**: `aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc, aggregation='sum', include_totals=False)`
162
181
  - **Example**: `aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average', True)`
163
182
 
164
183
  ---
165
184
 
166
- ## 20. `merge_cols_with_seperator`
185
+ ## 20. merge_cols_with_seperator
167
186
  - **Description**: Merges multiple columns in a DataFrame into one column with a separator `_`. Useful for lookup tables.
168
187
  - **Usage**: `merge_cols_with_seperator(df, col_names, seperator='_', output_column_name='Merged', starting_prefix_str=None, ending_prefix_str=None)`
169
188
  - **Example**: `merge_cols_with_seperator(df, ['Campaign', 'Product'], seperator='|', output_column_name='Merged Columns', starting_prefix_str='start_', ending_prefix_str='_end')`
170
189
 
171
190
  ---
172
191
 
173
- ## 21. `check_sum_of_df_cols_are_equal`
192
+ ## 21. check_sum_of_df_cols_are_equal
174
193
  - **Description**: Checks if the sum of two columns in two DataFrames are the same, and provides the sums and differences.
175
194
  - **Usage**: `check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)`
176
195
  - **Example**: `check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')`
177
196
 
178
197
  ---
179
198
 
180
- ## 22. `convert_2_df_cols_to_dict`
199
+ ## 22. convert_2_df_cols_to_dict
181
200
  - **Description**: Creates a dictionary using two columns in a DataFrame.
182
201
  - **Usage**: `convert_2_df_cols_to_dict(df, key_col, value_col)`
183
202
  - **Example**: `convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')`
184
203
 
185
204
  ---
186
205
 
187
- ## 23. `create_FY_and_H_columns`
206
+ ## 23. create_FY_and_H_columns
188
207
  - **Description**: Creates financial year, half-year, and financial half-year columns.
189
208
  - **Usage**: `create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')`
190
209
  - **Example**: `create_FY_and_H_columns(df, 'Week (M-S)', '2022-10-03', 'FY2023', short_format='Yes', half_years='Yes', combined_FY_and_H='Yes')`
191
210
 
192
211
  ---
193
212
 
194
- ## 24. `keyword_lookup_replacement`
213
+ ## 24. keyword_lookup_replacement
195
214
  - **Description**: Updates chosen values in a specified column of the DataFrame based on a lookup dictionary.
196
215
  - **Usage**: `keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')`
197
216
  - **Example**: `keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment', 'product'], qlik_dict_for_channel, output_column_name='Channel New')`
198
217
 
199
218
  ---
200
219
 
201
- ## 25. `create_new_version_of_col_using_LUT`
220
+ ## 25. create_new_version_of_col_using_LUT
202
221
  - **Description**: Creates a new column in a DataFrame by mapping values from an old column using a lookup table.
203
222
  - **Usage**: `create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')`
204
223
  - **Example**: `create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', search_campaign_name_retag_lut, 'Campaign Name New')`
205
224
 
206
225
  ---
207
226
 
208
- ## 26. `convert_df_wide_2_long`
227
+ ## 26. convert_df_wide_2_long
209
228
  - **Description**: Converts a DataFrame from wide to long format.
210
229
  - **Usage**: `convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')`
211
230
  - **Example**: `convert_df_wide_2_long(df, ['Media Cost', 'Impressions', 'Clicks'], variable_col_name='Metric')`
212
231
 
213
232
  ---
214
233
 
215
- ## 27. `manually_edit_data`
234
+ ## 27. manually_edit_data
216
235
  - **Description**: Enables manual updates to DataFrame cells by applying filters and editing a column.
217
236
  - **Usage**: `manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)`
218
237
  - **Example**: `manually_edit_data(df, {'OBS': ' <= datetime(2023,1,23)', 'File_Name': ' == France media'}, 'Master Include', 1, change_in_existing_df_col='Yes', new_col_to_change_name='Master Include', manual_edit_col_name='Manual Changes')`
219
238
 
220
239
  ---
221
240
 
222
- ## 28. `format_numbers_with_commas`
241
+ ## 28. format_numbers_with_commas
223
242
  - **Description**: Formats numeric data into numbers with commas and specified decimal places.
224
243
  - **Usage**: `format_numbers_with_commas(df, decimal_length_chosen=2)`
225
244
  - **Example**: `format_numbers_with_commas(df, 1)`
226
245
 
227
246
  ---
228
247
 
229
- ## 29. `filter_df_on_multiple_conditions`
248
+ ## 29. filter_df_on_multiple_conditions
230
249
  - **Description**: Filters a DataFrame based on multiple conditions from a dictionary.
231
250
  - **Usage**: `filter_df_on_multiple_conditions(df, filters_dict)`
232
251
  - **Example**: `filter_df_on_multiple_conditions(df, {'OBS': ' <= datetime(2023,1,23)', 'File_Name': ' == France media'})`
233
252
 
234
253
  ---
235
254
 
236
- ## 30. `read_and_concatenate_files`
255
+ ## 30. read_and_concatenate_files
237
256
  - **Description**: Reads and concatenates all files of a specified type in a folder.
238
257
  - **Usage**: `read_and_concatenate_files(folder_path, file_type='csv')`
239
258
  - **Example**: `read_and_concatenate_files(folder_path, file_type='csv')`
240
259
 
241
260
  ---
242
261
 
243
- ## 31. `remove_zero_values`
262
+ ## 31. remove_zero_values
244
263
  - **Description**: Removes rows with zero values in a specified column.
245
264
  - **Usage**: `remove_zero_values(data_frame, column_to_filter)`
246
265
  - **Example**: `remove_zero_values(df, 'Funeral_Delivery')`
247
266
 
248
267
  ---
249
268
 
250
- ## 32. `upgrade_outdated_packages`
269
+ ## 32. upgrade_outdated_packages
251
270
  - **Description**: Upgrades all outdated packages in the environment.
252
271
  - **Usage**: `upgrade_outdated_packages()`
253
272
  - **Example**: `upgrade_outdated_packages()`
254
273
 
255
274
  ---
256
275
 
257
- ## 33. `convert_mixed_formats_dates`
276
+ ## 33. convert_mixed_formats_dates
258
277
  - **Description**: Converts a mix of US and UK date formats to datetime.
259
278
  - **Usage**: `convert_mixed_formats_dates(df, date_col)`
260
279
  - **Example**: `convert_mixed_formats_dates(df, 'OBS')`
261
280
 
262
281
  ---
263
282
 
264
- ## 34. `fill_weekly_date_range`
283
+ ## 34. fill_weekly_date_range
265
284
  - **Description**: Fills in missing weeks with zero values.
266
285
  - **Usage**: `fill_weekly_date_range(df, date_column, freq)`
267
286
  - **Example**: `fill_weekly_date_range(df, 'OBS', 'W-MON')`
268
287
 
269
288
  ---
270
289
 
271
- ## 35. `add_prefix_and_suffix`
290
+ ## 35. add_prefix_and_suffix
272
291
  - **Description**: Adds prefixes and/or suffixes to column headers.
273
292
  - **Usage**: `add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)`
274
293
  - **Example**: `add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')`
275
294
 
276
295
  ---
277
296
 
278
- ## 36. `create_dummies`
297
+ ## 36. create_dummies
279
298
  - **Description**: Converts time series into binary indicators based on a threshold.
280
299
  - **Usage**: `create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')`
281
300
  - **Example**: `create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')`
282
301
 
283
302
  ---
284
303
 
285
- ## 37. `replace_substrings`
304
+ ## 37. replace_substrings
286
305
  - **Description**: Replaces substrings in a column of strings using a dictionary and can change column values to lowercase.
287
306
  - **Usage**: `replace_substrings(df, column, replacements, to_lower=False, new_column=None)`
288
307
  - **Example**: `replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')`
289
308
 
290
309
  ---
291
310
 
292
- ## 38. `add_total_column`
311
+ ## 38. add_total_column
293
312
  - **Description**: Sums all columns (excluding a specified column) to create a total column.
294
313
  - **Usage**: `add_total_column(df, exclude_col=None, total_col_name='Total')`
295
314
  - **Example**: `add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')`
296
315
 
297
316
  ---
298
317
 
299
- ## 39. `apply_lookup_table_based_on_substring`
318
+ ## 39. apply_lookup_table_based_on_substring
300
319
  - **Description**: Maps substrings in a column to values using a lookup dictionary.
301
320
  - **Usage**: `apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')`
302
321
  - **Example**: `apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')`
303
322
 
304
323
  ---
305
324
 
306
- ## 40. `compare_overlap`
325
+ ## 40. compare_overlap
307
326
  - **Description**: Compares matching rows and columns in two DataFrames and outputs the differences.
308
327
  - **Usage**: `compare_overlap(df1, df2, date_col)`
309
328
  - **Example**: `compare_overlap(df_1, df_2, 'obs')`
310
329
 
311
330
  ---
312
331
 
313
- ## 41. `week_commencing_2_week_commencing_conversion`
332
+ ## 41. week_commencing_2_week_commencing_conversion
314
333
  - **Description**: Converts a week commencing column to a different start day.
315
334
  - **Usage**: `week_commencing_2_week_commencing_conversion(df, date_col, week_commencing='sun')`
316
335
  - **Example**: `week_commencing_2_week_commencing_conversion(df, 'obs', week_commencing='mon')`
317
336
 
318
337
  ---
319
338
 
320
- ## 42. `plot_chart`
339
+ ## 42. plot_chart
321
340
  - **Description**: Plots various chart types including line, area, scatter, and bar.
322
341
  - **Usage**: `plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs)`
323
342
  - **Example**: `plot_chart(df, 'obs', df.cols, chart_type='line', title='Spend Over Time', x_title='Date', y_title='Spend')`
324
343
 
325
344
  ---
326
345
 
327
- ## 43. `plot_two_with_common_cols`
346
+ ## 43. plot_two_with_common_cols
328
347
  - **Description**: Plots charts for two DataFrames based on common column names.
329
348
  - **Usage**: `plot_two_with_common_cols(df1, df2, date_column, same_axis=True)`
330
349
  - **Example**: `plot_two_with_common_cols(df_1, df_2, date_column='obs')`
@@ -333,51 +352,82 @@ The IMS package is a python library for processing incoming data into a format t
333
352
 
334
353
  ## Data Pulling
335
354
 
336
- ## 1. `pull_fred_data`
355
+ ## 1. pull_fred_data
337
356
  - **Description**: Fetch data from FRED using series ID tokens.
338
- - **Usage**: `pull_fred_data(week_commencing, series_id_list)`
339
- - **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`
357
+ - **Usage**: pull_fred_data(week_commencing, series_id_list)
358
+ - **Example**: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])
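As a quick illustration, a minimal sketch of the call above (assuming network access to the FRED API; the series IDs are the ones used in this README):

```python
from imsciences import datapull

pull = datapull()

# Weekly FRED series, aggregated to Monday-commencing weeks
fred_df = pull.pull_fred_data("mon", ["GPDIC1", "GCEC1"])
print(fred_df.head())
```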
340
359
 
341
360
  ---
342
361
 
343
- ## 2. `pull_boe_data`
362
+ ## 2. pull_boe_data
344
363
  - **Description**: Fetch and process Bank of England interest rate data.
345
- - **Usage**: `pull_boe_data(week_commencing)`
346
- - **Example**: `pull_boe_data('mon')`
364
+ - **Usage**: pull_boe_data(week_commencing)
365
+ - **Example**: pull_boe_data('mon')
347
366
 
348
367
  ---
349
368
 
350
- ## 3. `pull_ons_data`
351
- - **Description**: Fetch and process time series data from the ONS API.
352
- - **Usage**: `pull_ons_data(series_list, week_commencing)`
353
- - **Example**: `pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')`
354
-
355
- ---
356
-
357
- ## 4. `pull_oecd`
369
+ ## 3. pull_oecd
358
370
  - **Description**: Fetch macroeconomic data from OECD for a specified country.
359
- - **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='1950-01-01')`
360
- - **Example**: `pull_oecd('GBR', 'mon', '1950-01-01')`
371
+ - **Usage**: pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')
372
+ - **Example**: pull_oecd('GBR', 'mon', '2000-01-01')
361
373
 
362
374
  ---
363
375
 
364
- ## 5. `get_google_mobility_data`
376
+ ## 4. get_google_mobility_data
365
377
  - **Description**: Fetch Google Mobility data for the specified country.
366
- - **Usage**: `get_google_mobility_data(country, wc)`
367
- - **Example**: `get_google_mobility_data('United Kingdom', 'mon')`
378
+ - **Usage**: get_google_mobility_data(country, wc)
379
+ - **Example**: get_google_mobility_data('United Kingdom', 'mon')
368
380
 
369
381
  ---
370
382
 
371
- ## 6. `pull_combined_dummies`
383
+ ## 5. pull_seasonality
372
384
  - **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
373
- - **Usage**: `pull_combined_dummies(week_commencing)`
374
- - **Example**: `pull_combined_dummies('mon')`
385
+ - **Usage**: pull_seasonality(week_commencing, start_date, countries)
386
+ - **Example**: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])
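A minimal sketch of pulling the seasonality dummies for a UK model (assuming the signature above; country codes follow the `holidays` package conventions, e.g. 'GB' and 'US'):

```python
from imsciences import datapull

pull = datapull()

# Weekly dummies (week-of-year, month, year, public holidays, constant and trend) from 2020 onwards
seas = pull.pull_seasonality("mon", "2020-01-01", ["GB"])
print(seas.filter(like="seas_").head())
```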
375
387
 
376
388
  ---
377
389
 
378
- ## 7. `pull_weather`
390
+ ## 6. pull_weather
379
391
  - **Description**: Fetch and process historical weather data for the specified country.
380
- - **Usage**: `pull_weather(week_commencing, country)`
381
- - **Example**: `pull_weather('mon', 'GBR')`
392
+ - **Usage**: pull_weather(week_commencing, country)
393
+ - **Example**: pull_weather('mon', 'GBR')
394
+
395
+ ---
396
+
397
+ ## 7. pull_macro_ons_uk
398
+ - **Description**: Fetch and process time series data from the Beta ONS API.
399
+ - **Usage**: pull_macro_ons_uk(cdid_list, week_start_day, sector)
400
+ - **Example**: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')
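For example, a short sketch (assuming the documented parameters; 'HBOI' is the extra CDID from the example above and 'fast_food' is one of the built-in sector presets):

```python
from imsciences import datapull

pull = datapull()

# Default macro CDIDs plus the fast_food preset and one extra series,
# forward-filled to daily and averaged to Monday-commencing weeks
ons = pull.pull_macro_ons_uk(["HBOI"], "mon", "fast_food")
print(ons.head())
```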
401
+
402
+ ---
403
+
404
+ ## 8. pull_yfinance
405
+ - **Description**: Fetch and process time series data from Yahoo Finance.
406
+ - **Usage**: pull_yfinance(tickers, week_start_day)
407
+ - **Example**: pull_yfinance(['^FTMC', '^IXIC'], 'mon')
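And a corresponding sketch for the Yahoo Finance pull (assuming the documented signature; the listed tickers are added on top of the built-in defaults such as '^FTSE' and 'GBPUSD=X'):

```python
from imsciences import datapull

pull = datapull()

# Closing prices for the default and extra tickers, averaged to Monday-commencing weeks
stocks = pull.pull_yfinance(["^FTMC", "^IXIC"], "mon")
print(stocks.head())
```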
408
+
409
+ ## Installation
410
+
411
+ Install the IMS package via pip:
412
+
413
+ ```bash
414
+ pip install imsciences
415
+ ```
416
+
417
+ ---
418
+
419
+ ## Usage
420
+
421
+ ```python
422
+ from imsciences import *
423
+ ims = dataprocessing()
424
+ ims_pull = datapull()
425
+ ```
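Putting the two classes together, a minimal end-to-end sketch (assuming network access and that each pull returns a week-commencing 'OBS' column, as the seasonality and Yahoo Finance pulls do) could look like this:

```python
from imsciences import dataprocessing, datapull

ims = dataprocessing()
ims_pull = datapull()

# Pull weekly macro, market and seasonality data, all week-commencing Monday
fred = ims_pull.pull_fred_data("mon", ["GPDIC1"])
stocks = ims_pull.pull_yfinance(["^FTMC"], "mon")
seas = ims_pull.pull_seasonality("mon", "2020-01-01", ["GB"])

# Join everything on the shared week-commencing date column
model_base = ims.merge_dataframes_on_date([fred, stocks, seas], common_column="OBS", merge_how="outer")
print(model_base.head())
```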
426
+
427
+ ---
428
+
429
+ ## License
430
+
431
+ This project is licensed under the MIT License.
382
432
 
383
433
  ---
@@ -1,17 +1,17 @@
1
1
  dataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
2
2
  dataprocessing/data-processing-functions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
3
3
  dataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
4
- imsciences/__init__.py,sha256=0IwH7R_2N8vimJJo2DLzIG1hq9ddn8gB6ijlLrQemZs,122
4
+ imsciences/__init__.py,sha256=7CfK2dMjPnBBw6I4st-20MdMlLjZULviFVXF2eMD9NI,80
5
5
  imsciences/datafunctions-IMS-24Ltp-3.py,sha256=3Snv-0iE_03StmyjtT-riOU9f4v8TaJWLoyZLJp6l8Y,141406
6
- imsciences/datafunctions.py,sha256=lvvodU8dZ9IN_GS7FYMuft9ZsQkD2BMIGQxLiN8GY7c,151557
6
+ imsciences/datafunctions.py,sha256=XrvJWWFh9gdKAoeIHee2nYi0Z0zPxmW3oB6ICnGTxYc,158444
7
7
  imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
8
8
  imsciences/unittesting.py,sha256=d9H5HN8y7oof59hqN9mGqkjulExqFd93BEW-X8w_Id8,58142
9
9
  imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
10
10
  imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
11
11
  imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
12
12
  imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
13
- imsciences-0.6.3.2.dist-info/METADATA,sha256=k22-OJm6rdvDU7mubqDGW1K9Z-inek4VCQ4HdAw51cA,16981
14
- imsciences-0.6.3.2.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
15
- imsciences-0.6.3.2.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
16
- imsciences-0.6.3.2.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
17
- imsciences-0.6.3.2.dist-info/RECORD,,
13
+ imsciences-0.8.1.dist-info/METADATA,sha256=sJK90uzVkH6KCDVM3hmkbRyGoXNmie8JMoCVLy4J7Fg,17785
14
+ imsciences-0.8.1.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
15
+ imsciences-0.8.1.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
16
+ imsciences-0.8.1.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
17
+ imsciences-0.8.1.dist-info/RECORD,,