imsciences 0.5.4.7__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +4 -1
- imsciences/datafunctions-IMS-24Ltp-3.py +2711 -0
- imsciences/datafunctions.py +2842 -170
- imsciences/datapull.py +374 -0
- imsciences/geo.py +195 -0
- imsciences/mmm.py +1415 -0
- imsciences/pull.py +1483 -0
- imsciences/unittesting.py +1064 -0
- imsciences/vis.py +196 -0
- imsciences-0.9.3.dist-info/LICENSE.txt +21 -0
- imsciences-0.9.3.dist-info/METADATA +330 -0
- imsciences-0.9.3.dist-info/PKG-INFO-IMS-24Ltp-3 +24 -0
- imsciences-0.9.3.dist-info/RECORD +22 -0
- {imsciences-0.5.4.7.dist-info → imsciences-0.9.3.dist-info}/WHEEL +1 -1
- imsciences-0.5.4.7.dist-info/METADATA +0 -95
- imsciences-0.5.4.7.dist-info/RECORD +0 -13
- {imsciences-0.5.4.7.dist-info → imsciences-0.9.3.dist-info}/top_level.txt +0 -0
imsciences/datapull.py
ADDED
@@ -0,0 +1,374 @@

```python
import pandas as pd
import calendar
import requests
import os
import plotly.express as px
import plotly.graph_objs as go
import numpy as np
import datetime
import re
import pandas as pd
from imsciences import *
from fredapi import Fred
import time
from datetime import datetime
from datafunctions import dataprocessing

class datapull:

    def pull_help(self):
        print("This is the help section. The functions in the package are as follows:")

        print("\n1. pull_fred_data")
        print(" - Description: Get data from FRED by using series id tokens.")
        print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
        print(" - Example: pull_fred_data('sun', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")

    ############################################################### MACRO ##########################################################################

    def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
        '''
        Parameters
        ----------
        week_commencing : str
            specify the day for the week commencing, the default is 'sun' (e.g., 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')

        series_id_list : list[str]
            provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
            ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]

        Returns
        ----------
        pd.DataFrame
            Return a data frame with FRED data according to the series IDs provided

        Example
        ----------
        pull_fred_data("mon", ["GCEC1", "SP500"])
        '''
        # Fred API
        fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')

        # Fetch the metadata for each series to get the full names
        series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}

        # Download data from series id list
        fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}

        # Data processing
        date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
        fred_series_df = pd.DataFrame(date_range)

        for series_id, series_data in fred_series.items():
            series_data = series_data.reset_index()
            series_data.columns = ['OBS', series_names[series_id]]  # Use the series name as the column header
            fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')

        # Handle duplicate columns
        for col in fred_series_df.columns:
            if '_x' in col:
                base_col = col.replace('_x', '')
                fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
                fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)

        # Ensure sum_columns are present in the DataFrame
        sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]

        # Aggregate results by week
        fred_df_final = dataprocessing.aggregate_daily_to_wc_wide(self, df=fred_series_df,
                                                                  date_column="OBS",
                                                                  group_columns=[],
                                                                  sum_columns=sum_columns,
                                                                  wc=week_commencing,
                                                                  aggregation="average")

        # Remove anything after the instance of any ':' in the column names and rename, except for 'OBS'
        fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]

        return fred_df_final
```
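A minimal usage sketch for the new `pull_fred_data` method, following the docstring example above. The import path reflects this wheel's layout (`imsciences/datapull.py`), and it assumes the module's own imports (e.g. `from datafunctions import dataprocessing`) resolve in the installed environment; the call itself needs network access to FRED.

```python
from imsciences.datapull import datapull  # module added in this release

dp = datapull()

# Weekly (week-commencing Monday) averages of two FRED series,
# using the series IDs from the docstring example above.
fred_df = dp.pull_fred_data("mon", ["GCEC1", "SP500"])
print(fred_df.head())
```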
imsciences/datapull.py (continued)

```python
    def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
        """
        Fetch and process Bank of England interest rate data.

        Args:
            week_commencing (str): The starting day of the week for aggregation.
                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
                Default is "sun".
            max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
            delay (int): Delay in seconds between retry attempts. Default is 5.

        Returns:
            pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
                The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                and 'macro_boe_intr_rate' contains the average interest rate for the week.
        """
        # Week commencing dictionary
        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

        # Function to fetch the data with retries
        def fetch_data_with_retries(url, max_retries, delay):
            for attempt in range(max_retries):
                try:
                    html_table = pd.read_html(url)[0]
                    return html_table
                except Exception as e:
                    print(f"Attempt {attempt + 1} failed: {e}")
                    if attempt < max_retries - 1:
                        time.sleep(delay)
                    else:
                        raise

        # Import HTML data from Bank of England rate
        url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
        html_table = fetch_data_with_retries(url, max_retries, delay)

        df = pd.DataFrame(html_table)
        df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)

        # Change date column to datetime and find the corresponding week to the date
        df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
        df.sort_values("OBS", axis=0, inplace=True)

        # Create a daily date range and find the week commencing for that day
        date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
        df_daily = pd.DataFrame(date_range, columns=["OBS"])

        # Adjust each date to the specified week commencing day
        df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))

        # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
        df_final = df_daily.merge(df, on='OBS', how="left")
        df_final["macro_boe_intr_rate"].ffill(inplace=True)

        # Group by the week start date and get the mean of the interest rates for each week
        df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()

        df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
        df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)

        return df_final
```
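A hedged sketch of calling `pull_boe_data`. The method scrapes the Bank of England Bank-Rate page, so it needs network access; the retry settings below are illustrative rather than the defaults shown in the signature.

```python
from imsciences.datapull import datapull

dp = datapull()

# Weekly average Bank Rate, week commencing Monday, with lighter retry
# settings than the defaults (max_retries=30, delay=5).
boe_df = dp.pull_boe_data(week_commencing="mon", max_retries=5, delay=2)
print(boe_df.tail())
```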
imsciences/datapull.py (continued)

```python
    def pull_ons_data(self, series_list, week_commencing):
        """
        Fetch and process time series data from the ONS API.

        Args:
            series_list (list): A list of dictionaries where each dictionary represents a time series.
                Each dictionary should have the keys 'series_id' and 'dataset_id'.
            week_commencing (str): The starting day of the week for aggregation.
                Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".

        Returns:
            pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
                commencing dates and other columns contain the aggregated time series values.
        """
        # Generate a date range from 1950-01-01 to today
        date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
        daily_df = pd.DataFrame(date_range, columns=['OBS'])

        # Keep track of the renamed value columns
        value_columns = []

        for series in series_list:
            series_id = series['series_id']
            dataset_id = series['dataset_id']

            # Construct the URL for data
            data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"

            # Make the request to the ONS API for data
            data_response = requests.get(data_url)

            # Check if the request was successful
            if data_response.status_code != 200:
                print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
                continue

            # Parse the JSON response for data
            data = data_response.json()

            # Attempt to extract the name of the time series from the data response
            series_name = data.get('description', {}).get('title', 'Value')

            # Determine the most granular time series data available
            if 'months' in data and data['months']:
                time_series_data = data['months']
            elif 'quarters' in data and data['quarters']:
                time_series_data = data['quarters']
            elif 'years' in data and data['years']:
                time_series_data = data['years']
            else:
                print("No time series data found in the response")
                continue

            # Create a DataFrame from the time series data
            df = pd.DataFrame(time_series_data)

            # Handle different frequencies in the data
            if 'date' in df.columns:
                if any(df['date'].str.contains('Q')):
                    df['date'] = pd.PeriodIndex(df['date'], freq='Q').to_timestamp()
                else:
                    df['date'] = pd.to_datetime(df['date'])

            df = df.rename(columns={'date': 'OBS', 'value': series_name})

            # Rename the value column
            new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
            df = df.rename(columns={series_name: new_col_name})

            # Track the renamed value column
            value_columns.append(new_col_name)

            # Merge the data based on the observation date
            daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')

        # Ensure columns are numeric
        for col in value_columns:
            if col in daily_df.columns:
                daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
            else:
                print(f"Column {col} not found in daily_df")

        # Aggregate results by week
        ons_df_final = dataprocessing.aggregate_daily_to_wc_wide(self, df=daily_df,
                                                                 date_column="OBS",
                                                                 group_columns=[],
                                                                 sum_columns=value_columns,
                                                                 wc=week_commencing,
                                                                 aggregation="average")

        return ons_df_final
```
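A sketch of `pull_ons_data`. The series/dataset IDs below are illustrative placeholders (substitute the ONS series you actually need, per the docstring), and the call hits the public ONS API over the network.

```python
from imsciences.datapull import datapull

dp = datapull()

# Illustrative ONS series -- each entry needs 'series_id' and 'dataset_id'.
series_list = [
    {"series_id": "D7BT", "dataset_id": "MM23"},  # placeholder CPI-style series
]

ons_df = dp.pull_ons_data(series_list, week_commencing="mon")
print(ons_df.head())
```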
imsciences/datapull.py (continued)

```python
    ############################################################### Seasonality ##########################################################################

    def pull_combined_dummies(self, week_commencing):
        # Week commencing dictionary
        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

        # Create daily date range dataframe
        date_range = pd.date_range(datetime.datetime(2015, 1, 1), datetime.date.today(), freq="d")
        df_daily = pd.DataFrame(date_range, columns=["Date"])

        # Create weekly date range dataframe
        df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
        df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
        df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)

        df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
        df_weekly_start.set_index("Date", inplace=True)

        # Create individual weekly dummies
        dummy_columns = {}
        for i in range(len(df_weekly_start)):
            col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
            dummy_columns[col_name] = [0] * len(df_weekly_start)
            dummy_columns[col_name][i] = 1

        df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
        df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)

        # Create monthly dummies
        df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
        df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
        df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
        df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})

        df_monthly_dummies.set_index("Date", inplace=True)
        df_monthly_dummies = df_monthly_dummies / 7

        # Combine weekly and monthly dataframes
        df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)

        # Create weekly dummies
        df_combined.reset_index(inplace=True)
        df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
        df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])

        # Create yearly dummies
        df_combined["Year"] = df_combined["Date"].dt.year
        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])

        # Add constant
        df_combined["Constant"] = 1

        # Add trend
        df_combined["Trend"] = df_combined.index + 1

        # Set date as index
        df_combined.set_index("Date", inplace=True)

        # Create COVID lockdown dummies
        lockdown_periods = [
            # Lockdown 1
            ("2020-03-23", "2020-05-24"),
            # Lockdown 2
            ("2020-11-05", "2020-12-02"),
            # Lockdown 3
            ("2021-01-04", "2021-03-08")
        ]

        df_covid = pd.DataFrame(date_range, columns=["Date"])
        df_covid["national_lockdown"] = 0

        for start, end in lockdown_periods:
            df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1

        df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
        df_covid.drop("Date", axis=1, inplace=True)
        df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
        df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
        df_national_lockdown_total.rename(columns={"national_lockdown": "covid_national_lockdown_total"}, inplace=True)

        df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
        df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
        df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)

        df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
        df_national_lockdown_1.rename(columns={"covid_national_lockdown_total": "covid_national_lockdown_1"}, inplace=True)

        df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
        df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
        df_national_lockdown_2.rename(columns={"covid_national_lockdown_total": "covid_national_lockdown_2"}, inplace=True)

        df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
        df_national_lockdown_3.rename(columns={"covid_national_lockdown_total": "covid_national_lockdown_3"}, inplace=True)

        df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
        df_final_covid.reset_index(inplace=True)
        df_final_covid.rename(columns={"index": "OBS"}, inplace=True)

        # Create seasonal indicators for the last day and last Friday of the month
        min_date = '2019-12-29'
        max_date = datetime.date.today().strftime('%Y-%m-%d')
        date_range_seas = pd.date_range(start=min_date, end=max_date)

        df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
        df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)

        def is_last_friday(date):
            last_day_of_month = date.to_period('M').to_timestamp('M')
            last_day_weekday = last_day_of_month.dayofweek
            if last_day_weekday >= 4:
                days_to_subtract = last_day_weekday - 4
            else:
                days_to_subtract = last_day_weekday + 3
            last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
            return 1 if date == last_friday else 0

        df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)

        df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
        df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
        df_seas.set_index("Date", inplace=True)

        # Combine all dataframes
        df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
        df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
        df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')

        # Fill any NaN values with 0
        df_final_combined.fillna(0, inplace=True)

        return df_final_combined
```
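A sketch of the intended call shape for `pull_combined_dummies`. Note that, as released, the method references `datetime.datetime(...)` and `datetime.date.today()` while the module also does `from datetime import datetime`, so the call may raise an `AttributeError` until that is reconciled; the sketch only illustrates usage.

```python
from imsciences.datapull import datapull

dp = datapull()

# Weekly dummy matrix: per-week dummies, monthly/weekly/yearly seasonality,
# a constant, a trend and COVID national-lockdown indicators.
dummies_df = dp.pull_combined_dummies(week_commencing="mon")
print(dummies_df.filter(like="covid_").sum())
```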
imsciences/geo.py
ADDED
@@ -0,0 +1,195 @@

```python
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import DateRange
from google.analytics.data_v1beta.types import Dimension
from google.analytics.data_v1beta.types import Metric
from google.analytics.data_v1beta.types import RunReportRequest
from google.analytics.data_v1beta.types import OrderBy
from google.analytics.data_v1beta.types import Filter
from google.analytics.data_v1beta.types import FilterExpression
from google.analytics.data_v1beta.types import FilterExpressionList
from google.auth.exceptions import DefaultCredentialsError
import logging
from datetime import datetime, timedelta
import os
import numpy as np

class geoprocessing:

    def help(self):

        print("\n1. pull_ga")
        print(" - Description: Pull in GA4 data for geo experiments.")
        print(" - Usage: pull_ga(credentials_file, property_id, start_date, country, metrics)")
        print(" - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])")

        print("\n2. process_itv_analysis")
        print(" - Description: Pull in GA4 data for geo experiments.")
        print(" - Usage: process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2)")
        print(" - Example:process_itv_analysis(df,'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'])")

    def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
        """
        Pulls Google Analytics data using the BetaAnalyticsDataClient.

        Parameters:
            credentials_file (str): Path to the JSON credentials file.
            property_id (str): Google Analytics property ID.
            start_date (str): Start date in 'YYYY-MM-DD' format.
            country (str): Country to filter the data by.
            metrics (list): List of metrics to retrieve (e.g., ["totalUsers", "sessions"]).

        Returns:
            pd.DataFrame: A pandas DataFrame containing the fetched data.
        """
        try:
            end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

            if not os.path.exists(credentials_file):
                raise FileNotFoundError(f"Credentials file '{credentials_file}' not found.")
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file

            try:
                client = BetaAnalyticsDataClient()
            except DefaultCredentialsError as e:
                raise DefaultCredentialsError(
                    f"Failed to initialize Google Analytics client: {e}"
                )

            def format_report(request):
                response = client.run_report(request)
                # Row index
                row_index_names = [header.name for header in response.dimension_headers]
                row_header = []
                for i in range(len(row_index_names)):
                    row_header.append([row.dimension_values[i].value for row in response.rows])

                row_index_named = pd.MultiIndex.from_arrays(np.array(row_header), names=np.array(row_index_names))
                # Row flat data
                metric_names = [header.name for header in response.metric_headers]
                data_values = []
                for i in range(len(metric_names)):
                    data_values.append([row.metric_values[i].value for row in response.rows])

                output = pd.DataFrame(data=np.transpose(np.array(data_values, dtype='f')),
                                      index=row_index_named, columns=metric_names)
                return output

            all_dfs = []
            offset_value = 0
            batch_size = 100000

            while True:
                metric_objects = [Metric(name=metric) for metric in metrics]

                request = RunReportRequest(
                    property='properties/' + property_id,
                    dimensions=[Dimension(name="date"), Dimension(name="city")],
                    metrics=metric_objects,
                    order_bys=[OrderBy(dimension={'dimension_name': 'date'}),
                               OrderBy(dimension={'dimension_name': 'city'})],
                    date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
                    limit=batch_size,
                    offset=offset_value,
                    dimension_filter=FilterExpression(
                        and_group=FilterExpressionList(
                            expressions=[
                                FilterExpression(
                                    filter=Filter(
                                        field_name="country",
                                        string_filter=Filter.StringFilter(value=country),
                                    )
                                ),
                            ]
                        )
                    )
                )

                df = format_report(request)
                if df.empty:
                    break

                df = df.reset_index()
                df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
                all_dfs.append(df)
                offset_value += batch_size

            if not all_dfs:
                return pd.DataFrame()

            final_df = pd.concat(all_dfs, ignore_index=True)
            return final_df

        except FileNotFoundError as e:
            logging.error(f"FileNotFoundError: {e}")
            raise
        except DefaultCredentialsError as e:
            logging.error(f"DefaultCredentialsError: {e}")
            raise
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            raise
```
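A usage sketch for `pull_ga`, reusing the placeholder credentials file, property ID, start date and metrics from the `help()` text above; a real GA4 property and service-account key file are required for the call to succeed.

```python
from imsciences.geo import geoprocessing

geo = geoprocessing()

# All argument values below are placeholders from the help() example.
ga_df = geo.pull_ga(
    credentials_file="GeoExperiment-31c5f5db2c39.json",
    property_id="111111111",
    start_date="2023-10-15",
    country="United Kingdom",
    metrics=["totalUsers", "newUsers"],
)
print(ga_df.head())
```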
imsciences/geo.py (continued)

```python
    def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2):
        """
        Process ITV analysis by mapping geos, grouping data, and merging with media spend.

        Parameters:
            raw_df (pd.DataFrame): Raw input data containing 'geo', 'newUsers', 'totalRevenue', and 'date'.
            itv_path (str): Path to the ITV regional mapping CSV file.
            cities_path (str): Path to the Geo Mappings Excel file.
            media_spend_path (str): Path to the media spend Excel file.
            output_path (str): Path to save the final output CSV file.
            group1 (list): List of geo regions for group 1.
            group2 (list): List of geo regions for group 2.

        Returns:
            None
        """
        # Load and preprocess data
        itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
        cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])

        itv['geometry'] = itv.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
        cities['geometry'] = cities.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)

        itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
        cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')

        # Perform spatial join to match geos
        joined_gdf = gpd.sjoin_nearest(itv_gdf, cities_gdf, how='inner', distance_col='distance')
        matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])

        # Handle unmatched geos
        unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
        unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
        nearest_unmatched_gdf = gpd.sjoin_nearest(unmatched_cities_gdf, itv_gdf, how='inner', distance_col='distance')

        unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
        unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']

        matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])

        # Group and filter data
        merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
        merged_df = merged_df[merged_df["geo"] != "(not set)"].drop(columns=['geo'])
        merged_df = merged_df.rename(columns={'ITV Region': 'geo', 'newUsers': 'response'})

        grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg({'response': 'sum', 'totalRevenue': 'sum'})
        filtered_df = grouped_df[grouped_df['geo'].isin(group1 + group2)].copy()

        assignment_map = {city: 1 for city in group1}
        assignment_map.update({city: 2 for city in group2})
        filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)

        # Merge with media spend data
        media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
        analysis_df = pd.merge(filtered_df, media_spend_df, on=['date', 'geo'], how='left')
        analysis_df['cost'] = analysis_df['cost'].fillna(0)

        # Save the final output
        analysis_df.to_csv(output_path, index=False)

        return analysis_df
```
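Finally, a sketch of `process_itv_analysis`, using the file paths and region groups from the `help()` example above. Per the docstring, `raw_df` must carry 'date', 'geo', 'newUsers' and 'totalRevenue' columns; the `ga4_geo_export.csv` name here is purely hypothetical.

```python
import pandas as pd
from imsciences.geo import geoprocessing

geo = geoprocessing()

# Hypothetical export with 'date', 'geo', 'newUsers' and 'totalRevenue' columns.
raw_df = pd.read_csv("ga4_geo_export.csv")

analysis_df = geo.process_itv_analysis(
    raw_df,
    "itv regional mapping.csv",             # ITV regional mapping CSV
    "Geo_Mappings_with_Coordinates.xlsx",   # geo/city coordinates
    "IMS.xlsx",                             # media spend by date and geo
    "itv_for_test_analysis_itvx.csv",       # output CSV path
    ["West", "Westcountry", "Tyne Tees"],   # group 1 regions
    ["Central Scotland", "North Scotland"], # group 2 regions
)
```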