imsciences 0.5.4.7__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/datapull.py ADDED
@@ -0,0 +1,374 @@
+ import pandas as pd
+ import calendar
+ import requests
+ import os
+ import plotly.express as px
+ import plotly.graph_objs as go
+ import numpy as np
+ import re
+ from imsciences import *
+ from fredapi import Fred
+ import time
+ from datetime import datetime
+ from datafunctions import dataprocessing
+
+ class datapull:
+
+     def pull_help(self):
+         print("This is the help section. The functions in the package are as follows:")
+
+         print("\n1. pull_fred_data")
+         print(" - Description: Get data from FRED by using series id tokens.")
+         print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
+         print(" - Example: pull_fred_data('sun', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")
+
+     ############################################################### MACRO ##########################################################################
+
+     def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
+         '''
+         Parameters
+         ----------
+         week_commencing : str
+             specify the day the week commences on; the default is 'mon' (options: 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+
+         series_id_list : list[str]
+             provide a list of IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). The default list is
+             ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]
+
+         Returns
+         ----------
+         pd.DataFrame
+             A data frame with FRED data for the series IDs provided
+
+         Example
+         ----------
+         pull_fred_data("mon", ["GCEC1", "SP500"])
+         '''
+         # Fred API
+         fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
+
+         # Fetch the metadata for each series to get the full names
+         series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}
+
+         # Download data for each series in the list
+         fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}
+
+         # Data processing
+         date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
+         fred_series_df = pd.DataFrame(date_range)
+
+         for series_id, series_data in fred_series.items():
+             series_data = series_data.reset_index()
+             series_data.columns = ['OBS', series_names[series_id]]  # Use the series name as the column header
+             fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')
+
+         # Handle duplicate columns
+         for col in fred_series_df.columns:
+             if '_x' in col:
+                 base_col = col.replace('_x', '')
+                 fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
+                 fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)
+
+         # Ensure sum_columns are present in the DataFrame
+         sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]
+
+         # Aggregate results by week
+         fred_df_final = dataprocessing.aggregate_daily_to_wc_wide(self, df=fred_series_df,
+                                                                   date_column="OBS",
+                                                                   group_columns=[],
+                                                                   sum_columns=sum_columns,
+                                                                   wc=week_commencing,
+                                                                   aggregation="average")
+
+         # Drop anything after the first ':' in the column names and add a 'macro_' prefix, except for 'OBS'
+         fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]
+
+         return fred_df_final
+
+     def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
+         """
+         Fetch and process Bank of England interest rate data.
+
+         Args:
+             week_commencing (str): The starting day of the week for aggregation.
+                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+                 Default is "mon".
+             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
+             delay (int): Delay in seconds between retry attempts. Default is 5.
+
+         Returns:
+             pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
+                 The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
+                 and 'macro_boe_intr_rate' contains the average interest rate for the week.
+         """
+         # Week commencing dictionary
+         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+
+         # Function to fetch the data with retries
+         def fetch_data_with_retries(url, max_retries, delay):
+             for attempt in range(max_retries):
+                 try:
+                     html_table = pd.read_html(url)[0]
+                     return html_table
+                 except Exception as e:
+                     print(f"Attempt {attempt + 1} failed: {e}")
+                     if attempt < max_retries - 1:
+                         time.sleep(delay)
+                     else:
+                         raise
+
+         # Import HTML data for the Bank of England rate
+         url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
+         html_table = fetch_data_with_retries(url, max_retries, delay)
+
+         df = pd.DataFrame(html_table)
+         df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
+
+         # Change the date column to datetime and find the corresponding week for each date
+         df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
+         df.sort_values("OBS", axis=0, inplace=True)
+
+         # Create a daily date range and find the week commencing for each day
+         date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
+         df_daily = pd.DataFrame(date_range, columns=["OBS"])
+
+         # Adjust each date to the specified week commencing day
+         df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+
+         # Left-merge the BoE rates onto the daily date range and forward fill the blanks
+         df_final = df_daily.merge(df, on='OBS', how="left")
+         df_final["macro_boe_intr_rate"] = df_final["macro_boe_intr_rate"].ffill()
+
+         # Group by the week start date and get the mean of the interest rates for each week
+         df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
+
+         df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
+         df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
+
+         return df_final
+
+     def pull_ons_data(self, series_list, week_commencing):
+         """
+         Fetch and process time series data from the ONS API.
+
+         Args:
+             series_list (list): A list of dictionaries where each dictionary represents a time series.
+                 Each dictionary should have the keys 'series_id' and 'dataset_id'.
+             week_commencing (str): The starting day of the week for aggregation.
+                 Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+
+         Returns:
+             pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
+                 commencing dates and the other columns contain the aggregated time series values.
+         """
+         # Generate a date range from 1950-01-01 to today
+         date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
+         daily_df = pd.DataFrame(date_range, columns=['OBS'])
+
+         # Keep track of the renamed value columns
+         value_columns = []
+
+         for series in series_list:
+             series_id = series['series_id']
+             dataset_id = series['dataset_id']
+
+             # Construct the URL for data
+             data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
+
+             # Make the request to the ONS API for data
+             data_response = requests.get(data_url)
+
+             # Check if the request was successful
+             if data_response.status_code != 200:
+                 print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
+                 continue
+
+             # Parse the JSON response for data
+             data = data_response.json()
+
+             # Attempt to extract the name of the time series from the data response
+             series_name = data.get('description', {}).get('title', 'Value')
+
+             # Determine the most granular time series data available
+             if 'months' in data and data['months']:
+                 time_series_data = data['months']
+             elif 'quarters' in data and data['quarters']:
+                 time_series_data = data['quarters']
+             elif 'years' in data and data['years']:
+                 time_series_data = data['years']
+             else:
+                 print("No time series data found in the response")
+                 continue
+
+             # Create a DataFrame from the time series data
+             df = pd.DataFrame(time_series_data)
+
+             # Handle different frequencies in the data
+             if 'date' in df.columns:
+                 if any(df['date'].str.contains('Q')):
+                     df['date'] = pd.PeriodIndex(df['date'], freq='Q').to_timestamp()
+                 else:
+                     df['date'] = pd.to_datetime(df['date'])
+
+             df = df.rename(columns={'date': 'OBS', 'value': series_name})
+
+             # Rename the value column
+             new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
+             df = df.rename(columns={series_name: new_col_name})
+
+             # Track the renamed value column
+             value_columns.append(new_col_name)
+
+             # Merge the data based on the observation date
+             daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
+
+         # Ensure columns are numeric
+         for col in value_columns:
+             if col in daily_df.columns:
+                 daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
+             else:
+                 print(f"Column {col} not found in daily_df")
+
+         # Aggregate results by week
+         ons_df_final = dataprocessing.aggregate_daily_to_wc_wide(self, df=daily_df,
+                                                                  date_column="OBS",
+                                                                  group_columns=[],
+                                                                  sum_columns=value_columns,
+                                                                  wc=week_commencing,
+                                                                  aggregation="average")
+
+         return ons_df_final
+
+     ############################################################### Seasonality ##########################################################################
+
+     def pull_combined_dummies(self, week_commencing):
+         # Week commencing dictionary
+         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+
+         # Create daily date range dataframe
+         date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
+         df_daily = pd.DataFrame(date_range, columns=["Date"])
+
+         # Create weekly date range dataframe
+         df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+         df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
+         df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
+
+         df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
+         df_weekly_start.set_index("Date", inplace=True)
+
+         # Create individual weekly dummies
+         dummy_columns = {}
+         for i in range(len(df_weekly_start)):
+             col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
+             dummy_columns[col_name] = [0] * len(df_weekly_start)
+             dummy_columns[col_name][i] = 1
+
+         df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
+         df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
+
+         # Create monthly dummies
+         df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
+         df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
+         df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+         df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
+
+         df_monthly_dummies.set_index("Date", inplace=True)
+         df_monthly_dummies = df_monthly_dummies / 7
+
+         # Combine weekly and monthly dataframes
+         df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
+
+         # Create week-of-year dummies
+         df_combined.reset_index(inplace=True)
+         df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
+         df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
+
+         # Create yearly dummies
+         df_combined["Year"] = df_combined["Date"].dt.year
+         df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
+
+         # Add constant
+         df_combined["Constant"] = 1
+
+         # Add trend
+         df_combined["Trend"] = df_combined.index + 1
+
+         # Set date as index
+         df_combined.set_index("Date", inplace=True)
+
+         # Create COVID lockdown dummies
+         lockdown_periods = [
+             # Lockdown 1
+             ("2020-03-23", "2020-05-24"),
+             # Lockdown 2
+             ("2020-11-05", "2020-12-02"),
+             # Lockdown 3
+             ("2021-01-04", "2021-03-08")
+         ]
+
+         df_covid = pd.DataFrame(date_range, columns=["Date"])
+         df_covid["national_lockdown"] = 0
+
+         for start, end in lockdown_periods:
+             df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
+
+         df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+         df_covid.drop("Date", axis=1, inplace=True)
+         df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
+         df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
+         df_national_lockdown_total.rename(columns={"national_lockdown": "covid_national_lockdown_total"}, inplace=True)
+
+         df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
+         df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
+         df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
+
+         df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
+         df_national_lockdown_1.rename(columns={"covid_national_lockdown_total": "covid_national_lockdown_1"}, inplace=True)
+
+         df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
+         df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
+         df_national_lockdown_2.rename(columns={"covid_national_lockdown_total": "covid_national_lockdown_2"}, inplace=True)
+
+         df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
+         df_national_lockdown_3.rename(columns={"covid_national_lockdown_total": "covid_national_lockdown_3"}, inplace=True)
+
+         df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
+         df_final_covid.reset_index(inplace=True)
+         df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
+
+         # Create seasonal indicators for the last day and last Friday of the month
+         min_date = '2019-12-29'
+         max_date = datetime.today().strftime('%Y-%m-%d')
+         date_range_seas = pd.date_range(start=min_date, end=max_date)
+
+         df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
+         df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
+
+         def is_last_friday(date):
+             last_day_of_month = date.to_period('M').to_timestamp('M')
+             last_day_weekday = last_day_of_month.dayofweek
+             if last_day_weekday >= 4:
+                 days_to_subtract = last_day_weekday - 4
+             else:
+                 days_to_subtract = last_day_weekday + 3
+             last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
+             return 1 if date == last_friday else 0
+
+         df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)
+
+         df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+         df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
+         df_seas.set_index("Date", inplace=True)
+
+         # Combine all dataframes
+         df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
+         df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
+         df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
+
+         # Fill any NaN values with 0
+         df_final_combined.fillna(0, inplace=True)
+
+         return df_final_combined
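
For reference, a minimal usage sketch of the new datapull class (not part of the release itself; it assumes the 0.9.3 wheel is installed, that imsciences.datapull is importable as laid out above, and that the FRED and ONS series shown are merely illustrative):

from imsciences.datapull import datapull

puller = datapull()

# Weekly (week commencing Monday) macro series from FRED, using two of the default series IDs.
fred_df = puller.pull_fred_data("mon", ["GPDIC1", "GCEC1"])

# Bank of England base rate, averaged per week.
boe_df = puller.pull_boe_data(week_commencing="mon")

# ONS series are identified by a series_id/dataset_id pair (IDs here are illustrative).
ons_df = puller.pull_ons_data([{"series_id": "L55O", "dataset_id": "MM23"}], "mon")

# Weekly seasonal, trend and COVID-lockdown dummy variables on the same weekly grain.
dummies_df = puller.pull_combined_dummies("mon")
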
imsciences/geo.py ADDED
@@ -0,0 +1,195 @@
+ import pandas as pd
+ import geopandas as gpd
+ from shapely.geometry import Point
+ from google.analytics.data_v1beta import BetaAnalyticsDataClient
+ from google.analytics.data_v1beta.types import (
+     DateRange,
+     Dimension,
+     Metric,
+     RunReportRequest,
+     OrderBy,
+     Filter,
+     FilterExpression,
+     FilterExpressionList,
+ )
+ from google.auth.exceptions import DefaultCredentialsError
+ import logging
+ from datetime import datetime, timedelta
+ import os
+ import numpy as np
+
+ class geoprocessing:
+
+     def help(self):
+
+         print("\n1. pull_ga")
+         print(" - Description: Pull in GA4 data for geo experiments.")
+         print(" - Usage: pull_ga(credentials_file, property_id, start_date, country, metrics)")
+         print(" - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])")
+
+         print("\n2. process_itv_analysis")
+         print(" - Description: Process ITV analysis by mapping geos, grouping data, and merging with media spend.")
+         print(" - Usage: process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2)")
+         print(" - Example: process_itv_analysis(df, 'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'])")
+
+     def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
+         """
+         Pulls Google Analytics data using the BetaAnalyticsDataClient.
+
+         Parameters:
+             credentials_file (str): Path to the JSON credentials file.
+             property_id (str): Google Analytics property ID.
+             start_date (str): Start date in 'YYYY-MM-DD' format.
+             country (str): Country to filter the data by.
+             metrics (list): List of metrics to retrieve (e.g., ["totalUsers", "sessions"]).
+
+         Returns:
+             pd.DataFrame: A pandas DataFrame containing the fetched data.
+         """
+         try:
+             end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
+
+             if not os.path.exists(credentials_file):
+                 raise FileNotFoundError(f"Credentials file '{credentials_file}' not found.")
+             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file
+
+             try:
+                 client = BetaAnalyticsDataClient()
+             except DefaultCredentialsError as e:
+                 raise DefaultCredentialsError(
+                     f"Failed to initialize Google Analytics client: {e}"
+                 )
+
+             def format_report(request):
+                 response = client.run_report(request)
+                 # Row index
+                 row_index_names = [header.name for header in response.dimension_headers]
+                 row_header = []
+                 for i in range(len(row_index_names)):
+                     row_header.append([row.dimension_values[i].value for row in response.rows])
+
+                 row_index_named = pd.MultiIndex.from_arrays(np.array(row_header), names=np.array(row_index_names))
+                 # Row flat data
+                 metric_names = [header.name for header in response.metric_headers]
+                 data_values = []
+                 for i in range(len(metric_names)):
+                     data_values.append([row.metric_values[i].value for row in response.rows])
+
+                 output = pd.DataFrame(data=np.transpose(np.array(data_values, dtype='f')),
+                                       index=row_index_named, columns=metric_names)
+                 return output
+
+             all_dfs = []
+             offset_value = 0
+             batch_size = 100000
+
+             while True:
+                 metric_objects = [Metric(name=metric) for metric in metrics]
+
+                 request = RunReportRequest(
+                     property='properties/' + property_id,
+                     dimensions=[Dimension(name="date"), Dimension(name="city")],
+                     metrics=metric_objects,
+                     order_bys=[OrderBy(dimension={'dimension_name': 'date'}),
+                                OrderBy(dimension={'dimension_name': 'city'})],
+                     date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
+                     limit=batch_size,
+                     offset=offset_value,
+                     dimension_filter=FilterExpression(
+                         and_group=FilterExpressionList(
+                             expressions=[
+                                 FilterExpression(
+                                     filter=Filter(
+                                         field_name="country",
+                                         string_filter=Filter.StringFilter(value=country),
+                                     )
+                                 ),
+                             ]
+                         )
+                     )
+                 )
+
+                 df = format_report(request)
+                 if df.empty:
+                     break
+
+                 df = df.reset_index()
+                 df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
+                 all_dfs.append(df)
+                 offset_value += batch_size
+
+             if not all_dfs:
+                 return pd.DataFrame()
+
+             final_df = pd.concat(all_dfs, ignore_index=True)
+             return final_df
+
+         except FileNotFoundError as e:
+             logging.error(f"FileNotFoundError: {e}")
+             raise
+         except DefaultCredentialsError as e:
+             logging.error(f"DefaultCredentialsError: {e}")
+             raise
+         except Exception as e:
+             logging.error(f"An unexpected error occurred: {e}")
+             raise
+
+     def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2):
+         """
+         Process ITV analysis by mapping geos, grouping data, and merging with media spend.
+
+         Parameters:
+             raw_df (pd.DataFrame): Raw input data containing 'geo', 'newUsers', 'totalRevenue', and 'date'.
+             itv_path (str): Path to the ITV regional mapping CSV file.
+             cities_path (str): Path to the Geo Mappings Excel file.
+             media_spend_path (str): Path to the media spend Excel file.
+             output_path (str): Path to save the final output CSV file.
+             group1 (list): List of geo regions for group 1.
+             group2 (list): List of geo regions for group 2.
+
+         Returns:
+             pd.DataFrame: The final analysis DataFrame, which is also written to output_path as a CSV.
+         """
+         # Load and preprocess data
+         itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
+         cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])
+
+         itv['geometry'] = itv.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
+         cities['geometry'] = cities.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
+
+         itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
+         cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
+
+         # Perform spatial join to match geos
+         joined_gdf = gpd.sjoin_nearest(itv_gdf, cities_gdf, how='inner', distance_col='distance')
+         matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
+
+         # Handle unmatched geos
+         unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
+         unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
+         nearest_unmatched_gdf = gpd.sjoin_nearest(unmatched_cities_gdf, itv_gdf, how='inner', distance_col='distance')
+
+         unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
+         unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
+
+         matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
+
+         # Group and filter data
+         merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
+         merged_df = merged_df[merged_df["geo"] != "(not set)"].drop(columns=['geo'])
+         merged_df = merged_df.rename(columns={'ITV Region': 'geo', 'newUsers': 'response'})
+
+         grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg({'response': 'sum', 'totalRevenue': 'sum'})
+         filtered_df = grouped_df[grouped_df['geo'].isin(group1 + group2)].copy()
+
+         assignment_map = {city: 1 for city in group1}
+         assignment_map.update({city: 2 for city in group2})
+         filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
+
+         # Merge with media spend data
+         media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
+         analysis_df = pd.merge(filtered_df, media_spend_df, on=['date', 'geo'], how='left')
+         analysis_df['cost'] = analysis_df['cost'].fillna(0)
+
+         # Save the final output
+         analysis_df.to_csv(output_path, index=False)
+
+         return analysis_df
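
And a matching sketch for the new geoprocessing class (again not part of the release; the credentials path, property ID and mapping/spend file names follow the examples printed by help() and are placeholders):

from imsciences.geo import geoprocessing

geo = geoprocessing()

# Daily GA4 metrics by city for the United Kingdom; 'totalRevenue' is requested because
# process_itv_analysis expects it alongside 'newUsers'.
ga_df = geo.pull_ga(
    credentials_file="GeoExperiment-31c5f5db2c39.json",  # placeholder service-account key
    property_id="111111111",                             # placeholder GA4 property
    start_date="2023-10-15",
    country="United Kingdom",
    metrics=["newUsers", "totalRevenue"],
)

# process_itv_analysis expects a 'geo' column, so rename the GA4 'city' dimension first.
analysis_df = geo.process_itv_analysis(
    ga_df.rename(columns={"city": "geo"}),
    "itv regional mapping.csv",
    "Geo_Mappings_with_Coordinates.xlsx",
    "IMS.xlsx",
    "itv_for_test_analysis_itvx.csv",
    ["West", "Westcountry", "Tyne Tees"],
    ["Central Scotland", "North Scotland"],
)
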