imsciences 0.9.6.9__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-0.9.6.9 → imsciences-1.0.1}/PKG-INFO +1 -1
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences/__init__.py +2 -2
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences/geo.py +173 -115
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences/mmm.py +930 -409
- imsciences-1.0.1/imsciences/pull.py +3091 -0
- imsciences-1.0.1/imsciences/vis.py +739 -0
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences.egg-info/PKG-INFO +1 -1
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences.egg-info/SOURCES.txt +1 -1
- imsciences-1.0.1/pyproject.toml +77 -0
- imsciences-1.0.1/setup.py +121 -0
- imsciences-0.9.6.9/imsciences/pull.py +0 -2293
- imsciences-0.9.6.9/imsciences/unittesting.py +0 -1063
- imsciences-0.9.6.9/imsciences/vis.py +0 -196
- imsciences-0.9.6.9/setup.py +0 -38
- {imsciences-0.9.6.9 → imsciences-1.0.1}/LICENSE.txt +0 -0
- {imsciences-0.9.6.9 → imsciences-1.0.1}/README.md +0 -0
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences.egg-info/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.9.6.9 → imsciences-1.0.1}/setup.cfg +0 -0
{imsciences-0.9.6.9 → imsciences-1.0.1}/imsciences/geo.py

@@ -1,65 +1,88 @@
-import pandas as pd
-from google.analytics.data_v1beta import BetaAnalyticsDataClient
-from google.analytics.data_v1beta.types import DateRange
-from google.analytics.data_v1beta.types import Dimension
-from google.analytics.data_v1beta.types import Metric
-from google.analytics.data_v1beta.types import RunReportRequest
-from google.analytics.data_v1beta.types import OrderBy
-from google.analytics.data_v1beta.types import Filter
-from google.analytics.data_v1beta.types import FilterExpression
-from google.analytics.data_v1beta.types import FilterExpressionList
-from google.auth.exceptions import DefaultCredentialsError
 import logging
-from datetime import datetime, timedelta
 import os
+from datetime import datetime, timedelta
+
 import numpy as np
+import pandas as pd
+from google.analytics.data_v1beta import BetaAnalyticsDataClient
+from google.analytics.data_v1beta.types import (
+    DateRange,
+    Dimension,
+    Filter,
+    FilterExpression,
+    FilterExpressionList,
+    Metric,
+    OrderBy,
+    RunReportRequest,
+)
+from google.auth.exceptions import DefaultCredentialsError
 from scipy.spatial import cKDTree

+
 class geoprocessing:
-
     def help(self):
-
         print("\n1. pull_ga")
         print(" - Description: Pull in GA4 data for geo experiments.")
-        print(
-
+        print(
+            " - Usage: pull_ga(credentials_file, property_id, start_date, country, metrics)",
+        )
+        print(
+            " - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])",
+        )

         print("\n2. process_itv_analysis")
-        print(
-
-
-
+        print(
+            " - Description: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.",
+        )
+        print(
+            " - Usage: process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list",
+        )
+        print(
+            " - Example: process_itv_analysis(df, 'itv_regional_mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum']",
+        )
+
         print("\n3. process_city_analysis")
-        print(
-
-
-
+        print(
+            " - Description: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.",
+        )
+        print(
+            " - Usage: process_city_analysis(raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)",
+        )
+        print(
+            " - Example: process_city_analysis(df, spend, 'output.csv', ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'mean'])",
+        )
+
     def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
         """
         Pulls Google Analytics data using the BetaAnalyticsDataClient.

-        Parameters
+        Parameters
+        ----------
         credentials_file (str): Path to the JSON credentials file.
         property_id (str): Google Analytics property ID.
         start_date (str): Start date in 'YYYY-MM-DD' format.
         country (str): Country to filter the data by.
         metrics (list): List of metrics to retrieve (e.g., ["totalUsers", "sessions"]).

-        Returns
+        Returns
+        -------
         pd.DataFrame: A pandas DataFrame containing the fetched data.
+
         """
         try:
-            end_date = (datetime.now() - timedelta(days=1)).strftime(
+            end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

             if not os.path.exists(credentials_file):
-                raise FileNotFoundError(
+                raise FileNotFoundError(
+                    f"Credentials file '{credentials_file}' not found.",
+                )
             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file

             try:
                 client = BetaAnalyticsDataClient()
             except DefaultCredentialsError as e:
                 raise DefaultCredentialsError(
-                    f"Failed to initialize Google Analytics client: {e}"
+                    f"Failed to initialize Google Analytics client: {e}",
                 )

             def format_report(request):
@@ -68,32 +91,44 @@ class geoprocessing:
                 row_index_names = [header.name for header in response.dimension_headers]
                 row_header = []
                 for i in range(len(row_index_names)):
-                    row_header.append(
+                    row_header.append(
+                        [row.dimension_values[i].value for row in response.rows],
+                    )

-                row_index_named = pd.MultiIndex.from_arrays(
+                row_index_named = pd.MultiIndex.from_arrays(
+                    np.array(row_header),
+                    names=np.array(row_index_names),
+                )
                 # Row flat data
                 metric_names = [header.name for header in response.metric_headers]
                 data_values = []
                 for i in range(len(metric_names)):
-                    data_values.append(
+                    data_values.append(
+                        [row.metric_values[i].value for row in response.rows],
+                    )

-                output = pd.DataFrame(
-
+                output = pd.DataFrame(
+                    data=np.transpose(np.array(data_values, dtype="f")),
+                    index=row_index_named,
+                    columns=metric_names,
+                )
                 return output

             all_dfs = []
             offset_value = 0
-            batch_size = 100000
+            batch_size = 100000

             while True:
                 metric_objects = [Metric(name=metric) for metric in metrics]

                 request = RunReportRequest(
-                    property=
+                    property="properties/" + property_id,
                     dimensions=[Dimension(name="date"), Dimension(name="city")],
                     metrics=metric_objects,
-                    order_bys=[
-
+                    order_bys=[
+                        OrderBy(dimension={"dimension_name": "date"}),
+                        OrderBy(dimension={"dimension_name": "city"}),
+                    ],
                     date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
                     limit=batch_size,
                     offset=offset_value,
@@ -103,37 +138,39 @@ class geoprocessing:
                                 FilterExpression(
                                     filter=Filter(
                                         field_name="country",
-                                        string_filter=Filter.StringFilter(
-
+                                        string_filter=Filter.StringFilter(
+                                            value=country,
+                                        ),
+                                    ),
                                 ),
-                            ]
-                        )
-                    )
+                            ],
+                        ),
+                    ),
                 )

                 df = format_report(request)
                 if df.empty:
-                    break
+                    break

                 df = df.reset_index()
-                df[
+                df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
                 all_dfs.append(df)
                 offset_value += batch_size

             if not all_dfs:
-                return pd.DataFrame()
+                return pd.DataFrame()

             final_df = pd.concat(all_dfs, ignore_index=True)
             return final_df

         except FileNotFoundError as e:
-            logging.
+            logging.exception(f"FileNotFoundError: {e}")
             raise
         except DefaultCredentialsError as e:
-            logging.
+            logging.exception(f"DefaultCredentialsError: {e}")
             raise
         except Exception as e:
-            logging.
+            logging.exception(f"An unexpected error occurred: {e}")
             raise

     def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
@@ -143,7 +180,8 @@ class geoprocessing:
         automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
         and assigning a numerical group based on provided test and control lists.

-        Parameters
+        Parameters
+        ----------
         raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
             plus metric columns that should be summed.
         city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
@@ -152,77 +190,94 @@ class geoprocessing:
         test_list (list): List of region names (strings) to be assigned the value 1.
         control_list (list): List of region names (strings) to be assigned the value 2.

-        Returns
+        Returns
+        -------
         pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
             with the metric columns summed and an additional 'assignment' column.
+
         """
-
         # Ensure the ITV lookup table has valid coordinate data
-        itv_lut = itv_lut.dropna(subset=[
-
+        itv_lut = itv_lut.dropna(subset=["Latitude", "Longitude"])
+
         # Merge raw_df with the city lookup table on 'geo'
-        merged_df = pd.merge(raw_df, city_lut, on=
-
+        merged_df = pd.merge(raw_df, city_lut, on="geo", how="left")
+
         # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
-        tree = cKDTree(itv_lut[[
-
+        tree = cKDTree(itv_lut[["Latitude", "Longitude"]].values)
+
         # For each record in merged_df, find the nearest ITV region based on coordinates
-        distances, indices = tree.query(
-
+        distances, indices = tree.query(
+            merged_df[["Latitude", "Longitude"]].values,
+            k=1,
+        )
+
         # Map the nearest ITV Region back to merged_df
-        merged_df[
-
+        merged_df["ITV Region"] = itv_lut.iloc[indices]["ITV Region"].values
+
         # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
-        metric_cols = [col for col in raw_df.columns if col not in [
-
+        metric_cols = [col for col in raw_df.columns if col not in ["date", "geo"]]
+
         # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
-        aggregated_df = merged_df.groupby([
-
+        aggregated_df = merged_df.groupby(["date", "ITV Region"], as_index=False)[
+            metric_cols
+        ].sum()
+
         # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
-        aggregated_df.rename(columns={
-
+        aggregated_df.rename(columns={"ITV Region": "geo"}, inplace=True)
+
         # Define a function to assign group values based on the region name
         def assign_value(region):
             if region in test_list:
                 return 1
-
+            if region in control_list:
                 return 2
-
-
-
+            return np.nan  # Or another default value if desired
+
         # Apply the assignment function and remove rows without a valid assignment
-        aggregated_df[
-        aggregated_df.dropna(subset=[
-        aggregated_df[
-
+        aggregated_df["assignment"] = aggregated_df["geo"].apply(assign_value)
+        aggregated_df.dropna(subset=["assignment"], inplace=True)
+        aggregated_df["assignment"] = aggregated_df["assignment"].astype(int)
+
         return aggregated_df
-
-    def process_city_analysis(
+
+    def process_city_analysis(
+        self,
+        raw_data,
+        spend_data,
+        output_path,
+        test_group,
+        control_group,
+        columns_to_aggregate,
+        aggregator_list,
+    ):
         """
-        Process city-level analysis by grouping data, applying custom aggregations,
+        Process city-level analysis by grouping data, applying custom aggregations,
         and merging with spend data.

-        Parameters
+        Parameters
+        ----------
         raw_data (str or pd.DataFrame):
             - Raw input data as a file path (CSV/XLSX) or a DataFrame.
             - Must contain 'date' and 'city' columns, plus any columns to be aggregated.
         spend_data (str or pd.DataFrame):
             - Spend data as a file path (CSV/XLSX) or a DataFrame.
             - Must contain 'date', 'geo', and 'cost' columns.
-        output_path (str):
+        output_path (str):
             - Path to save the final output file (CSV or XLSX).
-        group1 (list):
+        group1 (list):
             - List of city regions to be considered "Test Group" or "Group 1".
-        group2 (list):
+        group2 (list):
             - List of city regions to be considered "Control Group" or "Group 2".
-        columns_to_aggregate (list):
+        columns_to_aggregate (list):
            - List of columns to apply aggregation to, e.g. ['newUsers', 'transactions'].
-        aggregator_list (list):
+        aggregator_list (list):
            - List of corresponding aggregation functions, e.g. ['sum', 'mean'].
            - Must be the same length as columns_to_aggregate.

-        Returns
+        Returns
+        -------
         pd.DataFrame: The final merged, aggregated DataFrame.
+
         """

         def read_file(data):
@@ -230,82 +285,85 @@ class geoprocessing:
             if isinstance(data, pd.DataFrame):
                 return data
             ext = os.path.splitext(data)[1].lower()
-            if ext ==
+            if ext == ".csv":
                 return pd.read_csv(data)
-
+            if ext in [".xlsx", ".xls"]:
                 return pd.read_excel(data)
-
-
+            raise ValueError(
+                "Unsupported file type. Please use a CSV or XLSX file.",
+            )

         def write_file(df, file_path):
             """Helper function to write DataFrame to CSV or XLSX files."""
             ext = os.path.splitext(file_path)[1].lower()
-            if ext ==
+            if ext == ".csv":
                 df.to_csv(file_path, index=False)
-            elif ext in [
-                df.to_excel(file_path, index=False, engine=
+            elif ext in [".xlsx", ".xls"]:
+                df.to_excel(file_path, index=False, engine="openpyxl")
             else:
-                raise ValueError(
+                raise ValueError(
+                    "Unsupported file type. Please use a CSV or XLSX file.",
+                )

         # -----------------------
         # 1. Read and validate data
         # -----------------------
         raw_df = read_file(raw_data)
-        raw_df = raw_df.rename(columns={
-        spend_df = read_file(spend_data).rename(columns={
+        raw_df = raw_df.rename(columns={"city": "geo"})
+        spend_df = read_file(spend_data).rename(columns={"Cost": "cost"})

         # Columns we minimally need in raw_df
-        required_columns = {
+        required_columns = {"date", "geo"}
         # Ensure the columns to aggregate are there
         required_columns = required_columns.union(set(columns_to_aggregate))
         missing_in_raw = required_columns - set(raw_df.columns)
         if missing_in_raw:
             raise ValueError(
-                f"The raw data is missing the following required columns: {missing_in_raw}"
+                f"The raw data is missing the following required columns: {missing_in_raw}",
             )

         # Validate spend data
-        spend_required_columns = {
+        spend_required_columns = {"date", "geo", "cost"}
         missing_in_spend = spend_required_columns - set(spend_df.columns)
         if missing_in_spend:
             raise ValueError(
-                f"The spend data is missing the following required columns: {missing_in_spend}"
+                f"The spend data is missing the following required columns: {missing_in_spend}",
             )

         # -----------------------
         # 2. Clean and prepare spend data
         # -----------------------
         # Convert cost column to numeric after stripping currency symbols and commas
-        spend_df[
-        spend_df[
-            .replace('[^\\d.]', '', regex=True)
-            .astype(float)
+        spend_df["cost"] = (
+            spend_df["cost"].replace("[^\\d.]", "", regex=True).astype(float)
         )

         # -----------------------
         # 3. Prepare raw data
         # -----------------------
         # Filter only the relevant geos
-        filtered_df = raw_df[raw_df[
+        filtered_df = raw_df[raw_df["geo"].isin(test_group + control_group)].copy()
         # -----------------------
         # 4. Group and aggregate
         # -----------------------
         # Create a dictionary of {col: agg_function}
         if len(columns_to_aggregate) != len(aggregator_list):
             raise ValueError(
-                "columns_to_aggregate and aggregator_list must have the same length."
+                "columns_to_aggregate and aggregator_list must have the same length.",
             )
         aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))

         # Perform groupby using the aggregator dictionary
-        grouped_df = filtered_df.groupby([
+        grouped_df = filtered_df.groupby(["date", "geo"], as_index=False).agg(
+            aggregation_dict,
+        )

         # -----------------------
         # 5. Map groups (Test vs. Control)
-        # -----------------------
-        assignment_map =
-        assignment_map.update(
-        grouped_df[
+        # -----------------------
+        assignment_map = dict.fromkeys(test_group, 1)
+        assignment_map.update(dict.fromkeys(control_group, 2))
+        grouped_df["assignment"] = grouped_df["geo"].map(assignment_map)

         # -----------------------
         # 6. Merge with spend data
@@ -313,16 +371,16 @@ class geoprocessing:
         merged_df = pd.merge(
             grouped_df,
             spend_df,  # has date, geo, cost
-            on=[
-            how=
+            on=["date", "geo"],
+            how="left",
         )

         # Fill missing cost with 0
-        merged_df[
+        merged_df["cost"] = merged_df["cost"].fillna(0)

         # -----------------------
         # 7. Write out results
         # -----------------------
         write_file(merged_df, output_path)

-        return merged_df
+        return merged_df
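
For orientation, the sketch below stitches together the example calls from the rewritten help() text in the diff above. It is illustrative only: the import path, the spend-file name, and the output path are assumptions, not part of the release, while the credentials file, property ID, and city names are the placeholder values the module's own help output uses.

# Hedged usage sketch assembled from the help() examples in the new geo.py.
# The import path and file names are assumptions for illustration only.
from imsciences.geo import geoprocessing

gp = geoprocessing()

# Pull daily, city-level GA4 data from the start date up to yesterday.
ga_df = gp.pull_ga(
    "GeoExperiment-31c5f5db2c39.json",  # service-account credentials JSON (placeholder)
    "111111111",                        # GA4 property ID (placeholder)
    "2023-10-15",                       # start date, YYYY-MM-DD
    "United Kingdom",                   # country filter
    ["totalUsers", "newUsers"],         # metrics to retrieve
)

# Aggregate the pull into test/control city groups, merge with spend data
# (which must carry 'date', 'geo', and 'cost'/'Cost'), and write the result.
result = gp.process_city_analysis(
    ga_df,                        # raw data with 'date' and 'city' columns
    "media_spend.xlsx",           # hypothetical spend file
    "output.csv",                 # where the merged table is written
    ["Barnsley"],                 # test group -> assignment 1
    ["Aberdeen"],                 # control group -> assignment 2
    ["totalUsers", "newUsers"],   # columns to aggregate
    ["sum", "sum"],               # matching aggregation functions
)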