imsciences 0.9.5.4__py3-none-any.whl → 0.9.5.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of imsciences might be problematic. Click here for more details.

imsciences/geo.py CHANGED
@@ -26,14 +26,14 @@ class geoprocessing:
26
26
  print(" - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])")
27
27
 
28
28
  print("\n2. process_itv_analysis")
29
- print(" - Description: Pull in GA4 data for geo experiments.")
30
- print(" - Usage: process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2)")
31
- print(" - Example:process_itv_analysis(df,'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'])")
29
+ print(" - Description: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.")
30
+ print(" - Usage: process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list")
31
+ print(" - Example: process_itv_analysis(df, 'itv_regional_mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum']")
32
32
 
33
33
  print("\n3. process_city_analysis")
34
- print(" - Description: Processes city-level data for geo experiments by grouping user metrics, merging with media spend data, and saving the result.")
35
- print(" - Usage: process_city_analysis(raw_df, spend_df, output_path, group1, group2, response_column)")
36
- print(" - Example:process_city_analysis(df, spend, output, ['Barnsley'], ['Aberdeen'], 'newUsers')")
34
+ print(" - Description: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.")
35
+ print(" - Usage: process_city_analysis(raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)")
36
+ print(" - Example: process_city_analysis(df, spend, 'output.csv', ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'mean'])")
37
37
 
38
38
  def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
39
39
  """
@@ -137,23 +137,28 @@ class geoprocessing:
137
137
  logging.error(f"An unexpected error occurred: {e}")
138
138
  raise
139
139
 
140
- def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2):
140
+ def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
141
141
  """
142
142
  Process ITV analysis by mapping geos, grouping data, and merging with media spend.
143
143
 
144
144
  Parameters:
145
- raw_df (pd.DataFrame): Raw input data containing 'geo', 'newUsers', 'totalRevenue', and 'date'.
145
+ raw_df (pd.DataFrame): Raw input data containing columns such as 'geo', plus any metrics to be aggregated.
146
146
  itv_path (str): Path to the ITV regional mapping CSV file.
147
147
  cities_path (str): Path to the Geo Mappings Excel file.
148
148
  media_spend_path (str): Path to the media spend Excel file.
149
149
  output_path (str): Path to save the final output CSV file.
150
150
  group1 (list): List of geo regions for group 1.
151
151
  group2 (list): List of geo regions for group 2.
152
+ columns_to_aggregate (list): List of columns in `raw_df` that need aggregation.
153
+ aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
152
154
 
153
155
  Returns:
154
- None
156
+ pd.DataFrame: The final merged and aggregated DataFrame.
155
157
  """
156
- # Load and preprocess data
158
+
159
+ # -----------------------
160
+ # 1. Load and preprocess data
161
+ # -----------------------
157
162
  itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
158
163
  cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])
159
164
 
@@ -163,59 +168,114 @@ class geoprocessing:
163
168
  itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
164
169
  cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
165
170
 
166
- # Perform spatial join to match geos
167
- joined_gdf = gpd.sjoin_nearest(itv_gdf, cities_gdf, how='inner', distance_col='distance')
171
+ # -----------------------
172
+ # 2. Perform spatial join to match geos
173
+ # -----------------------
174
+ joined_gdf = gpd.sjoin_nearest(
175
+ itv_gdf,
176
+ cities_gdf,
177
+ how='inner',
178
+ distance_col='distance'
179
+ )
168
180
  matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
169
181
 
170
182
  # Handle unmatched geos
171
183
  unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
172
184
  unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
173
- nearest_unmatched_gdf = gpd.sjoin_nearest(unmatched_cities_gdf, itv_gdf, how='inner', distance_col='distance')
185
+
186
+ nearest_unmatched_gdf = gpd.sjoin_nearest(
187
+ unmatched_cities_gdf,
188
+ itv_gdf,
189
+ how='inner',
190
+ distance_col='distance'
191
+ )
174
192
 
175
193
  unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
176
194
  unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
177
195
 
178
196
  matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
179
197
 
180
- # Group and filter data
198
+ # -----------------------
199
+ # 3. Merge with raw data
200
+ # -----------------------
181
201
  merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
182
- merged_df = merged_df[merged_df["geo"] != "(not set)"].drop(columns=['geo'])
183
- merged_df = merged_df.rename(columns={'ITV Region': 'geo', 'newUsers': 'response'})
184
202
 
185
- grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg({'response': 'sum', 'totalRevenue': 'sum'})
186
- filtered_df = grouped_df[grouped_df['geo'].isin(group1 + group2)].copy()
203
+ # Remove rows where geo is "(not set)"
204
+ merged_df = merged_df[merged_df["geo"] != "(not set)"]
205
+
206
+ # Replace 'geo' column with 'ITV Region'
207
+ # - We'll keep the "ITV Region" naming for clarity, but you can rename if you like.
208
+ merged_df = merged_df.drop(columns=['geo'])
209
+ merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
210
+
211
+ # -----------------------
212
+ # 4. Group and aggregate
213
+ # -----------------------
214
+ # Build the dictionary for aggregation: {col1: agg1, col2: agg2, ...}
215
+ aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
187
216
 
188
- assignment_map = {city: 1 for city in group1}
189
- assignment_map.update({city: 2 for city in group2})
217
+ # Perform the groupby operation
218
+ grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
219
+
220
+ # -----------------------
221
+ # 5. Filter for test & control groups
222
+ # -----------------------
223
+ filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
224
+
225
+ assignment_map = {city: 1 for city in test_group}
226
+ assignment_map.update({city: 2 for city in control_group})
190
227
  filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
191
228
 
192
- # Merge with media spend data
229
+ # -----------------------
230
+ # 6. Merge with media spend
231
+ # -----------------------
193
232
  media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
194
- analysis_df = pd.merge(filtered_df, media_spend_df, on=['date', 'geo'], how='left')
233
+
234
+ # Merge on date and geo
235
+ analysis_df = pd.merge(
236
+ filtered_df,
237
+ media_spend_df,
238
+ on=['date', 'geo'],
239
+ how='left'
240
+ )
241
+
242
+ # Fill missing cost with 0
195
243
  analysis_df['cost'] = analysis_df['cost'].fillna(0)
196
244
 
197
- # Save the final output
245
+ # -----------------------
246
+ # 7. Save to CSV
247
+ # -----------------------
198
248
  analysis_df.to_csv(output_path, index=False)
199
-
200
- return analysis_df
249
+
250
+ return analysis_df
201
251
 
202
- def process_city_analysis(self, raw_data, spend_data, output_path, group1, group2, response_column):
252
+ def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
203
253
  """
204
- Process city analysis by grouping data, analyzing user metrics, and merging with spend data.
254
+ Process city-level analysis by grouping data, applying custom aggregations,
255
+ and merging with spend data.
205
256
 
206
257
  Parameters:
207
- raw_data (str or pd.DataFrame): Raw input data as a file path (CSV/XLSX) or DataFrame.
208
- spend_data (str or pd.DataFrame): Spend data as a file path (CSV/XLSX) or DataFrame.
209
- output_path (str): Path to save the final output file (CSV or XLSX).
210
- group1 (list): List of city regions for group 1.
211
- group2 (list): List of city regions for group 2.
212
- response_column (str): Column name to be used as the response metric.
258
+ raw_data (str or pd.DataFrame):
259
+ - Raw input data as a file path (CSV/XLSX) or a DataFrame.
260
+ - Must contain 'date' and 'city' columns, plus any columns to be aggregated.
261
+ spend_data (str or pd.DataFrame):
262
+ - Spend data as a file path (CSV/XLSX) or a DataFrame.
263
+ - Must contain 'date', 'geo', and 'cost' columns.
264
+ output_path (str):
265
+ - Path to save the final output file (CSV or XLSX).
266
+ group1 (list):
267
+ - List of city regions to be considered "Test Group" or "Group 1".
268
+ group2 (list):
269
+ - List of city regions to be considered "Control Group" or "Group 2".
270
+ columns_to_aggregate (list):
271
+ - List of columns to apply aggregation to, e.g. ['newUsers', 'transactions'].
272
+ aggregator_list (list):
273
+ - List of corresponding aggregation functions, e.g. ['sum', 'mean'].
274
+ - Must be the same length as columns_to_aggregate.
213
275
 
214
276
  Returns:
215
- pd.DataFrame: Processed DataFrame.
277
+ pd.DataFrame: The final merged, aggregated DataFrame.
216
278
  """
217
- import pandas as pd
218
- import os
219
279
 
220
280
  def read_file(data):
221
281
  """Helper function to handle file paths or return DataFrame directly."""
@@ -239,39 +299,85 @@ class geoprocessing:
239
299
  else:
240
300
  raise ValueError("Unsupported file type. Please use a CSV or XLSX file.")
241
301
 
242
- # Read data
302
+ # -----------------------
303
+ # 1. Read and validate data
304
+ # -----------------------
243
305
  raw_df = read_file(raw_data)
244
306
  spend_df = read_file(spend_data)
245
307
 
246
- # Ensure necessary columns are present
247
- required_columns = {'date', 'city', response_column}
248
- if not required_columns.issubset(raw_df.columns):
249
- raise ValueError(f"Input DataFrame must contain the following columns: {required_columns}")
250
-
308
+ # Columns we minimally need in raw_df
309
+ required_columns = {'date', 'city'}
310
+ # Ensure the columns to aggregate are there
311
+ required_columns = required_columns.union(set(columns_to_aggregate))
312
+ missing_in_raw = required_columns - set(raw_df.columns)
313
+ if missing_in_raw:
314
+ raise ValueError(
315
+ f"The raw data is missing the following required columns: {missing_in_raw}"
316
+ )
317
+
318
+ # Validate spend data
251
319
  spend_required_columns = {'date', 'geo', 'cost'}
252
- if not spend_required_columns.issubset(spend_df.columns):
253
- raise ValueError(f"Spend DataFrame must contain the following columns: {spend_required_columns}")
254
-
320
+ missing_in_spend = spend_required_columns - set(spend_df.columns)
321
+ if missing_in_spend:
322
+ raise ValueError(
323
+ f"The spend data is missing the following required columns: {missing_in_spend}"
324
+ )
325
+
326
+ # -----------------------
327
+ # 2. Clean and prepare spend data
328
+ # -----------------------
255
329
  # Convert cost column to numeric after stripping currency symbols and commas
256
- spend_df['cost'] = spend_df['cost'].replace('[^\d.]', '', regex=True).astype(float)
257
-
258
- # Rename and process input DataFrame
259
- raw_df = raw_df.rename(columns={'city': 'geo', response_column: 'response'})
260
-
261
- # Filter and group data
262
- filtered_df = raw_df[raw_df['geo'].isin(group1 + group2)].copy()
263
-
264
- grouped_df = filtered_df.groupby(['date', 'geo'], as_index=False).agg({'response': 'sum'})
265
-
266
- assignment_map = {city: 1 for city in group1}
267
- assignment_map.update({city: 2 for city in group2})
330
+ spend_df['cost'] = (
331
+ spend_df['cost']
332
+ .replace('[^\\d.]', '', regex=True)
333
+ .astype(float)
334
+ )
335
+
336
+ # -----------------------
337
+ # 3. Prepare raw data
338
+ # -----------------------
339
+ # Rename 'city' to 'geo' for consistency
340
+ raw_df = raw_df.rename(columns={'city': 'geo'})
341
+
342
+ # Filter only the relevant geos
343
+ filtered_df = raw_df[raw_df['geo'].isin(test_group + control_group)].copy()
344
+
345
+ # -----------------------
346
+ # 4. Group and aggregate
347
+ # -----------------------
348
+ # Create a dictionary of {col: agg_function}
349
+ if len(columns_to_aggregate) != len(aggregator_list):
350
+ raise ValueError(
351
+ "columns_to_aggregate and aggregator_list must have the same length."
352
+ )
353
+ aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
354
+
355
+ # Perform groupby using the aggregator dictionary
356
+ grouped_df = filtered_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
357
+
358
+ # -----------------------
359
+ # 5. Map groups (Test vs. Control)
360
+ # -----------------------
361
+ assignment_map = {city: "Test Group" for city in test_group}
362
+ assignment_map.update({city: "Control Group" for city in control_group})
268
363
  grouped_df['assignment'] = grouped_df['geo'].map(assignment_map)
269
364
 
270
- # Merge with spend data
271
- merged_df = pd.merge(grouped_df, spend_df, on=['date', 'geo'], how='left')
365
+ # -----------------------
366
+ # 6. Merge with spend data
367
+ # -----------------------
368
+ merged_df = pd.merge(
369
+ grouped_df,
370
+ spend_df, # has date, geo, cost
371
+ on=['date', 'geo'],
372
+ how='left'
373
+ )
374
+
375
+ # Fill missing cost with 0
272
376
  merged_df['cost'] = merged_df['cost'].fillna(0)
273
377
 
274
- # Save the final output
378
+ # -----------------------
379
+ # 7. Write out results
380
+ # -----------------------
275
381
  write_file(merged_df, output_path)
276
382
 
277
- return merged_df
383
+ return merged_df