imsciences 0.9.5.6__tar.gz → 0.9.5.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of imsciences might be problematic.
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/PKG-INFO +1 -1
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences/geo.py +52 -107
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences.egg-info/PKG-INFO +1 -1
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/setup.py +1 -1
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/LICENSE.txt +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/README.md +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences/__init__.py +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences/mmm.py +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences/pull.py +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences/unittesting.py +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences/vis.py +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences.egg-info/SOURCES.txt +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.9.5.6 → imsciences-0.9.5.8}/setup.cfg +0 -0
--- imsciences-0.9.5.6/imsciences/geo.py
+++ imsciences-0.9.5.8/imsciences/geo.py
@@ -1,6 +1,4 @@
 import pandas as pd
-import geopandas as gpd
-from shapely.geometry import Point
 from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import DateRange
 from google.analytics.data_v1beta.types import Dimension
@@ -15,6 +13,7 @@ import logging
 from datetime import datetime, timedelta
 import os
 import numpy as np
+from scipy.spatial import cKDTree

 class geoprocessing:

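The import changes summarize this release: the geopandas/shapely spatial join in geo.py is replaced by a SciPy KD-tree nearest-neighbour lookup, dropping two heavyweight GIS dependencies. A minimal sketch of the technique, with illustrative region names and coordinates that are not taken from the package:

```python
import pandas as pd
from scipy.spatial import cKDTree

# Hypothetical lookup table of region reference points (invented values)
regions = pd.DataFrame({
    'ITV Region': ['London', 'Granada'],
    'Latitude':   [51.51, 53.48],
    'Longitude':  [-0.13, -2.24],
})

# Hypothetical points to classify
points = pd.DataFrame({'Latitude': [51.50, 53.41], 'Longitude': [-0.12, -2.99]})

# Build the tree once over the lookup coordinates, then query each point for
# its single nearest neighbour (k=1); query returns (distances, indices).
tree = cKDTree(regions[['Latitude', 'Longitude']].values)
distances, indices = tree.query(points[['Latitude', 'Longitude']].values, k=1)
points['ITV Region'] = regions.iloc[indices]['ITV Region'].values
print(points)
```

One trade-off worth noting: a KD-tree over raw latitude/longitude measures planar Euclidean distance in degrees rather than geodesic distance, so the match is an approximation; for coarse region assignment that is usually adequate, but it is not a true great-circle nearest neighbour.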
@@ -137,117 +136,66 @@ class geoprocessing:
             logging.error(f"An unexpected error occurred: {e}")
             raise

-    def process_itv_analysis(self, raw_df,
+    def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
         """
-        Process
-
+        Process the raw data by merging it with a city lookup table,
+        performing a spatial join to find the nearest ITV region,
+        automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
+        and assigning a numerical group based on provided test and control lists.
+
         Parameters:
-        raw_df (pd.DataFrame): Raw
-
-
-
-
-
-
-
-        aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
-
+        raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
+                               plus metric columns that should be summed.
+        city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
+        itv_lut (pd.DataFrame): ITV lookup table DataFrame with columns 'Latitude', 'Longitude', and 'ITV Region'
+                                for spatial matching.
+        test_list (list): List of region names (strings) to be assigned the value 1.
+        control_list (list): List of region names (strings) to be assigned the value 2.
+
         Returns:
-        pd.DataFrame:
+        pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
+                      with the metric columns summed and an additional 'assignment' column.
         """
-
-        # -----------------------
-        # 1. Load and preprocess data
-        # -----------------------
-        itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
-        cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])

-
-
+        # Ensure the ITV lookup table has valid coordinate data
+        itv_lut = itv_lut.dropna(subset=['Latitude', 'Longitude'])

-
-
+        # Merge raw_df with the city lookup table on 'geo'
+        merged_df = pd.merge(raw_df, city_lut, on='geo', how='left')

-        #
-
-        # -----------------------
-        joined_gdf = gpd.sjoin_nearest(
-            itv_gdf,
-            cities_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-        matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
+        # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
+        tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)

-        #
-
-        unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
+        # For each record in merged_df, find the nearest ITV region based on coordinates
+        distances, indices = tree.query(merged_df[['Latitude', 'Longitude']].values, k=1)

-
-
-            itv_gdf,
-            how='inner',
-            distance_col='distance'
-        )
+        # Map the nearest ITV Region back to merged_df
+        merged_df['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values

-
-
+        # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
+        metric_cols = [col for col in raw_df.columns if col not in ['date', 'geo']]

-
+        # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
+        aggregated_df = merged_df.groupby(['date', 'ITV Region'], as_index=False)[metric_cols].sum()

-        #
-
-        # -----------------------
-        merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
-
-        # Remove rows where geo is "(not set)"
-        merged_df = merged_df[merged_df["geo"] != "(not set)"]
-
-        # Replace 'geo' column with 'ITV Region'
-        # - We'll keep the "ITV Region" naming for clarity, but you can rename if you like.
-        merged_df = merged_df.drop(columns=['geo'])
-        merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
+        # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
+        aggregated_df.rename(columns={'ITV Region': 'geo'}, inplace=True)

-        #
-
-
-
-
-
-
-
-
-        # -----------------------
-        # 5. Filter for test & control groups
-        # -----------------------
-        filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
-
-        assignment_map = {city: 1 for city in test_group}
-        assignment_map.update({city: 2 for city in control_group})
-        filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
-
-        # -----------------------
-        # 6. Merge with media spend
-        # -----------------------
-        media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
-
-        # Merge on date and geo
-        analysis_df = pd.merge(
-            filtered_df,
-            media_spend_df,
-            on=['date', 'geo'],
-            how='left'
-        )
-
-        # Fill missing cost with 0
-        analysis_df['cost'] = analysis_df['cost'].fillna(0)
+        # Define a function to assign group values based on the region name
+        def assign_value(region):
+            if region in test_list:
+                return 1
+            elif region in control_list:
+                return 2
+            else:
+                return np.nan  # Or another default value if desired

-        #
-
-
-
+        # Apply the assignment function and remove rows without a valid assignment
+        aggregated_df['assignment'] = aggregated_df['geo'].apply(assign_value)
+        aggregated_df.dropna(subset=['assignment'], inplace=True)
+        aggregated_df['assignment'] = aggregated_df['assignment'].astype(int)

-        return
+        return aggregated_df

     def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
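Under the new signature, process_itv_analysis takes pre-loaded DataFrames instead of file paths and no longer merges media spend itself. A hypothetical call, assuming geoprocessing() needs no constructor arguments and that the class is importable from imsciences.geo; every value below is invented for illustration:

```python
import pandas as pd
from imsciences.geo import geoprocessing  # assumed import path

gp = geoprocessing()

raw_df = pd.DataFrame({
    'date': ['2024-01-01', '2024-01-01'],
    'geo':  ['london', 'manchester'],
    'sessions': [120, 80],  # any column besides 'date'/'geo' is summed
})
city_lut = pd.DataFrame({
    'geo': ['london', 'manchester'],
    'Latitude': [51.51, 53.48],
    'Longitude': [-0.13, -2.24],
})
itv_lut = pd.DataFrame({
    'ITV Region': ['London', 'Granada'],
    'Latitude': [51.51, 53.48],
    'Longitude': [-0.13, -2.24],
})

result = gp.process_itv_analysis(
    raw_df, city_lut, itv_lut,
    test_list=['London'],      # assignment == 1
    control_list=['Granada'],  # assignment == 2
)
```

Because assign_value returns NaN for regions in neither list and the result is then filtered with dropna, any geo outside the two groups is silently dropped from the returned frame.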
@@ -303,10 +251,11 @@ class geoprocessing:
         # 1. Read and validate data
         # -----------------------
         raw_df = read_file(raw_data)
-
+        raw_df = raw_df.rename(columns={'city': 'geo'})
+        spend_df = read_file(spend_data).rename(columns={'Cost': 'cost'})

         # Columns we minimally need in raw_df
-        required_columns = {'date', '
+        required_columns = {'date', 'geo'}
         # Ensure the columns to aggregate are there
         required_columns = required_columns.union(set(columns_to_aggregate))
         missing_in_raw = required_columns - set(raw_df.columns)
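A behavioral tweak in this hunk: the 'city' to 'geo' rename now happens immediately after reading, before the required-column check, so raw inputs that arrive with a 'city' column satisfy the 'geo' requirement at validation time. The set-difference check itself is easy to exercise standalone; the column names below are invented:

```python
import pandas as pd

raw_df = pd.DataFrame(columns=['date', 'geo', 'sessions'])
columns_to_aggregate = ['sessions', 'revenue']  # 'revenue' deliberately missing

# Required columns = the minimal set plus everything the caller wants aggregated
required_columns = {'date', 'geo'}.union(columns_to_aggregate)
missing_in_raw = required_columns - set(raw_df.columns)
print(missing_in_raw)  # {'revenue'} -> a non-empty set signals invalid input
```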
@@ -336,12 +285,8 @@ class geoprocessing:
         # -----------------------
         # 3. Prepare raw data
         # -----------------------
-        # Rename 'city' to 'geo' for consistency
-        raw_df = raw_df.rename(columns={'city': 'geo'})
-
         # Filter only the relevant geos
         filtered_df = raw_df[raw_df['geo'].isin(test_group + control_group)].copy()
-
         # -----------------------
         # 4. Group and aggregate
         # -----------------------
@@ -357,9 +302,9 @@ class geoprocessing:

         # -----------------------
         # 5. Map groups (Test vs. Control)
-        # -----------------------
-        assignment_map = {city:
-        assignment_map.update({city:
+        # -----------------------
+        assignment_map = {city: 1 for city in test_group}
+        assignment_map.update({city: 2 for city in control_group})
         grouped_df['assignment'] = grouped_df['geo'].map(assignment_map)

         # -----------------------
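The group assignment in process_city_analysis is a plain dict built from the two group lists and applied with Series.map. A self-contained sketch of the pattern, with invented group names:

```python
import pandas as pd

test_group = ['london']
control_group = ['manchester']

# 1 = test, 2 = control; geos in neither list map to NaN
assignment_map = {city: 1 for city in test_group}
assignment_map.update({city: 2 for city in control_group})

grouped_df = pd.DataFrame({'geo': ['london', 'manchester', 'leeds']})
grouped_df['assignment'] = grouped_df['geo'].map(assignment_map)
print(grouped_df)  # 'leeds' gets NaN because it is in neither group
```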