imsciences 0.9.5.7__tar.gz → 0.9.5.8__tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of imsciences might be problematic.
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/PKG-INFO +1 -1
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences/geo.py +46 -98
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences.egg-info/PKG-INFO +1 -1
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/setup.py +1 -1
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/LICENSE.txt +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/README.md +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences/__init__.py +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences/mmm.py +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences/pull.py +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences/unittesting.py +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences/vis.py +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences.egg-info/SOURCES.txt +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.9.5.7 → imsciences-0.9.5.8}/setup.cfg +0 -0
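In short: the substantive change is confined to imsciences/geo.py (+46 −98), where process_itv_analysis is rewritten; the one-line edits to PKG-INFO and setup.py are the version bump, and all other files are carried over unchanged.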
imsciences/geo.py

```diff
@@ -1,6 +1,4 @@
 import pandas as pd
-import geopandas as gpd
-from shapely.geometry import Point
 from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import DateRange
 from google.analytics.data_v1beta.types import Dimension
```
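The geopandas and shapely imports are removed because the GeoDataFrame-based nearest join they supported is dropped; the hunks below replace it with a SciPy KD-tree lookup.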
```diff
@@ -15,6 +13,7 @@ import logging
 from datetime import datetime, timedelta
 import os
 import numpy as np
+from scipy.spatial import cKDTree
 
 class geoprocessing:
 
```
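To make the new dependency concrete, here is a minimal, self-contained sketch of the cKDTree nearest-neighbour pattern the rewritten method (next hunk) relies on. The coordinates and region names below are invented for illustration; they are not taken from the package.

```python
# Minimal sketch of nearest-region matching with scipy's cKDTree.
# All data here is hypothetical illustration, not package data.
import pandas as pd
from scipy.spatial import cKDTree

# Hypothetical region lookup table (analogous to itv_lut in the diff)
regions = pd.DataFrame({
    "Latitude":  [51.5074, 53.4808],
    "Longitude": [-0.1278, -2.2426],
    "ITV Region": ["London", "Granada"],
})

# Hypothetical city coordinates (analogous to the merged city data)
cities = pd.DataFrame({
    "Latitude":  [51.75, 53.48],
    "Longitude": [-0.34, -2.24],
})

# Build the KD-tree once over region coordinates, then query each city
tree = cKDTree(regions[["Latitude", "Longitude"]].values)
distances, indices = tree.query(cities[["Latitude", "Longitude"]].values, k=1)

# Map each city to its nearest region by positional index
cities["ITV Region"] = regions.iloc[indices]["ITV Region"].values
print(cities)
```

One caveat worth noting: the tree measures straight-line distance in raw degree space, not geodesic distance. Over an area the size of the UK that approximation is usually acceptable for nearest-region assignment, and it avoids the geopandas dependency, but it would degrade at high latitudes or over large extents.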
```diff
@@ -137,117 +136,66 @@ class geoprocessing:
             logging.error(f"An unexpected error occurred: {e}")
             raise
 
-    def process_itv_analysis(self, raw_df,
+    def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
         """
-        Process
-
+        Process the raw data by merging it with a city lookup table,
+        performing a spatial join to find the nearest ITV region,
+        automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
+        and assigning a numerical group based on provided test and control lists.
+
         Parameters:
-        raw_df (pd.DataFrame): Raw
-
-
-
-
-
-
-
-        aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
-
+        raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
+                               plus metric columns that should be summed.
+        city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
+        itv_lut (pd.DataFrame): ITV lookup table DataFrame with columns 'Latitude', 'Longitude', and 'ITV Region'
+                                for spatial matching.
+        test_list (list): List of region names (strings) to be assigned the value 1.
+        control_list (list): List of region names (strings) to be assigned the value 2.
+
         Returns:
-        pd.DataFrame:
+        pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
+                      with the metric columns summed and an additional 'assignment' column.
         """
-
-        # -----------------------
-        # 1. Load and preprocess data
-        # -----------------------
-        itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
-        cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])
-
-        itv['geometry'] = itv.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-        cities['geometry'] = cities.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-
-        itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
-        cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
-
-        # -----------------------
-        # 2. Perform spatial join to match geos
-        # -----------------------
-        joined_gdf = gpd.sjoin_nearest(
-            itv_gdf,
-            cities_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-        matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
-
-        # Handle unmatched geos
-        unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
-        unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
-
-        nearest_unmatched_gdf = gpd.sjoin_nearest(
-            unmatched_cities_gdf,
-            itv_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-
-        unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
-        unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
-
-        matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
-
-        # -----------------------
-        # 3. Merge with raw data
-        # -----------------------
-        merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
 
-        #
-
+        # Ensure the ITV lookup table has valid coordinate data
+        itv_lut = itv_lut.dropna(subset=['Latitude', 'Longitude'])
 
-        #
-
-        merged_df = merged_df.drop(columns=['geo'])
-        merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
+        # Merge raw_df with the city lookup table on 'geo'
+        merged_df = pd.merge(raw_df, city_lut, on='geo', how='left')
 
-        #
-
-        # -----------------------
-        # Build the dictionary for aggregation: {col1: agg1, col2: agg2, ...}
-        aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
+        # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
+        tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)
 
-        #
-
+        # For each record in merged_df, find the nearest ITV region based on coordinates
+        distances, indices = tree.query(merged_df[['Latitude', 'Longitude']].values, k=1)
 
-        #
-
-        # -----------------------
-        filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
+        # Map the nearest ITV Region back to merged_df
+        merged_df['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values
 
-
-
-        filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
+        # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
+        metric_cols = [col for col in raw_df.columns if col not in ['date', 'geo']]
 
-        #
-
-        # -----------------------
-        media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
+        # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
+        aggregated_df = merged_df.groupby(['date', 'ITV Region'], as_index=False)[metric_cols].sum()
 
-        #
-
-            filtered_df,
-            media_spend_df,
-            on=['date', 'geo'],
-            how='left'
-        )
+        # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
+        aggregated_df.rename(columns={'ITV Region': 'geo'}, inplace=True)
 
-        #
-
+        # Define a function to assign group values based on the region name
+        def assign_value(region):
+            if region in test_list:
+                return 1
+            elif region in control_list:
+                return 2
+            else:
+                return np.nan  # Or another default value if desired
 
-        #
-
-
-
+        # Apply the assignment function and remove rows without a valid assignment
+        aggregated_df['assignment'] = aggregated_df['geo'].apply(assign_value)
+        aggregated_df.dropna(subset=['assignment'], inplace=True)
+        aggregated_df['assignment'] = aggregated_df['assignment'].astype(int)
 
-        return
+        return aggregated_df
 
     def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
```
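For reference, a hypothetical call against the new signature might look like the sketch below. The DataFrames are invented toy data, the import path is inferred from the package layout, and it is assumed that geoprocessing can be instantiated without arguments.

```python
# Hypothetical usage of the new process_itv_analysis signature.
# All DataFrame contents are invented toy data.
import pandas as pd
from imsciences.geo import geoprocessing

raw_df = pd.DataFrame({
    "date": ["2024-01-01", "2024-01-01"],
    "geo": ["london", "manchester"],
    "sessions": [120, 80],          # metric column: summed automatically
})
city_lut = pd.DataFrame({
    "geo": ["london", "manchester"],
    "Latitude": [51.5074, 53.4808],
    "Longitude": [-0.1278, -2.2426],
})
itv_lut = pd.DataFrame({
    "Latitude": [51.5, 53.5],
    "Longitude": [-0.1, -2.2],
    "ITV Region": ["London", "Granada"],
})

gp = geoprocessing()  # assumed no-arg constructor
result = gp.process_itv_analysis(
    raw_df, city_lut, itv_lut,
    test_list=["London"],      # these regions get assignment = 1
    control_list=["Granada"],  # these regions get assignment = 2
)
print(result)
```

Per the new code, the returned frame has one row per ('date', 'geo') pair, with the metric columns summed and an integer assignment column; rows whose region appears in neither list are dropped.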