imsciences 0.9.5.6__tar.gz → 0.9.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of imsciences might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.6
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -1,6 +1,4 @@
 import pandas as pd
-import geopandas as gpd
-from shapely.geometry import Point
 from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import DateRange
 from google.analytics.data_v1beta.types import Dimension
@@ -15,6 +13,7 @@ import logging
 from datetime import datetime, timedelta
 import os
 import numpy as np
+from scipy.spatial import cKDTree

 class geoprocessing:

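Note on the dependency change: the two import hunks above swap the geopandas/shapely stack for scipy's cKDTree, so the nearest-region lookup no longer needs GeoDataFrames at all. A minimal sketch of the pattern the new code uses (the sample coordinates and region names below are illustrative, not taken from the package):

import pandas as pd
from scipy.spatial import cKDTree

# Hypothetical ITV region centroids (illustrative values)
itv_lut = pd.DataFrame({
    'ITV Region': ['London', 'Granada', 'Central'],
    'Latitude': [51.51, 53.48, 52.49],
    'Longitude': [-0.13, -2.24, -1.89],
})

# Hypothetical city coordinates to classify
cities = pd.DataFrame({
    'geo': ['Brighton', 'Liverpool'],
    'Latitude': [50.82, 53.41],
    'Longitude': [-0.14, -2.98],
})

# Build the KD-tree once, then resolve every point in a single vectorised query
tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)
distances, indices = tree.query(cities[['Latitude', 'Longitude']].values, k=1)
cities['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values

Both the old shapely points and the new KD-tree work on raw latitude/longitude values, so distances are Euclidean in degree space either way; at UK scale that is usually an acceptable approximation for nearest-centroid matching.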
@@ -137,117 +136,66 @@ class geoprocessing:
             logging.error(f"An unexpected error occurred: {e}")
             raise

-    def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
+    def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
         """
-        Process ITV analysis by mapping geos, grouping data, and merging with media spend.
-
+        Process the raw data by merging it with a city lookup table,
+        performing a spatial join to find the nearest ITV region,
+        automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
+        and assigning a numerical group based on provided test and control lists.
+
         Parameters:
-        raw_df (pd.DataFrame): Raw input data containing columns such as 'geo', plus any metrics to be aggregated.
-        itv_path (str): Path to the ITV regional mapping CSV file.
-        cities_path (str): Path to the Geo Mappings Excel file.
-        media_spend_path (str): Path to the media spend Excel file.
-        output_path (str): Path to save the final output CSV file.
-        group1 (list): List of geo regions for group 1.
-        group2 (list): List of geo regions for group 2.
-        columns_to_aggregate (list): List of columns in `raw_df` that need aggregation.
-        aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
-
+        raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
+            plus metric columns that should be summed.
+        city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
+        itv_lut (pd.DataFrame): ITV lookup table DataFrame with columns 'Latitude', 'Longitude', and 'ITV Region'
+            for spatial matching.
+        test_list (list): List of region names (strings) to be assigned the value 1.
+        control_list (list): List of region names (strings) to be assigned the value 2.
+
         Returns:
-        pd.DataFrame: The final merged and aggregated DataFrame.
+        pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
+            with the metric columns summed and an additional 'assignment' column.
         """
-
-        # -----------------------
-        # 1. Load and preprocess data
-        # -----------------------
-        itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
-        cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])

-        itv['geometry'] = itv.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-        cities['geometry'] = cities.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
+        # Ensure the ITV lookup table has valid coordinate data
+        itv_lut = itv_lut.dropna(subset=['Latitude', 'Longitude'])

-        itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
-        cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
+        # Merge raw_df with the city lookup table on 'geo'
+        merged_df = pd.merge(raw_df, city_lut, on='geo', how='left')

-        # -----------------------
-        # 2. Perform spatial join to match geos
-        # -----------------------
-        joined_gdf = gpd.sjoin_nearest(
-            itv_gdf,
-            cities_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-        matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
+        # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
+        tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)

-        # Handle unmatched geos
-        unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
-        unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
+        # For each record in merged_df, find the nearest ITV region based on coordinates
+        distances, indices = tree.query(merged_df[['Latitude', 'Longitude']].values, k=1)

-        nearest_unmatched_gdf = gpd.sjoin_nearest(
-            unmatched_cities_gdf,
-            itv_gdf,
-            how='inner',
-            distance_col='distance'
-        )
+        # Map the nearest ITV Region back to merged_df
+        merged_df['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values

-        unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
-        unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
+        # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
+        metric_cols = [col for col in raw_df.columns if col not in ['date', 'geo']]

-        matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
+        # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
+        aggregated_df = merged_df.groupby(['date', 'ITV Region'], as_index=False)[metric_cols].sum()

-        # -----------------------
-        # 3. Merge with raw data
-        # -----------------------
-        merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
-
-        # Remove rows where geo is "(not set)"
-        merged_df = merged_df[merged_df["geo"] != "(not set)"]
-
-        # Replace 'geo' column with 'ITV Region'
-        # - We'll keep the "ITV Region" naming for clarity, but you can rename if you like.
-        merged_df = merged_df.drop(columns=['geo'])
-        merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
+        # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
+        aggregated_df.rename(columns={'ITV Region': 'geo'}, inplace=True)

-        # -----------------------
-        # 4. Group and aggregate
-        # -----------------------
-        # Build the dictionary for aggregation: {col1: agg1, col2: agg2, ...}
-        aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
-
-        # Perform the groupby operation
-        grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
-
-        # -----------------------
-        # 5. Filter for test & control groups
-        # -----------------------
-        filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
-
-        assignment_map = {city: 1 for city in test_group}
-        assignment_map.update({city: 2 for city in control_group})
-        filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
-
-        # -----------------------
-        # 6. Merge with media spend
-        # -----------------------
-        media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
-
-        # Merge on date and geo
-        analysis_df = pd.merge(
-            filtered_df,
-            media_spend_df,
-            on=['date', 'geo'],
-            how='left'
-        )
-
-        # Fill missing cost with 0
-        analysis_df['cost'] = analysis_df['cost'].fillna(0)
+        # Define a function to assign group values based on the region name
+        def assign_value(region):
+            if region in test_list:
+                return 1
+            elif region in control_list:
+                return 2
+            else:
+                return np.nan  # Or another default value if desired

-        # -----------------------
-        # 7. Save to CSV
-        # -----------------------
-        analysis_df.to_csv(output_path, index=False)
+        # Apply the assignment function and remove rows without a valid assignment
+        aggregated_df['assignment'] = aggregated_df['geo'].apply(assign_value)
+        aggregated_df.dropna(subset=['assignment'], inplace=True)
+        aggregated_df['assignment'] = aggregated_df['assignment'].astype(int)

-        return analysis_df
+        return aggregated_df

     def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
@@ -303,10 +251,11 @@ class geoprocessing:
         # 1. Read and validate data
         # -----------------------
         raw_df = read_file(raw_data)
-        spend_df = read_file(spend_data)
+        raw_df = raw_df.rename(columns={'city': 'geo'})
+        spend_df = read_file(spend_data).rename(columns={'Cost': 'cost'})

         # Columns we minimally need in raw_df
-        required_columns = {'date', 'city'}
+        required_columns = {'date', 'geo'}
         # Ensure the columns to aggregate are there
         required_columns = required_columns.union(set(columns_to_aggregate))
         missing_in_raw = required_columns - set(raw_df.columns)
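With the 'city' → 'geo' and 'Cost' → 'cost' renames moved to read time, the required-column check runs against the final column names immediately. What happens when missing_in_raw is non-empty sits outside this hunk; a plausible sketch of the check, with the error handling invented:

required_columns = {'date', 'geo'}.union(set(columns_to_aggregate))
missing_in_raw = required_columns - set(raw_df.columns)
if missing_in_raw:
    # Hypothetical handling; the package's actual behaviour is defined below this hunk
    raise ValueError(f"raw_df is missing required columns: {missing_in_raw}")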
@@ -336,12 +285,8 @@ class geoprocessing:
         # -----------------------
         # 3. Prepare raw data
         # -----------------------
-        # Rename 'city' to 'geo' for consistency
-        raw_df = raw_df.rename(columns={'city': 'geo'})
-
         # Filter only the relevant geos
         filtered_df = raw_df[raw_df['geo'].isin(test_group + control_group)].copy()
-
         # -----------------------
         # 4. Group and aggregate
         # -----------------------
@@ -357,9 +302,9 @@ class geoprocessing:

         # -----------------------
         # 5. Map groups (Test vs. Control)
-        # -----------------------
-        assignment_map = {city: "Test Group" for city in test_group}
-        assignment_map.update({city: "Control Group" for city in control_group})
+        # -----------------------
+        assignment_map = {city: 1 for city in test_group}
+        assignment_map.update({city: 2 for city in control_group})
         grouped_df['assignment'] = grouped_df['geo'].map(assignment_map)

         # -----------------------
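The substantive change in this hunk is that process_city_analysis now labels groups with the numeric codes 1 and 2 instead of the strings "Test Group" and "Control Group", matching what the rewritten process_itv_analysis returns. The resulting map, using hypothetical geos:

test_group = ['Brighton']      # hypothetical
control_group = ['Liverpool']  # hypothetical
assignment_map = {city: 1 for city in test_group}
assignment_map.update({city: 2 for city in control_group})
# assignment_map == {'Brighton': 1, 'Liverpool': 2}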
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.6
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
             return f.read()
     return ''

-VERSION = '0.9.5.6'
+VERSION = '0.9.5.8'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')

3 files without changes