imsciences 0.9.5.7__tar.gz → 0.9.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of imsciences might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.7
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -1,6 +1,4 @@
 import pandas as pd
-import geopandas as gpd
-from shapely.geometry import Point
 from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import DateRange
 from google.analytics.data_v1beta.types import Dimension
@@ -15,6 +13,7 @@ import logging
 from datetime import datetime, timedelta
 import os
 import numpy as np
+from scipy.spatial import cKDTree

 class geoprocessing:

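The import changes above are the crux of this release: the geopandas/shapely dependency is dropped in favour of scipy's cKDTree. A minimal sketch of the nearest-neighbour matching the rewritten method performs, with made-up coordinates standing in for the real lookup tables; note that cKDTree measures Euclidean distance in raw degrees here, not true geodesic distance:

    import pandas as pd
    from scipy.spatial import cKDTree

    # Made-up lookup tables; the real ones are supplied by the caller.
    city_lut = pd.DataFrame({'geo': ['London', 'Leeds'],
                             'Latitude': [51.51, 53.80],
                             'Longitude': [-0.13, -1.55]})
    itv_lut = pd.DataFrame({'ITV Region': ['London', 'Yorkshire'],
                            'Latitude': [51.50, 53.90],
                            'Longitude': [-0.10, -1.40]})

    # KD-tree over the ITV coordinates; query() returns, for each city,
    # the distance to and index of the nearest ITV row.
    tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)
    distances, indices = tree.query(city_lut[['Latitude', 'Longitude']].values, k=1)
    city_lut['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values
    print(city_lut)  # each city now carries its nearest ITV Region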
@@ -137,117 +136,66 @@ class geoprocessing:
             logging.error(f"An unexpected error occurred: {e}")
             raise

-    def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
+    def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
         """
-        Process ITV analysis by mapping geos, grouping data, and merging with media spend.
-
+        Process the raw data by merging it with a city lookup table,
+        performing a spatial join to find the nearest ITV region,
+        automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
+        and assigning a numerical group based on provided test and control lists.
+
         Parameters:
-        raw_df (pd.DataFrame): Raw input data containing columns such as 'geo', plus any metrics to be aggregated.
-        itv_path (str): Path to the ITV regional mapping CSV file.
-        cities_path (str): Path to the Geo Mappings Excel file.
-        media_spend_path (str): Path to the media spend Excel file.
-        output_path (str): Path to save the final output CSV file.
-        group1 (list): List of geo regions for group 1.
-        group2 (list): List of geo regions for group 2.
-        columns_to_aggregate (list): List of columns in `raw_df` that need aggregation.
-        aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
-
+        raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
+                               plus metric columns that should be summed.
+        city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
+        itv_lut (pd.DataFrame): ITV lookup table DataFrame with columns 'Latitude', 'Longitude', and 'ITV Region'
+                                for spatial matching.
+        test_list (list): List of region names (strings) to be assigned the value 1.
+        control_list (list): List of region names (strings) to be assigned the value 2.
+
         Returns:
-        pd.DataFrame: The final merged and aggregated DataFrame.
+        pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
+                      with the metric columns summed and an additional 'assignment' column.
         """
-
-        # -----------------------
-        # 1. Load and preprocess data
-        # -----------------------
-        itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
-        cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])
-
-        itv['geometry'] = itv.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-        cities['geometry'] = cities.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-
-        itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
-        cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
-
-        # -----------------------
-        # 2. Perform spatial join to match geos
-        # -----------------------
-        joined_gdf = gpd.sjoin_nearest(
-            itv_gdf,
-            cities_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-        matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
-
-        # Handle unmatched geos
-        unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
-        unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
-
-        nearest_unmatched_gdf = gpd.sjoin_nearest(
-            unmatched_cities_gdf,
-            itv_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-
-        unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
-        unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
-
-        matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
-
-        # -----------------------
-        # 3. Merge with raw data
-        # -----------------------
-        merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')

-        # Remove rows where geo is "(not set)"
-        merged_df = merged_df[merged_df["geo"] != "(not set)"]
+        # Ensure the ITV lookup table has valid coordinate data
+        itv_lut = itv_lut.dropna(subset=['Latitude', 'Longitude'])

-        # Replace 'geo' column with 'ITV Region'
-        # - We'll keep the "ITV Region" naming for clarity, but you can rename if you like.
-        merged_df = merged_df.drop(columns=['geo'])
-        merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
+        # Merge raw_df with the city lookup table on 'geo'
+        merged_df = pd.merge(raw_df, city_lut, on='geo', how='left')

-        # -----------------------
-        # 4. Group and aggregate
-        # -----------------------
-        # Build the dictionary for aggregation: {col1: agg1, col2: agg2, ...}
-        aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
+        # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
+        tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)

-        # Perform the groupby operation
-        grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
+        # For each record in merged_df, find the nearest ITV region based on coordinates
+        distances, indices = tree.query(merged_df[['Latitude', 'Longitude']].values, k=1)

-        # -----------------------
-        # 5. Filter for test & control groups
-        # -----------------------
-        filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
+        # Map the nearest ITV Region back to merged_df
+        merged_df['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values

-        assignment_map = {city: 1 for city in test_group}
-        assignment_map.update({city: 2 for city in control_group})
-        filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
+        # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
+        metric_cols = [col for col in raw_df.columns if col not in ['date', 'geo']]

-        # -----------------------
-        # 6. Merge with media spend
-        # -----------------------
-        media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
+        # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
+        aggregated_df = merged_df.groupby(['date', 'ITV Region'], as_index=False)[metric_cols].sum()

-        # Merge on date and geo
-        analysis_df = pd.merge(
-            filtered_df,
-            media_spend_df,
-            on=['date', 'geo'],
-            how='left'
-        )
+        # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
+        aggregated_df.rename(columns={'ITV Region': 'geo'}, inplace=True)

-        # Fill missing cost with 0
-        analysis_df['cost'] = analysis_df['cost'].fillna(0)
+        # Define a function to assign group values based on the region name
+        def assign_value(region):
+            if region in test_list:
+                return 1
+            elif region in control_list:
+                return 2
+            else:
+                return np.nan  # Or another default value if desired

-        # -----------------------
-        # 7. Save to CSV
-        # -----------------------
-        analysis_df.to_csv(output_path, index=False)
+        # Apply the assignment function and remove rows without a valid assignment
+        aggregated_df['assignment'] = aggregated_df['geo'].apply(assign_value)
+        aggregated_df.dropna(subset=['assignment'], inplace=True)
+        aggregated_df['assignment'] = aggregated_df['assignment'].astype(int)

-        return analysis_df
+        return aggregated_df

     def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.7
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
         return f.read()
     return ''

-VERSION = '0.9.5.7'
+VERSION = '0.9.5.8'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')

3 files without changes