imsciences 0.9.5.7__tar.gz → 0.9.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of imsciences might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.7
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -1,6 +1,4 @@
 import pandas as pd
-import geopandas as gpd
-from shapely.geometry import Point
 from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import DateRange
 from google.analytics.data_v1beta.types import Dimension
@@ -15,6 +13,7 @@ import logging
 from datetime import datetime, timedelta
 import os
 import numpy as np
+from scipy.spatial import cKDTree

 class geoprocessing:

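The import changes above are the crux of this release: the geopandas/shapely dependency is dropped in favour of scipy's cKDTree. A minimal sketch of the nearest-neighbour matching the rewritten method performs, with made-up coordinates standing in for the real lookup tables; note that cKDTree measures Euclidean distance in raw degrees here, not true geodesic distance:

    import pandas as pd
    from scipy.spatial import cKDTree

    # Made-up lookup tables; the real ones are supplied by the caller.
    city_lut = pd.DataFrame({'geo': ['London', 'Leeds'],
                             'Latitude': [51.51, 53.80],
                             'Longitude': [-0.13, -1.55]})
    itv_lut = pd.DataFrame({'ITV Region': ['London', 'Yorkshire'],
                            'Latitude': [51.50, 53.90],
                            'Longitude': [-0.10, -1.40]})

    # KD-tree over the ITV coordinates; query() returns, for each city,
    # the distance to and index of the nearest ITV row.
    tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)
    distances, indices = tree.query(city_lut[['Latitude', 'Longitude']].values, k=1)
    city_lut['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values
    print(city_lut)  # each city now carries its nearest ITV Region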
@@ -137,117 +136,66 @@ class geoprocessing:
             logging.error(f"An unexpected error occurred: {e}")
             raise

-    def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
+    def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
         """
-        Process ITV analysis by mapping geos, grouping data, and merging with media spend.
-
+        Process the raw data by merging it with a city lookup table,
+        performing a spatial join to find the nearest ITV region,
+        automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
+        and assigning a numerical group based on provided test and control lists.
+
         Parameters:
-        raw_df (pd.DataFrame): Raw input data containing columns such as 'geo', plus any metrics to be aggregated.
-        itv_path (str): Path to the ITV regional mapping CSV file.
-        cities_path (str): Path to the Geo Mappings Excel file.
-        media_spend_path (str): Path to the media spend Excel file.
-        output_path (str): Path to save the final output CSV file.
-        group1 (list): List of geo regions for group 1.
-        group2 (list): List of geo regions for group 2.
-        columns_to_aggregate (list): List of columns in `raw_df` that need aggregation.
-        aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
-
+        raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
+                               plus metric columns that should be summed.
+        city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
+        itv_lut (pd.DataFrame): ITV lookup table DataFrame with columns 'Latitude', 'Longitude', and 'ITV Region'
+                                for spatial matching.
+        test_list (list): List of region names (strings) to be assigned the value 1.
+        control_list (list): List of region names (strings) to be assigned the value 2.
+
         Returns:
-        pd.DataFrame: The final merged and aggregated DataFrame.
+        pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
+                      with the metric columns summed and an additional 'assignment' column.
         """
-
-        # -----------------------
-        # 1. Load and preprocess data
-        # -----------------------
-        itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
-        cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])
-
-        itv['geometry'] = itv.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-        cities['geometry'] = cities.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-
-        itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
-        cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
-
-        # -----------------------
-        # 2. Perform spatial join to match geos
-        # -----------------------
-        joined_gdf = gpd.sjoin_nearest(
-            itv_gdf,
-            cities_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-        matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
-
-        # Handle unmatched geos
-        unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
-        unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
-
-        nearest_unmatched_gdf = gpd.sjoin_nearest(
-            unmatched_cities_gdf,
-            itv_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-
-        unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
-        unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
-
-        matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
-
-        # -----------------------
-        # 3. Merge with raw data
-        # -----------------------
-        merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')

-        # Remove rows where geo is "(not set)"
-        merged_df = merged_df[merged_df["geo"] != "(not set)"]
+        # Ensure the ITV lookup table has valid coordinate data
+        itv_lut = itv_lut.dropna(subset=['Latitude', 'Longitude'])

-        # Replace 'geo' column with 'ITV Region'
-        # - We'll keep the "ITV Region" naming for clarity, but you can rename if you like.
-        merged_df = merged_df.drop(columns=['geo'])
-        merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
+        # Merge raw_df with the city lookup table on 'geo'
+        merged_df = pd.merge(raw_df, city_lut, on='geo', how='left')

-        # -----------------------
-        # 4. Group and aggregate
-        # -----------------------
-        # Build the dictionary for aggregation: {col1: agg1, col2: agg2, ...}
-        aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
+        # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
+        tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)

-        # Perform the groupby operation
-        grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
+        # For each record in merged_df, find the nearest ITV region based on coordinates
+        distances, indices = tree.query(merged_df[['Latitude', 'Longitude']].values, k=1)

-        # -----------------------
-        # 5. Filter for test & control groups
-        # -----------------------
-        filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
+        # Map the nearest ITV Region back to merged_df
+        merged_df['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values

-        assignment_map = {city: 1 for city in test_group}
-        assignment_map.update({city: 2 for city in control_group})
-        filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
+        # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
+        metric_cols = [col for col in raw_df.columns if col not in ['date', 'geo']]

-        # -----------------------
-        # 6. Merge with media spend
-        # -----------------------
-        media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
+        # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
+        aggregated_df = merged_df.groupby(['date', 'ITV Region'], as_index=False)[metric_cols].sum()

-        # Merge on date and geo
-        analysis_df = pd.merge(
-            filtered_df,
-            media_spend_df,
-            on=['date', 'geo'],
-            how='left'
-        )
+        # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
+        aggregated_df.rename(columns={'ITV Region': 'geo'}, inplace=True)

-        # Fill missing cost with 0
-        analysis_df['cost'] = analysis_df['cost'].fillna(0)
+        # Define a function to assign group values based on the region name
+        def assign_value(region):
+            if region in test_list:
+                return 1
+            elif region in control_list:
+                return 2
+            else:
+                return np.nan  # Or another default value if desired

-        # -----------------------
-        # 7. Save to CSV
-        # -----------------------
-        analysis_df.to_csv(output_path, index=False)
+        # Apply the assignment function and remove rows without a valid assignment
+        aggregated_df['assignment'] = aggregated_df['geo'].apply(assign_value)
+        aggregated_df.dropna(subset=['assignment'], inplace=True)
+        aggregated_df['assignment'] = aggregated_df['assignment'].astype(int)

-        return analysis_df
+        return aggregated_df

     def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.7
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
         return f.read()
     return ''

-VERSION = '0.9.5.7'
+VERSION = '0.9.5.8'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')

3 files without changes