imsciences 0.9.5.6__tar.gz → 0.9.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of imsciences might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.6
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -1,6 +1,4 @@
 import pandas as pd
-import geopandas as gpd
-from shapely.geometry import Point
 from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import DateRange
 from google.analytics.data_v1beta.types import Dimension
@@ -15,6 +13,7 @@ import logging
 from datetime import datetime, timedelta
 import os
 import numpy as np
+from scipy.spatial import cKDTree

 class geoprocessing:

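Note on the dependency change: the two import hunks above swap the geopandas/shapely stack for scipy's cKDTree, so the nearest-region lookup no longer needs GeoDataFrames at all. A minimal sketch of the pattern the new code uses (the sample coordinates and region names below are illustrative, not taken from the package):

import pandas as pd
from scipy.spatial import cKDTree

# Hypothetical ITV region centroids (illustrative values)
itv_lut = pd.DataFrame({
    'ITV Region': ['London', 'Granada', 'Central'],
    'Latitude': [51.51, 53.48, 52.49],
    'Longitude': [-0.13, -2.24, -1.89],
})

# Hypothetical city coordinates to classify
cities = pd.DataFrame({
    'geo': ['Brighton', 'Liverpool'],
    'Latitude': [50.82, 53.41],
    'Longitude': [-0.14, -2.98],
})

# Build the KD-tree once, then resolve every point in a single vectorised query
tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)
distances, indices = tree.query(cities[['Latitude', 'Longitude']].values, k=1)
cities['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values

Both the old shapely points and the new KD-tree work on raw latitude/longitude values, so distances are Euclidean in degree space either way; at UK scale that is usually an acceptable approximation for nearest-centroid matching.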
@@ -137,117 +136,66 @@ class geoprocessing:
             logging.error(f"An unexpected error occurred: {e}")
             raise

-    def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
+    def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
         """
-        Process ITV analysis by mapping geos, grouping data, and merging with media spend.
-
+        Process the raw data by merging it with a city lookup table,
+        performing a spatial join to find the nearest ITV region,
+        automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
+        and assigning a numerical group based on provided test and control lists.
+
         Parameters:
-        raw_df (pd.DataFrame): Raw input data containing columns such as 'geo', plus any metrics to be aggregated.
-        itv_path (str): Path to the ITV regional mapping CSV file.
-        cities_path (str): Path to the Geo Mappings Excel file.
-        media_spend_path (str): Path to the media spend Excel file.
-        output_path (str): Path to save the final output CSV file.
-        group1 (list): List of geo regions for group 1.
-        group2 (list): List of geo regions for group 2.
-        columns_to_aggregate (list): List of columns in `raw_df` that need aggregation.
-        aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
-
+        raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
+            plus metric columns that should be summed.
+        city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
+        itv_lut (pd.DataFrame): ITV lookup table DataFrame with columns 'Latitude', 'Longitude', and 'ITV Region'
+            for spatial matching.
+        test_list (list): List of region names (strings) to be assigned the value 1.
+        control_list (list): List of region names (strings) to be assigned the value 2.
+
         Returns:
-        pd.DataFrame: The final merged and aggregated DataFrame.
+        pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
+            with the metric columns summed and an additional 'assignment' column.
         """
-
-        # -----------------------
-        # 1. Load and preprocess data
-        # -----------------------
-        itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
-        cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])

-        itv['geometry'] = itv.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
-        cities['geometry'] = cities.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
+        # Ensure the ITV lookup table has valid coordinate data
+        itv_lut = itv_lut.dropna(subset=['Latitude', 'Longitude'])

-        itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
-        cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
+        # Merge raw_df with the city lookup table on 'geo'
+        merged_df = pd.merge(raw_df, city_lut, on='geo', how='left')

-        # -----------------------
-        # 2. Perform spatial join to match geos
-        # -----------------------
-        joined_gdf = gpd.sjoin_nearest(
-            itv_gdf,
-            cities_gdf,
-            how='inner',
-            distance_col='distance'
-        )
-        matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
+        # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
+        tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)

-        # Handle unmatched geos
-        unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
-        unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
+        # For each record in merged_df, find the nearest ITV region based on coordinates
+        distances, indices = tree.query(merged_df[['Latitude', 'Longitude']].values, k=1)

-        nearest_unmatched_gdf = gpd.sjoin_nearest(
-            unmatched_cities_gdf,
-            itv_gdf,
-            how='inner',
-            distance_col='distance'
-        )
+        # Map the nearest ITV Region back to merged_df
+        merged_df['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values

-        unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
-        unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
+        # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
+        metric_cols = [col for col in raw_df.columns if col not in ['date', 'geo']]

-        matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
+        # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
+        aggregated_df = merged_df.groupby(['date', 'ITV Region'], as_index=False)[metric_cols].sum()

-        # -----------------------
-        # 3. Merge with raw data
-        # -----------------------
-        merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
-
-        # Remove rows where geo is "(not set)"
-        merged_df = merged_df[merged_df["geo"] != "(not set)"]
-
-        # Replace 'geo' column with 'ITV Region'
-        # - We'll keep the "ITV Region" naming for clarity, but you can rename if you like.
-        merged_df = merged_df.drop(columns=['geo'])
-        merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
+        # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
+        aggregated_df.rename(columns={'ITV Region': 'geo'}, inplace=True)

-        # -----------------------
-        # 4. Group and aggregate
-        # -----------------------
-        # Build the dictionary for aggregation: {col1: agg1, col2: agg2, ...}
-        aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
-
-        # Perform the groupby operation
-        grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
-
-        # -----------------------
-        # 5. Filter for test & control groups
-        # -----------------------
-        filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
-
-        assignment_map = {city: 1 for city in test_group}
-        assignment_map.update({city: 2 for city in control_group})
-        filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
-
-        # -----------------------
-        # 6. Merge with media spend
-        # -----------------------
-        media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
-
-        # Merge on date and geo
-        analysis_df = pd.merge(
-            filtered_df,
-            media_spend_df,
-            on=['date', 'geo'],
-            how='left'
-        )
-
-        # Fill missing cost with 0
-        analysis_df['cost'] = analysis_df['cost'].fillna(0)
+        # Define a function to assign group values based on the region name
+        def assign_value(region):
+            if region in test_list:
+                return 1
+            elif region in control_list:
+                return 2
+            else:
+                return np.nan  # Or another default value if desired

-        # -----------------------
-        # 7. Save to CSV
-        # -----------------------
-        analysis_df.to_csv(output_path, index=False)
+        # Apply the assignment function and remove rows without a valid assignment
+        aggregated_df['assignment'] = aggregated_df['geo'].apply(assign_value)
+        aggregated_df.dropna(subset=['assignment'], inplace=True)
+        aggregated_df['assignment'] = aggregated_df['assignment'].astype(int)

-        return analysis_df
+        return aggregated_df

     def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
@@ -303,10 +251,11 @@ class geoprocessing:
         # 1. Read and validate data
         # -----------------------
         raw_df = read_file(raw_data)
-        spend_df = read_file(spend_data)
+        raw_df = raw_df.rename(columns={'city': 'geo'})
+        spend_df = read_file(spend_data).rename(columns={'Cost': 'cost'})

         # Columns we minimally need in raw_df
-        required_columns = {'date', 'city'}
+        required_columns = {'date', 'geo'}
         # Ensure the columns to aggregate are there
         required_columns = required_columns.union(set(columns_to_aggregate))
         missing_in_raw = required_columns - set(raw_df.columns)
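With the 'city' → 'geo' and 'Cost' → 'cost' renames moved to read time, the required-column check runs against the final column names immediately. What happens when missing_in_raw is non-empty sits outside this hunk; a plausible sketch of the check, with the error handling invented:

required_columns = {'date', 'geo'}.union(set(columns_to_aggregate))
missing_in_raw = required_columns - set(raw_df.columns)
if missing_in_raw:
    # Hypothetical handling; the package's actual behaviour is defined below this hunk
    raise ValueError(f"raw_df is missing required columns: {missing_in_raw}")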
@@ -336,12 +285,8 @@ class geoprocessing:
         # -----------------------
         # 3. Prepare raw data
         # -----------------------
-        # Rename 'city' to 'geo' for consistency
-        raw_df = raw_df.rename(columns={'city': 'geo'})
-
         # Filter only the relevant geos
         filtered_df = raw_df[raw_df['geo'].isin(test_group + control_group)].copy()
-
         # -----------------------
         # 4. Group and aggregate
         # -----------------------
@@ -357,9 +302,9 @@ class geoprocessing:

         # -----------------------
         # 5. Map groups (Test vs. Control)
-        # -----------------------
-        assignment_map = {city: "Test Group" for city in test_group}
-        assignment_map.update({city: "Control Group" for city in control_group})
+        # -----------------------
+        assignment_map = {city: 1 for city in test_group}
+        assignment_map.update({city: 2 for city in control_group})
         grouped_df['assignment'] = grouped_df['geo'].map(assignment_map)

         # -----------------------
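The substantive change in this hunk is that process_city_analysis now labels groups with the numeric codes 1 and 2 instead of the strings "Test Group" and "Control Group", matching what the rewritten process_itv_analysis returns. The resulting map, using hypothetical geos:

test_group = ['Brighton']      # hypothetical
control_group = ['Liverpool']  # hypothetical
assignment_map = {city: 1 for city in test_group}
assignment_map.update({city: 2 for city in control_group})
# assignment_map == {'Brighton': 1, 'Liverpool': 2}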
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.6
+Version: 0.9.5.8
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
             return f.read()
     return ''

-VERSION = '0.9.5.6'
+VERSION = '0.9.5.8'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')

3 files without changes