PyPI - imsciences - Versions diffs - 0.9.5.4__py3-none-any.whl → 0.9.5.6__py3-none-any.whl - Mend - Supply Chain Defender

imsciences 0.9.5.4 (py3-none-any.whl) → 0.9.5.6 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of imsciences might be problematic. Click here for more details.

Files changed (9) hide show

imsciences/geo.py CHANGED Viewed

@@ -26,14 +26,14 @@ class geoprocessing:
         print("   - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])")
         print("\n2. process_itv_analysis")
-        print("   - Description: Pull in GA4 data for geo experiments.")
-        print("   - Usage: process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2)")
-        print("   - Example:process_itv_analysis(df,'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'])")
+        print("   - Description: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.")
+        print("   - Usage: process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list")
+        print("   - Example: process_itv_analysis(df, 'itv_regional_mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum']")
         print("\n3. process_city_analysis")
-        print("   - Description: Processes city-level data for geo experiments by grouping user metrics, merging with media spend data, and saving the result.")
-        print("   - Usage: process_city_analysis(raw_df, spend_df, output_path, group1, group2, response_column)")
-        print("   - Example:process_city_analysis(df, spend, output, ['Barnsley'], ['Aberdeen'], 'newUsers')")
+        print("   - Description: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.")
+        print("   - Usage: process_city_analysis(raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)")
+        print("   - Example: process_city_analysis(df, spend, 'output.csv', ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'mean'])")
     def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
         """
@@ -137,23 +137,28 @@ class geoprocessing:
             logging.error(f"An unexpected error occurred: {e}")
             raise
-    def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, group1, group2):
+    def process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
         Process ITV analysis by mapping geos, grouping data, and merging with media spend.
         Parameters:
-            raw_df (pd.DataFrame): Raw input data containing 'geo', 'newUsers', 'totalRevenue', and 'date'.
+            raw_df (pd.DataFrame): Raw input data containing columns such as 'geo', plus any metrics to be aggregated.
             itv_path (str): Path to the ITV regional mapping CSV file.
             cities_path (str): Path to the Geo Mappings Excel file.
             media_spend_path (str): Path to the media spend Excel file.
             output_path (str): Path to save the final output CSV file.
             group1 (list): List of geo regions for group 1.
             group2 (list): List of geo regions for group 2.
+            columns_to_aggregate (list): List of columns in `raw_df` that need aggregation.
+            aggregator_list (list): List of aggregation operations (e.g. ["sum", "mean", ...]) for corresponding columns.
         Returns:
-            None
+            pd.DataFrame: The final merged and aggregated DataFrame.
         """
-        # Load and preprocess data
+        # -----------------------
+        # 1. Load and preprocess data
+        # -----------------------
         itv = pd.read_csv(itv_path).dropna(subset=['Latitude', 'Longitude'])
         cities = pd.read_excel(cities_path).dropna(subset=['Latitude', 'Longitude'])
@@ -163,59 +168,114 @@ class geoprocessing:
         itv_gdf = gpd.GeoDataFrame(itv, geometry='geometry')
         cities_gdf = gpd.GeoDataFrame(cities, geometry='geometry')
-        # Perform spatial join to match geos
-        joined_gdf = gpd.sjoin_nearest(itv_gdf, cities_gdf, how='inner', distance_col='distance')
+        # -----------------------
+        # 2. Perform spatial join to match geos
+        # -----------------------
+        joined_gdf = gpd.sjoin_nearest(
+            itv_gdf,
+            cities_gdf,
+            how='inner',
+            distance_col='distance'
+        )
         matched_result = joined_gdf[['ITV Region', 'geo']].drop_duplicates(subset=['geo'])
         # Handle unmatched geos
         unmatched_geos = set(cities_gdf['geo']) - set(matched_result['geo'])
         unmatched_cities_gdf = cities_gdf[cities_gdf['geo'].isin(unmatched_geos)]
-        nearest_unmatched_gdf = gpd.sjoin_nearest(unmatched_cities_gdf, itv_gdf, how='inner', distance_col='distance')
+        nearest_unmatched_gdf = gpd.sjoin_nearest(
+            unmatched_cities_gdf,
+            itv_gdf,
+            how='inner',
+            distance_col='distance'
+        )
         unmatched_geo_mapping = nearest_unmatched_gdf[['geo', 'ITV Region', 'Latitude_right', 'Longitude_right']]
         unmatched_geo_mapping.columns = ['geo', 'ITV Region', 'Nearest_Latitude', 'Nearest_Longitude']
         matched_result = pd.concat([matched_result, unmatched_geo_mapping[['geo', 'ITV Region']]])
-        # Group and filter data
+        # -----------------------
+        # 3. Merge with raw data
+        # -----------------------
         merged_df = pd.merge(raw_df, matched_result, on='geo', how='left')
-        merged_df = merged_df[merged_df["geo"] != "(not set)"].drop(columns=['geo'])
-        merged_df = merged_df.rename(columns={'ITV Region': 'geo', 'newUsers': 'response'})
-        grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg({'response': 'sum', 'totalRevenue': 'sum'})
-        filtered_df = grouped_df[grouped_df['geo'].isin(group1 + group2)].copy()
+        # Remove rows where geo is "(not set)"
+        merged_df = merged_df[merged_df["geo"] != "(not set)"]
+        # Replace 'geo' column with 'ITV Region'
+        # - We'll keep the "ITV Region" naming for clarity, but you can rename if you like.
+        merged_df = merged_df.drop(columns=['geo'])
+        merged_df = merged_df.rename(columns={'ITV Region': 'geo'})
+        # -----------------------
+        # 4. Group and aggregate
+        # -----------------------
+        # Build the dictionary for aggregation: {col1: agg1, col2: agg2, ...}
+        aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
-        assignment_map = {city: 1 for city in group1}
-        assignment_map.update({city: 2 for city in group2})
+        # Perform the groupby operation
+        grouped_df = merged_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
+        # -----------------------
+        # 5. Filter for test & control groups
+        # -----------------------
+        filtered_df = grouped_df[grouped_df['geo'].isin(test_group + control_group)].copy()
+        assignment_map = {city: 1 for city in test_group}
+        assignment_map.update({city: 2 for city in control_group})
         filtered_df['assignment'] = filtered_df['geo'].map(assignment_map)
-        # Merge with media spend data
+        # -----------------------
+        # 6. Merge with media spend
+        # -----------------------
         media_spend_df = pd.read_excel(media_spend_path).rename(columns={'Cost': 'cost'})
-        analysis_df = pd.merge(filtered_df, media_spend_df, on=['date', 'geo'], how='left')
+        # Merge on date and geo
+        analysis_df = pd.merge(
+            filtered_df,
+            media_spend_df,
+            on=['date', 'geo'],
+            how='left'
+        )
+        # Fill missing cost with 0
         analysis_df['cost'] = analysis_df['cost'].fillna(0)
-        # Save the final output
+        # -----------------------
+        # 7. Save to CSV
+        # -----------------------
         analysis_df.to_csv(output_path, index=False)
-        return analysis_df
+        return analysis_df
-    def process_city_analysis(self, raw_data, spend_data, output_path, group1, group2, response_column):
+    def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
         """
-        Process city analysis by grouping data, analyzing user metrics, and merging with spend data.
+        Process city-level analysis by grouping data, applying custom aggregations,
+        and merging with spend data.
         Parameters:
-            raw_data (str or pd.DataFrame): Raw input data as a file path (CSV/XLSX) or DataFrame.
-            spend_data (str or pd.DataFrame): Spend data as a file path (CSV/XLSX) or DataFrame.
-            output_path (str): Path to save the final output file (CSV or XLSX).
-            group1 (list): List of city regions for group 1.
-            group2 (list): List of city regions for group 2.
-            response_column (str): Column name to be used as the response metric.
+            raw_data (str or pd.DataFrame):
+                - Raw input data as a file path (CSV/XLSX) or a DataFrame.
+                - Must contain 'date' and 'city' columns, plus any columns to be aggregated.
+            spend_data (str or pd.DataFrame):
+                - Spend data as a file path (CSV/XLSX) or a DataFrame.
+                - Must contain 'date', 'geo', and 'cost' columns.
+            output_path (str):
+                - Path to save the final output file (CSV or XLSX).
+            group1 (list):
+                - List of city regions to be considered "Test Group" or "Group 1".
+            group2 (list):
+                - List of city regions to be considered "Control Group" or "Group 2".
+            columns_to_aggregate (list):
+                - List of columns to apply aggregation to, e.g. ['newUsers', 'transactions'].
+            aggregator_list (list):
+                - List of corresponding aggregation functions, e.g. ['sum', 'mean'].
+                - Must be the same length as columns_to_aggregate.
         Returns:
-            pd.DataFrame: Processed DataFrame.
+            pd.DataFrame: The final merged, aggregated DataFrame.
         """
-        import pandas as pd
-        import os
         def read_file(data):
             """Helper function to handle file paths or return DataFrame directly."""
@@ -239,39 +299,85 @@ class geoprocessing:
             else:
                 raise ValueError("Unsupported file type. Please use a CSV or XLSX file.")
-        # Read data
+        # -----------------------
+        # 1. Read and validate data
+        # -----------------------
         raw_df = read_file(raw_data)
         spend_df = read_file(spend_data)
-        # Ensure necessary columns are present
-        required_columns = {'date', 'city', response_column}
-        if not required_columns.issubset(raw_df.columns):
-            raise ValueError(f"Input DataFrame must contain the following columns: {required_columns}")
+        # Columns we minimally need in raw_df
+        required_columns = {'date', 'city'}
+        # Ensure the columns to aggregate are there
+        required_columns = required_columns.union(set(columns_to_aggregate))
+        missing_in_raw = required_columns - set(raw_df.columns)
+        if missing_in_raw:
+            raise ValueError(
+                f"The raw data is missing the following required columns: {missing_in_raw}"
+            )
+        # Validate spend data
         spend_required_columns = {'date', 'geo', 'cost'}
-        if not spend_required_columns.issubset(spend_df.columns):
-            raise ValueError(f"Spend DataFrame must contain the following columns: {spend_required_columns}")
+        missing_in_spend = spend_required_columns - set(spend_df.columns)
+        if missing_in_spend:
+            raise ValueError(
+                f"The spend data is missing the following required columns: {missing_in_spend}"
+            )
+        # -----------------------
+        # 2. Clean and prepare spend data
+        # -----------------------
         # Convert cost column to numeric after stripping currency symbols and commas
-        spend_df['cost'] = spend_df['cost'].replace('[^\d.]', '', regex=True).astype(float)
-        # Rename and process input DataFrame
-        raw_df = raw_df.rename(columns={'city': 'geo', response_column: 'response'})
-        # Filter and group data
-        filtered_df = raw_df[raw_df['geo'].isin(group1 + group2)].copy()
-        grouped_df = filtered_df.groupby(['date', 'geo'], as_index=False).agg({'response': 'sum'})
-        assignment_map = {city: 1 for city in group1}
-        assignment_map.update({city: 2 for city in group2})
+        spend_df['cost'] = (
+            spend_df['cost']
+            .replace('[^\\d.]', '', regex=True)
+            .astype(float)
+        )
+        # -----------------------
+        # 3. Prepare raw data
+        # -----------------------
+        # Rename 'city' to 'geo' for consistency
+        raw_df = raw_df.rename(columns={'city': 'geo'})
+        # Filter only the relevant geos
+        filtered_df = raw_df[raw_df['geo'].isin(test_group + control_group)].copy()
+        # -----------------------
+        # 4. Group and aggregate
+        # -----------------------
+        # Create a dictionary of {col: agg_function}
+        if len(columns_to_aggregate) != len(aggregator_list):
+            raise ValueError(
+                "columns_to_aggregate and aggregator_list must have the same length."
+            )
+        aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))
+        # Perform groupby using the aggregator dictionary
+        grouped_df = filtered_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
+        # -----------------------
+        # 5. Map groups (Test vs. Control)
+        # -----------------------
+        assignment_map = {city: "Test Group" for city in test_group}
+        assignment_map.update({city: "Control Group" for city in control_group})
         grouped_df['assignment'] = grouped_df['geo'].map(assignment_map)
-        # Merge with spend data
-        merged_df = pd.merge(grouped_df, spend_df, on=['date', 'geo'], how='left')
+        # -----------------------
+        # 6. Merge with spend data
+        # -----------------------
+        merged_df = pd.merge(
+            grouped_df,
+            spend_df,  # has date, geo, cost
+            on=['date', 'geo'],
+            how='left'
+        )
+        # Fill missing cost with 0
         merged_df['cost'] = merged_df['cost'].fillna(0)
-        # Save the final output
+        # -----------------------
+        # 7. Write out results
+        # -----------------------
         write_file(merged_df, output_path)
-        return merged_df
+        return merged_df