spacr 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff shows the changes between package versions publicly released to a supported registry, as they appear in their respective public registries. It is provided for informational purposes only.
- spacr/io.py +3 -1
- spacr/ml.py +205 -0
- spacr/plot.py +48 -0
- spacr/settings.py +64 -0
- spacr/submodules.py +298 -1
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/METADATA +1 -1
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/RECORD +11 -11
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/LICENSE +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/WHEEL +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/top_level.txt +0 -0
spacr/io.py
CHANGED
@@ -2551,6 +2551,7 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
     png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
     if verbose:
         print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
+        print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
     merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
     merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)

@@ -2562,7 +2563,8 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
     metadata.set_index('prcfo', inplace=True)

     # Merge metadata with final merged DataFrame
-    merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
+    #merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
+    merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
     merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)

     if verbose:
spacr/ml.py
CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import numpy as np
 from scipy import stats
 from scipy.stats import shapiro
+from math import pi

 from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
 from sklearn.metrics import mean_squared_error
@@ -1515,3 +1516,207 @@ def _calculate_similarity(df, features, col_to_compare, val1, val2):

     return df

+def interperate_vision_model(settings={}):
+
+    from .io import _read_and_merge_data, _results_to_csv
+    from .settings import set_interperate_vision_model_defaults
+    from .utils import save_settings
+
+    settings = set_interperate_vision_model_defaults(settings)
+    save_settings(settings, name='interperate_vision_model', show=True)
+
+    # Function to create radar plot for individual and combined values
+    def create_extended_radar_plot(values, labels, title):
+        values = list(values) + [values[0]]  # Close the loop for radar chart
+        angles = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
+        angles += angles[:1]
+
+        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
+        ax.plot(angles, values, linewidth=2, linestyle='solid')
+        ax.fill(angles, values, alpha=0.25)
+
+        ax.set_xticks(angles[:-1])
+        ax.set_xticklabels(labels, fontsize=10, rotation=45, ha='right')
+        plt.title(title, pad=20)
+        plt.show()
+
+    def extract_compartment_channel(feature_name):
+        # Identify compartment as the first part before an underscore
+        compartment = feature_name.split('_')[0]
+
+        if compartment == 'cells':
+            compartment = 'cell'
+
+        # Identify channels based on substring presence
+        channels = []
+        if 'channel_0' in feature_name:
+            channels.append('channel_0')
+        if 'channel_1' in feature_name:
+            channels.append('channel_1')
+        if 'channel_2' in feature_name:
+            channels.append('channel_2')
+        if 'channel_3' in feature_name:
+            channels.append('channel_3')
+
+        # If multiple channels are found, join them with a '+'
+        if channels:
+            channel = ' + '.join(channels)
+        else:
+            channel = 'morphology'  # Use 'morphology' if no channel identifier is found
+
+        return (compartment, channel)
+
+    def read_and_preprocess_data(settings):
+
+        df, _ = _read_and_merge_data(
+            locs=[settings['src']+'/measurements/measurements.db'],
+            tables=settings['tables'],
+            verbose=True,
+            nuclei_limit=settings['nuclei_limit'],
+            pathogen_limit=settings['pathogen_limit']
+        )
+
+        scores_df = pd.read_csv(settings['scores'])
+
+        # Clean and align columns for merging
+        df['object_label'] = df['object_label'].str.replace('o', '')
+
+        if 'row_name' not in scores_df.columns:
+            scores_df['row_name'] = scores_df['row']
+
+        if 'column_name' not in scores_df.columns:
+            scores_df['column_name'] = scores_df['col']
+
+        if 'object_label' not in scores_df.columns:
+            scores_df['object_label'] = scores_df['object']
+
+        # Remove the 'o' prefix from 'object_label' in df, ensuring it is a string type
+        df['object_label'] = df['object_label'].str.replace('o', '').astype(str)
+
+        # Ensure 'object_label' in scores_df is also a string
+        scores_df['object_label'] = scores_df['object'].astype(str)
+
+        # Ensure all join columns have the same data type in both DataFrames
+        df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+        scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+
+        # Select only the necessary columns from scores_df for merging
+        scores_df = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label', settings['score_column']]]
+
+        # Now merge DataFrames
+        merged_df = pd.merge(df, scores_df, on=['plate', 'row_name', 'column_name', 'field', 'object_label'], how='inner')
+
+        # Separate numerical features and the score column
+        X = merged_df.select_dtypes(include='number').drop(columns=[settings['score_column']])
+        y = merged_df[settings['score_column']]
+
+        return X, y, merged_df
+
+    X, y, merged_df = read_and_preprocess_data(settings)
+
+    # Step 1: Feature Importance using Random Forest
+    if settings['feature_importance'] or settings['permutation_importance']:
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X, y)
+
+        if settings['feature_importance']:
+            print(f"Feature Importance ...")
+            feature_importances = model.feature_importances_
+            feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
+            feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
+            top_feature_importance_df = feature_importance_df.head(settings['top_features'])
+
+            # Plot Feature Importance
+            plt.figure(figsize=(10, 6))
+            plt.barh(top_feature_importance_df['feature'], top_feature_importance_df['importance'])
+            plt.xlabel('Importance')
+            plt.title(f"Top {settings['top_features']} Features - Feature Importance")
+            plt.gca().invert_yaxis()
+            plt.show()
+
+            if settings['save']:
+                _results_to_csv(feature_importance_df, filename='feature_importance.csv')
+
+    # Step 2: Permutation Importance
+    if settings['permutation_importance']:
+        print(f"Permutation Importance ...")
+        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=settings['n_jobs'])
+        perm_importance_df = pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
+        perm_importance_df = perm_importance_df.sort_values(by='importance', ascending=False)
+        top_perm_importance_df = perm_importance_df.head(settings['top_features'])
+
+        # Plot Permutation Importance
+        plt.figure(figsize=(10, 6))
+        plt.barh(top_perm_importance_df['feature'], top_perm_importance_df['importance'])
+        plt.xlabel('Importance')
+        plt.title(f"Top {settings['top_features']} Features - Permutation Importance")
+        plt.gca().invert_yaxis()
+        plt.show()
+
+        if settings['save']:
+            _results_to_csv(perm_importance_df, filename='permutation_importance.csv')
+
+    # Step 3: SHAP Analysis
+    if settings['shap']:
+        print(f"SHAP Analysis ...")
+
+        # Select top N features based on Random Forest importance and fit the model on these features only
+        top_features = feature_importance_df.head(settings['top_features'])['feature']
+        X_top = X[top_features]
+
+        # Refit the model on this subset of features
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X_top, y)
+
+        # Sample a smaller subset of rows to speed up SHAP
+        if settings['shap_sample']:
+            sample = int(len(X_top) / 100)
+            X_sample = X_top.sample(min(sample, len(X_top)), random_state=42)
+        else:
+            X_sample = X_top
+
+        # Initialize SHAP explainer with the same subset of features
+        explainer = shap.Explainer(model.predict, X_sample)
+        shap_values = explainer(X_sample, max_evals=1500)
+
+        # Plot SHAP summary for the selected sample and top features
+        shap.summary_plot(shap_values, X_sample, max_display=settings['top_features'])
+
+        # Convert SHAP values to a DataFrame for easier manipulation
+        shap_df = pd.DataFrame(shap_values.values, columns=X_sample.columns)
+
+        # Apply the function to create MultiIndex columns with compartment and channel
+        shap_df.columns = pd.MultiIndex.from_tuples(
+            [extract_compartment_channel(feat) for feat in shap_df.columns],
+            names=['compartment', 'channel']
+        )
+
+        # Aggregate SHAP values by compartment and channel
+        compartment_mean = shap_df.abs().groupby(level='compartment', axis=1).mean().mean(axis=0)
+        channel_mean = shap_df.abs().groupby(level='channel', axis=1).mean().mean(axis=0)
+
+        # Calculate combined importance for each pair of compartments and channels
+        combined_compartment = {}
+        for i, comp1 in enumerate(compartment_mean.index):
+            for comp2 in compartment_mean.index[i+1:]:
+                combined_compartment[f"{comp1} + {comp2}"] = shap_df.loc[:, (comp1, slice(None))].abs().mean().mean() + \
+                                                             shap_df.loc[:, (comp2, slice(None))].abs().mean().mean()
+
+        combined_channel = {}
+        for i, chan1 in enumerate(channel_mean.index):
+            for chan2 in channel_mean.index[i+1:]:
+                combined_channel[f"{chan1} + {chan2}"] = shap_df.loc[:, (slice(None), chan1)].abs().mean().mean() + \
+                                                         shap_df.loc[:, (slice(None), chan2)].abs().mean().mean()
+
+        # Prepare values and labels for radar charts
+        all_compartment_importance = list(compartment_mean.values) + list(combined_compartment.values())
+        all_compartment_labels = list(compartment_mean.index) + list(combined_compartment.keys())
+
+        all_channel_importance = list(channel_mean.values) + list(combined_channel.values())
+        all_channel_labels = list(channel_mean.index) + list(combined_channel.keys())
+
+        # Create radar plots for compartments and channels
+        create_extended_radar_plot(all_compartment_importance, all_compartment_labels, "SHAP Importance by Compartment (Individual and Combined)")
+        create_extended_radar_plot(all_channel_importance, all_channel_labels, "SHAP Importance by Channel (Individual and Combined)")
+
+    return merged_df
spacr/plot.py
CHANGED
@@ -3688,3 +3688,51 @@ def overlay_masks_on_images(img_folder, normalize=True, resize=True, save=False,
     plt.axis('off')
     plt.show()

+def graph_importance(settings):
+
+    from .settings import set_graph_importance_defaults
+    from .utils import save_settings
+
+    if not isinstance(settings['csvs'], list):
+        settings['csvs'] = [settings['csvs']]
+
+    settings['src'] = os.path.dirname(settings['csvs'][0])
+
+    settings = set_graph_importance_defaults(settings)
+    save_settings(settings, name='graph_importance')
+
+    dfs = []
+    for path in settings['csvs']:
+        dft = pd.read_csv(path)
+        dfs.append(dft)
+
+    df = pd.concat(dfs)
+
+    if not all(col in df.columns for col in (settings['grouping_column'], settings['data_column'])):
+        print(f"grouping {settings['grouping_column']} and data {settings['data_column']} columns must be in {df.columns.to_list()}")
+        return
+
+    output_dir = os.path.dirname(settings['csvs'][0])
+
+    spacr_graph = spacrGraph(
+        df=df,
+        grouping_column=settings['grouping_column'],
+        data_column=settings['data_column'],
+        graph_type=settings['graph_type'],
+        graph_name=settings['grouping_column'],
+        summary_func='mean',
+        colors=None,
+        output_dir=output_dir,
+        save=settings['save'],
+        y_lim=None,
+        error_bar_type='std',
+        representation='object',
+        theme='muted',
+    )
+
+    # Create the plot
+    spacr_graph.create_plot()
+
+    # Get the figure object if needed
+    fig = spacr_graph.get_figure()
+    plt.show()
spacr/settings.py
CHANGED
@@ -1370,4 +1370,68 @@ def get_analyze_plaque_settings(settings):
     settings.setdefault('rescale', False)
     settings.setdefault('resample', False)
     settings.setdefault('fill_in', True)
+    return settings
+
+def set_graph_importance_defaults(settings):
+    settings.setdefault('csvs','list of paths')
+    settings.setdefault('grouping_column','compartment')
+    settings.setdefault('data_column','compartment_importance_sum')
+    settings.setdefault('graph_type','jitter_bar')
+    settings.setdefault('save',False)
+    return settings
+
+def set_interperate_vision_model_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('scores','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen','cytoplasm'])
+    settings.setdefault('feature_importance',True)
+    settings.setdefault('permutation_importance',False)
+    settings.setdefault('shap',True)
+    settings.setdefault('save',False)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('top_features',30)
+    settings.setdefault('shap_sample',True)
+    settings.setdefault('n_jobs',-1)
+    settings.setdefault('shap_approximate',True)
+    settings.setdefault('score_column','cv_predictions')
+    return settings
+
+def set_analyze_endodyogeny_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc', 'pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'], ['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('min_area_bin',500)
+    settings.setdefault('group_column','pathogen')
+    settings.setdefault('compartment','pathogen')
+    settings.setdefault('pathogen_limit',1)
+    settings.setdefault('nuclei_limit',10)
+    settings.setdefault('level','object')
+    settings.setdefault('um_per_px',0.1)
+    settings.setdefault('max_bins',None)
+    settings.setdefault('save',False)
+    settings.setdefault('verbose',False)
+    return settings
+
+def set_analyze_class_proportion_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc','pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'],['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('group_column','condition')
+    settings.setdefault('class_column','test')
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('level','well')
+    settings.setdefault('save',False)
+    settings.setdefault('verbose', False)
     return settings
spacr/submodules.py
CHANGED
@@ -10,6 +10,7 @@ from IPython.display import display
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.inspection import permutation_importance
 from math import pi
+from scipy.stats import chi2_contingency

 import matplotlib.pyplot as plt
 from natsort import natsorted
@@ -844,4 +845,300 @@ def interperate_vision_model(settings={}):
     df.to_csv(save_path)
     print(f"Saved {save_path}")

-    return output
+    return output
+
+def analyze_endodyogeny(settings):
+
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_endodyogeny_defaults
+
+    def _calculate_volume_bins(df, compartment='pathogen', min_area_bin=500, max_bins=None, verbose=False):
+        area_column = f'{compartment}_area'
+        df[f'{compartment}_volume'] = df[area_column] ** 1.5
+        min_volume_bin = min_area_bin ** 1.5
+        max_volume = df[f'{compartment}_volume'].max()
+
+        # Generate bin edges as floats, and filter out any duplicate edges
+        bins = [min_volume_bin * (2 ** i) for i in range(int(np.ceil(np.log2(max_volume / min_volume_bin)) + 1))]
+        bins = sorted(set(bins))  # Ensure bin edges are unique
+
+        # Create bin labels as ranges with decimal precision for float values (e.g., "500.0-1000.0")
+        bin_labels = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]
+        if verbose:
+            print('Volume bins:', bins)
+            print('Volume bin labels:', bin_labels)
+
+        # Apply the bins to create a new column with the binned labels
+        df[f'{compartment}_volume_bin'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=bin_labels, right=False)
+
+        # Create a bin index column (numeric version of bins)
+        df['bin_index'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=range(1, len(bins)), right=False).astype(int)
+
+        # Adjust bin indices and labels based on max_bins
+        if max_bins is not None:
+            df.loc[df['bin_index'] > max_bins, 'bin_index'] = max_bins
+
+            # Update bin labels to reflect capped bins
+            bin_labels = bin_labels[:max_bins - 1] + [f">{bins[max_bins - 1]:.2f}"]
+            df[f'{compartment}_volume_bin'] = df['bin_index'].map(
+                {i + 1: label for i, label in enumerate(bin_labels)}
+            )
+
+        if verbose:
+            print(df[[f'{compartment}_volume', f'{compartment}_volume_bin', 'bin_index']].head())
+
+        return df
+
+    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+        # Always calculate chi-squared on raw data
+        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+        chi2, p, dof, expected = chi2_contingency(raw_counts)
+        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+        print(f"p-value (raw data): {p:.4e}")
+
+        # Extract bin labels and indices for formatting the legend in the correct order
+        bin_labels = df[bin_column].cat.categories if pd.api.types.is_categorical_dtype(df[bin_column]) else sorted(df[bin_column].unique())
+        bin_indices = range(1, len(bin_labels) + 1)
+        legend_labels = [f"{index}: {label}" for index, label in zip(bin_indices, bin_labels)]
+
+        # Plot based on level setting
+        if level == 'well':
+            # Aggregate by well for mean ± SD visualization
+            well_proportions = (
+                df.groupby([group_column, prc_column, bin_column])
+                .size()
+                .groupby(level=[0, 1])
+                .apply(lambda x: x / x.sum())
+                .unstack(fill_value=0)
+            )
+            mean_proportions = well_proportions.groupby(group_column).mean()
+            std_proportions = well_proportions.groupby(group_column).std()
+
+            ax = mean_proportions.plot(
+                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+            )
+            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+        else:
+            # Object-level plotting without aggregation
+            group_counts = df.groupby([group_column, bin_column]).size()
+            group_totals = group_counts.groupby(level=0).sum()
+            proportions = group_counts / group_totals
+            proportion_df = proportions.unstack(fill_value=0)
+
+            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+            plt.title('Proportion of Volume Bins by Group')
+
+        plt.xlabel('Group')
+        plt.ylabel('Proportion')
+
+        # Update legend with formatted labels, maintaining correct order
+        volume_unit = "px³" if settings['um_per_px'] is None else "µm³"
+        plt.legend(legend_labels, title=f'Volume Range ({volume_unit})', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.ylim(0, 1)
+        fig = plt.gcf()
+        return chi2, p, dof, expected, raw_counts, fig
+
+    settings = set_analyze_endodyogeny_defaults(settings)
+    save_settings(settings, name='analyze_endodyogeny', show=True)
+    output = {}
+
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+
+    if not settings['um_per_px'] is None:
+        df[f"{settings['compartment']}_area"] = df[f"{settings['compartment']}_area"] * (settings['um_per_px'] ** 2)
+        settings['min_area_bin'] = settings['min_area_bin'] * (settings['um_per_px'] ** 2)
+
+    df = df[df[f"{settings['compartment']}_area"] >= settings['min_area_bin']]
+
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+
+    df = df.dropna(subset=[settings['group_column']])
+    df = _calculate_volume_bins(df, settings['compartment'], settings['min_area_bin'], settings['max_bins'], settings['verbose'])
+    output['data'] = df
+
+    # Perform chi-squared test and plot
+    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(
+        settings, df, settings['group_column'], bin_column=f"{settings['compartment']}_volume_bin", level=settings['level']
+    )
+
+    # Create a DataFrame with chi-squared test results and raw counts
+    results_df = pd.DataFrame({
+        'chi_squared_stat': [chi2],
+        'p_value': [p],
+        'degrees_of_freedom': [dof]
+    })
+
+    # Flatten and add expected counts to results_df
+    expected_df = pd.DataFrame(expected, index=raw_counts.index, columns=raw_counts.columns)
+    expected_flat = expected_df.stack().reset_index()
+    expected_flat.columns = [settings['group_column'], f"{settings['compartment']}_volume_bin", 'expected_count']
+    results_df = results_df.merge(expected_flat, how="cross")
+    output['chi_squared'] = results_df
+
+    if settings['save']:
+        # Save DataFrame to CSV
+        output_dir = os.path.join(settings['src'][0], 'results')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, 'chi_squared_results.csv')
+        output_path_fig = os.path.join(output_dir, 'chi_squared_results.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path, index=False)
+        print(f"Chi-squared results saved to {output_path}")
+
+    plt.show()
+
+    return output
+
+def analyze_class_proportion(settings):
+
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_class_proportion_defaults
+    from .plot import plot_plates
+
+
+    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+        # Always calculate chi-squared on raw data
+        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+        chi2, p, dof, expected = chi2_contingency(raw_counts)
+        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+        print(f"p-value (raw data): {p:.4e}")
+
+        # Plot based on level setting
+        if level == 'well':
+            # Aggregate by well for mean ± SD visualization
+            well_proportions = (
+                df.groupby([group_column, prc_column, bin_column])
+                .size()
+                .groupby(level=[0, 1])
+                .apply(lambda x: x / x.sum())
+                .unstack(fill_value=0)
+            )
+            mean_proportions = well_proportions.groupby(group_column).mean()
+            std_proportions = well_proportions.groupby(group_column).std()
+
+            ax = mean_proportions.plot(
+                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+            )
+            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+        else:
+            # Object-level plotting without aggregation
+            group_counts = df.groupby([group_column, bin_column]).size()
+            group_totals = group_counts.groupby(level=0).sum()
+            proportions = group_counts / group_totals
+            proportion_df = proportions.unstack(fill_value=0)
+
+            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+            plt.title('Proportion of Volume Bins by Group')
+
+        plt.xlabel('Group')
+        plt.ylabel('Proportion')
+
+        # Update legend with formatted labels, maintaining correct order
+        plt.legend(title=f'Classes', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.ylim(0, 1)
+        fig = plt.gcf()
+        return chi2, p, dof, expected, raw_counts, fig
+
+    settings = set_analyze_class_proportion_defaults(settings)
+    save_settings(settings, name='analyze_class_proportion', show=True)
+    output = {}
+
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+
+    if 'png_list' not in settings['tables']:
+        settings['tables'] = settings['tables'] + ['png_list']
+
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+
+    df[settings['class_column']] = df[settings['class_column']].fillna(0)
+    output['data'] = df
+
+    # Perform chi-squared test and plot
+    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(settings, df, settings['group_column'], bin_column=settings['class_column'], level=settings['level'])
+
+    # Create a DataFrame with chi-squared test results and raw counts
+    results_df = pd.DataFrame({
+        'chi_squared_stat': [chi2],
+        'p_value': [p],
+        'degrees_of_freedom': [dof]
+    })
+
+    output['chi_squared'] = results_df
+
+    if settings['save']:
+        output_dir = os.path.join(settings['src'][0], 'results')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path_chi = os.path.join(output_dir, 'class_chi_squared_results.csv')
+        output_path_data = os.path.join(output_dir, 'class_chi_squared_data.csv')
+        output_path_fig = os.path.join(output_dir, 'class_chi_squared.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path_chi, index=False)
+        df.to_csv(output_path_data, index=False)
+        print(f"Chi-squared results saved to {output_path_chi}")
+        print(f"Annotated data saved to {output_path_data}")
+
+    plt.show()
+
+    fig2 = plot_plates(df, variable=settings['class_column'], grouping='mean', min_max='allq', cmap='viridis', min_count=0, verbose=True, dst=None)
+    if settings['save']:
+        output_path_fig2 = os.path.join(output_dir, 'class_heatmap.pdf')
+        fig2.savefig(output_path_fig2, dpi=300, bbox_inches='tight')
+
+    plt.show()
+    return output
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/RECORD
CHANGED
@@ -15,17 +15,17 @@ spacr/gui.py,sha256=ARyn9Q_g8HoP-cXh1nzMLVFCKqthY4v2u9yORyaQqQE,8230
 spacr/gui_core.py,sha256=N7R7yvfK_dJhOReM_kW3Ci8Bokhi1OzsxeKqvSGdvV4,41460
 spacr/gui_elements.py,sha256=EKlvEg_4_je7jciEdR3NTgPrcTraowa2e2RUt-xqd6M,138254
 spacr/gui_utils.py,sha256=u9RoIOWpAXFEOnUlLpMQZrc1pWSg6omZsJMIhJdRv_g,41211
-spacr/io.py,sha256=
+spacr/io.py,sha256=YlJAT6H8l4ipunMyKzjqoPcf-1AXgUmSyR1YN9WxmDI,142857
 spacr/logger.py,sha256=lJhTqt-_wfAunCPl93xE65Wr9Y1oIHJWaZMjunHUeIw,1538
 spacr/measure.py,sha256=2lK-ZcTxLM-MpXV1oZnucRD9iz5aprwahRKw9IEqshg,55085
 spacr/mediar.py,sha256=FwLvbLQW5LQzPgvJZG8Lw7GniA2vbZx6Jv6vIKu7I5c,14743
-spacr/ml.py,sha256=
+spacr/ml.py,sha256=GOQJH8jdTrJQwiLlDrcc9-yCxLFaMx4YD4OJs0-R5YI,77947
 spacr/openai.py,sha256=5vBZ3Jl2llYcW3oaTEXgdyCB2aJujMUIO5K038z7w_A,1246
-spacr/plot.py,sha256=
+spacr/plot.py,sha256=0fne2Msy6niN80oiuwt9ZYw1QwXVnghaUmrwvEZN9-8,161992
 spacr/sequencing.py,sha256=ClUfwPPK6rNUbUuiEkzcwakzVyDKKUMv9ricrxT8qQY,25227
-spacr/settings.py,sha256=
+spacr/settings.py,sha256=LSoDNuz1m7rySh7MWXEL1xlUU4rFiCRVlGvZCSCOqzU,80085
 spacr/sim.py,sha256=1xKhXimNU3ukzIw-3l9cF3Znc_brW8h20yv8fSTzvss,71173
-spacr/submodules.py,sha256=
+spacr/submodules.py,sha256=X1OI0Dsc1qU4lqKFdF2EnloNkLkDzA1hDn7CYbkBmFc,55473
 spacr/timelapse.py,sha256=KGfG4L4-QnFfgbF7L6C5wL_3gd_rqr05Foje6RsoTBg,39603
 spacr/toxo.py,sha256=z2nT5aAze3NUIlwnBQcnkARihDwoPfqOgQIVoUluyK0,25087
 spacr/utils.py,sha256=vvciLh1gH0nsrCWQw3taUcDjxP59wme3gqrejeNO05w,222943
@@ -151,9 +151,9 @@ spacr/resources/icons/umap.png,sha256=dOLF3DeLYy9k0nkUybiZMe1wzHQwLJFRmgccppw-8b
 spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif,sha256=Tl0ZUfZ_AYAbu0up_nO0tPRtF1BxXhWQ3T3pURBCCRo,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif,sha256=m8N-V71rA1TT4dFlENNg8s0Q0YEXXs8slIn7yObmZJQ,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif,sha256=Pbhk7xn-KUP6RSIhJsxQcrHFImBm3GEpLkzx7WOc-5M,7958528
-spacr-0.3.62.dist-info/LICENSE,sha256=
-spacr-0.3.62.dist-info/METADATA,sha256=
-spacr-0.3.62.dist-info/WHEEL,sha256=
-spacr-0.3.62.dist-info/entry_points.txt,sha256=
-spacr-0.3.62.dist-info/top_level.txt,sha256=
-spacr-0.3.62.dist-info/RECORD,,
+spacr-0.3.64.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
+spacr-0.3.64.dist-info/METADATA,sha256=_07fLYI8eMAYJzOEcAVOemN4TFJAuzAvUrdX1T136T0,6032
+spacr-0.3.64.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+spacr-0.3.64.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
+spacr-0.3.64.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
+spacr-0.3.64.dist-info/RECORD,,
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/LICENSE: file without changes
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/WHEEL: file without changes
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/entry_points.txt: file without changes
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/top_level.txt: file without changes