PyPI - spacr - Versions diffs - 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl - Mend

spacr 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

spacr/app_annotate.py +0 -8
spacr/core.py +12 -7
spacr/gui_utils.py +24 -8
spacr/io.py +134 -157
spacr/ml.py +3 -4
spacr/plot.py +82 -23
spacr/settings.py +4 -13
spacr/submodules.py +299 -5
spacr/utils.py +96 -3
{spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/METADATA +1 -1
{spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/RECORD +15 -15
{spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/LICENSE +0 -0
{spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/WHEEL +0 -0
{spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/entry_points.txt +0 -0
{spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/top_level.txt +0 -0

spacr/plot.py CHANGED Viewed

@@ -909,7 +909,7 @@ def plot_merged(src, settings):
         path = os.path.join(src, file)
         stack = np.load(path)
         print(f'Loaded: {path}')
-        if not settings['uninfected']:
+        if settings['pathogen_limit'] > 0:
             if settings['pathogen_mask_dim'] is not None and settings['cell_mask_dim'] is not None:
                 stack = _remove_noninfected(stack, settings['cell_mask_dim'], settings['nucleus_mask_dim'], settings['pathogen_mask_dim'])
@@ -2198,8 +2198,8 @@ def jitterplot_by_annotation(src, x_column, y_column, plot_title='Jitter Plot',
                                     tables,
                                     verbose=True,
                                     nuclei_limit=True,
-                                    pathogen_limit=True,
-                                    uninfected=True)
+                                    pathogen_limit=True)
         paths_df = _read_db(loc, tables=['png_list'])
         merged_df = pd.merge(df, paths_df[0], on='prcfo', how='left')
         return merged_df
@@ -2435,7 +2435,9 @@ class spacrGraph:
         self.df = df
         self.grouping_column = grouping_column
+        self.order = sorted(df[self.grouping_column].unique().tolist())
         self.data_column = data_column if isinstance(data_column, list) else [data_column]
         self.graph_type = graph_type
         self.summary_func = summary_func
         self.order = order
@@ -2909,9 +2911,11 @@ class spacrGraph:
         ax.set_xlim(-0.5, num_groups - 0.5)
         # Set ticks to match the group labels in your DataFrame
-        group_labels = self.df[self.grouping_column].unique()
-        ax.set_xticks(range(len(group_labels)))
-        ax.set_xticklabels(group_labels, rotation=45, ha='right')
+        #group_labels = self.df[self.grouping_column].unique()
+        #group_labels = self.order
+        #ax.set_xticks(range(len(group_labels)))
+        #ax.set_xticklabels(group_labels, rotation=45, ha='right')
+        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
         # Customize elements based on the graph type
         if graph_type == 'bar':
@@ -2943,6 +2947,66 @@ class spacrGraph:
         # Redraw the figure to apply changes
         ax.figure.canvas.draw()
+    def _standerdize_figure_format_v1(self, ax, num_groups, graph_type):
+        """
+        Adjusts the figure layout (size, bar width, jitter, and spacing) based on the number of groups.
+        """
+        if graph_type in ['line', 'line_std']:
+            print("Skipping layout adjustment for line graphs.")
+            return  # Skip layout adjustment for line graphs
+        correction_factor = 4
+        # Set figure size to ensure it remains square with a minimum size
+        fig_size = max(6, num_groups * 2) / correction_factor
+        ax.figure.set_size_inches(fig_size, fig_size)
+        # Configure layout based on the number of groups
+        bar_width = min(0.8, 1.5 / num_groups) / correction_factor
+        jitter_amount = min(0.1, 0.2 / num_groups) / correction_factor
+        jitter_size = max(50 / num_groups, 200)
+        # Adjust x-axis limits to fit the specified order of groups
+        ax.set_xlim(-0.5, len(self.order) - 0.5)  # Use `self.order` length to ensure alignment
+        # Use `self.order` as the x-tick labels to maintain consistent ordering
+        ax.set_xticks(range(len(self.order)))
+        #ax.set_xticklabels(self.order, rotation=45, ha='right')
+        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
+        # Customize elements based on the graph type
+        if graph_type == 'bar':
+            # Adjust bars' width and position
+            for bar in ax.patches:
+                bar.set_width(bar_width)
+                bar.set_x(bar.get_x() - bar_width / 2)
+        elif graph_type in ['jitter', 'jitter_bar', 'jitter_box']:
+            # Adjust jitter points' position and size
+            for coll in ax.collections:
+                offsets = coll.get_offsets()
+                offsets[:, 0] += jitter_amount  # Shift jitter points slightly
+                coll.set_offsets(offsets)
+                coll.set_sizes([jitter_size] * len(offsets))  # Adjust point size dynamically
+        elif graph_type in ['box', 'violin']:
+            # Adjust box width for consistent spacing
+            for artist in ax.artists:
+                artist.set_width(bar_width)
+        # Adjust legend and axis labels
+        ax.tick_params(axis='x', labelsize=max(10, 15 - num_groups // 2))
+        ax.tick_params(axis='y', labelsize=max(10, 15 - num_groups // 2))
+        # Adjust legend placement and size
+        if ax.get_legend():
+            ax.get_legend().set_bbox_to_anchor((1.05, 1))
+            ax.get_legend().prop.set_size(max(8, 12 - num_groups // 3))
+        # Redraw the figure to apply changes
+        ax.figure.canvas.draw()
     def _create_bar_plot(self, ax):
         """Helper method to create a bar plot with consistent bar thickness and centered error bars."""
@@ -2959,7 +3023,7 @@ class spacrGraph:
         summary_df = self.df_melted.groupby([x_axis_column]).agg(mean=('Value', 'mean'),std=('Value', 'std'),sem=('Value', 'sem')).reset_index()
         error_bars = summary_df[self.error_bar_type] if self.error_bar_type in ['std', 'sem'] else None
-        sns.barplot(data=self.df_melted, x=x_axis_column, y='Value', hue=self.hue, palette=self.sns_palette, ax=ax, dodge=self.jitter_bar_dodge, ci=None)
+        sns.barplot(data=self.df_melted, x=x_axis_column, y='Value', hue=self.hue, palette=self.sns_palette, ax=ax, dodge=self.jitter_bar_dodge, ci=None, order=self.order)
         # Adjust the bar width manually
         if len(self.data_column) > 1:
@@ -2999,7 +3063,7 @@ class spacrGraph:
             hue = None
         # Create the jitter plot
-        sns.stripplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue, palette=self.sns_palette, dodge=self.jitter_bar_dodge, jitter=self.bar_width, ax=ax, alpha=0.6, size=16)
+        sns.stripplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue, palette=self.sns_palette, dodge=self.jitter_bar_dodge, jitter=self.bar_width, ax=ax, alpha=0.6, size=16, order=self.order)
         # Adjust legend and labels
         ax.set_xlabel(self.grouping_column)
@@ -3088,7 +3152,7 @@ class spacrGraph:
             hue = None
         # Create the box plot
-        sns.boxplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue,palette=self.sns_palette,ax=ax)
+        sns.boxplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue,palette=self.sns_palette,ax=ax, order=self.order)
         # Adjust legend and labels
         ax.set_xlabel(self.grouping_column)
@@ -3117,7 +3181,7 @@ class spacrGraph:
             hue = None
         # Create the violin plot
-        sns.violinplot(data=self.df_melted,x=x_axis_column,y='Value', hue=self.hue,palette=self.sns_palette,ax=ax)
+        sns.violinplot(data=self.df_melted,x=x_axis_column,y='Value', hue=self.hue,palette=self.sns_palette,ax=ax, order=self.order)
         # Adjust legend and labels
         ax.set_xlabel(self.grouping_column)
@@ -3148,8 +3212,8 @@ class spacrGraph:
         summary_df = self.df_melted.groupby([x_axis_column]).agg(mean=('Value', 'mean'),std=('Value', 'std'),sem=('Value', 'sem')).reset_index()
         error_bars = summary_df[self.error_bar_type] if self.error_bar_type in ['std', 'sem'] else None
-        sns.barplot(data=self.df_melted, x=x_axis_column, y='Value', hue=self.hue, palette=self.sns_palette, ax=ax, dodge=self.jitter_bar_dodge, ci=None)
-        sns.stripplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue, palette=self.sns_palette, dodge=self.jitter_bar_dodge, jitter=self.bar_width, ax=ax,alpha=0.6, edgecolor='white',linewidth=1, size=16)
+        sns.barplot(data=self.df_melted, x=x_axis_column, y='Value', hue=self.hue, palette=self.sns_palette, ax=ax, dodge=self.jitter_bar_dodge, ci=None, order=self.order)
+        sns.stripplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue, palette=self.sns_palette, dodge=self.jitter_bar_dodge, jitter=self.bar_width, ax=ax,alpha=0.6, edgecolor='white',linewidth=1, size=16, order=self.order)
         # Adjust the bar width manually
         if len(self.data_column) > 1:
@@ -3189,8 +3253,8 @@ class spacrGraph:
             hue = None
         # Create the box plot
-        sns.boxplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue,palette=self.sns_palette,ax=ax)
-        sns.stripplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue, palette=self.sns_palette, dodge=self.jitter_bar_dodge, jitter=self.bar_width, ax=ax,alpha=0.6, edgecolor='white',linewidth=1, size=12)
+        sns.boxplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue,palette=self.sns_palette,ax=ax, order=self.order)
+        sns.stripplot(data=self.df_melted,x=x_axis_column,y='Value',hue=self.hue, palette=self.sns_palette, dodge=self.jitter_bar_dodge, jitter=self.bar_width, ax=ax,alpha=0.6, edgecolor='white',linewidth=1, size=12, order=self.order)
         # Adjust legend and labels
         ax.set_xlabel(self.grouping_column)
@@ -3264,12 +3328,11 @@ def plot_data_from_db(settings):
             [df1] = _read_db(db_loc, tables=[settings['table_names']])
         else:
             df1, _ = _read_and_merge_data(locs=[db_loc],
-                                    tables = ['cell', 'nucleus', 'pathogen','cytoplasm'],
+                                    tables = settings['tables'],
                                     verbose=settings['verbose'],
                                     nuclei_limit=settings['nuclei_limit'],
-                                    pathogen_limit=settings['pathogen_limit'],
-                                    uninfected=settings['uninfected'])
+                                    pathogen_limit=settings['pathogen_limit'])
         dft = annotate_conditions(df1,
                                 cells=settings['cell_types'],
                                 cell_loc=settings['cell_plate_metadata'],
@@ -3281,10 +3344,7 @@ def plot_data_from_db(settings):
     df = pd.concat(dfs, axis=0)
     df['prc'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str)
-    #df['recruitment'] = df['pathogen_channel_1_mean_intensity'] / df['cytoplasm_channel_1_mean_intensity']
-    #df['recruitment'] = df['pathogen_channel_1_mean_intensity'] / df['cytoplasm_channel_1_mean_intensity']
-    df['class'] = df['png_path'].apply(lambda x: 'class_1' if 'class_1' in x else ('class_0' if 'class_0' in x else None))
     if settings['cell_plate_metadata'] !=  None:
         df = df.dropna(subset='host_cell')
@@ -3297,7 +3357,6 @@ def plot_data_from_db(settings):
     df = df.dropna(subset=settings['data_column'])
     df = df.dropna(subset=settings['grouping_column'])
     src = srcs[0]
     dst = os.path.join(src, 'results', settings['graph_name'])
     os.makedirs(dst, exist_ok=True)

spacr/settings.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os, ast
 def set_default_plot_merge_settings():
     settings = {}
-    settings.setdefault('uninfected', True)
     settings.setdefault('pathogen_limit', 10)
     settings.setdefault('nuclei_limit', 1)
     settings.setdefault('remove_background', False)
@@ -181,8 +180,8 @@ def set_default_umap_image_settings(settings={}):
     settings.setdefault('n_neighbors', 1000)
     settings.setdefault('min_dist', 0.1)
     settings.setdefault('metric', 'euclidean')
-    settings.setdefault('eps', 0.5)
-    settings.setdefault('min_samples', 1000)
+    settings.setdefault('eps', 0.9)
+    settings.setdefault('min_samples', 100)
     settings.setdefault('filter_by', 'channel_0')
     settings.setdefault('img_zoom', 0.5)
     settings.setdefault('plot_by_cluster', True)
@@ -201,16 +200,13 @@ def set_default_umap_image_settings(settings={}):
     settings.setdefault('col_to_compare', 'column_name')
     settings.setdefault('pos', 'c1')
     settings.setdefault('neg', 'c2')
+    settings.setdefault('mix', 'c3')
     settings.setdefault('embedding_by_controls', False)
     settings.setdefault('plot_images', True)
     settings.setdefault('reduction_method','umap')
     settings.setdefault('save_figure', False)
     settings.setdefault('n_jobs', -1)
     settings.setdefault('color_by', None)
-    settings.setdefault('neg', 'c1')
-    settings.setdefault('pos', 'c2')
-    settings.setdefault('mix', 'c3')
-    settings.setdefault('mix', 'c3')
     settings.setdefault('exclude_conditions', None)
     settings.setdefault('analyze_clusters', False)
     settings.setdefault('resnet_features', False)
@@ -295,7 +291,6 @@ def set_default_analyze_screen(settings):
     settings.setdefault('exclude',None)
     settings.setdefault('nuclei_limit',True)
     settings.setdefault('pathogen_limit',3)
-    settings.setdefault('uninfected',True)
     settings.setdefault('n_repeats',10)
     settings.setdefault('top_features',30)
     settings.setdefault('remove_low_variance_features',True)
@@ -353,7 +348,6 @@ def set_generate_training_dataset_defaults(settings):
     settings.setdefault('tables',None)
     settings.setdefault('nuclei_limit',True)
     settings.setdefault('pathogen_limit',True)
-    settings.setdefault('uninfected',True)
     settings.setdefault('png_type','cell_png')
     return settings
@@ -467,7 +461,6 @@ def get_analyze_recruitment_default_settings(settings):
     settings.setdefault('plot_nr',3)
     settings.setdefault('plot_control',True)
     settings.setdefault('figuresize',10)
-    settings.setdefault('uninfected',True)
     settings.setdefault('pathogen_limit',10)
     settings.setdefault('nuclei_limit',1)
     settings.setdefault('cells_per_well',0)
@@ -691,7 +684,6 @@ expected_types = {
     "measurement": str,
     "nr_imgs": int,
     "um_per_pixel": (int, float),
-    "uninfected": bool,
     "pathogen_limit": int,
     "nuclei_limit": int,
     "filter_min_max": (list, type(None)),
@@ -898,7 +890,7 @@ categories = {"Paths":[ "src", "grna", "barcodes", "custom_model_path", "dataset
              "Plot": ["plot", "plot_control", "plot_nr", "examples_to_plot", "normalize_plots", "cmap", "figuresize", "plot_cluster_grids", "img_zoom", "row_limit", "color_by", "plot_images", "smooth_lines", "plot_points", "plot_outlines", "black_background", "plot_by_cluster", "heatmap_feature","grouping","min_max","cmap","save_figure"],
              "Test": ["test_mode", "test_images", "random_test", "test_nr", "test", "test_split"],
              "Timelapse": ["timelapse", "fps", "timelapse_displacement", "timelapse_memory", "timelapse_frame_limits", "timelapse_remove_transient", "timelapse_mode", "timelapse_objects", "compartments"],
-             "Advanced": ["shuffle", "target_intensity_min", "cells_per_well", "nuclei_limit", "pathogen_limit", "uninfected", "background", "backgrounds", "schedule", "test_size","exclude","n_repeats","top_features", "model_type_ml", "model_type","minimum_cell_count","n_estimators","preprocess", "remove_background", "normalize", "lower_percentile", "merge_pathogens", "batch_size", "filter", "save", "masks", "verbose", "randomize", "n_jobs"],
+             "Advanced": ["shuffle", "target_intensity_min", "cells_per_well", "nuclei_limit", "pathogen_limit", "background", "backgrounds", "schedule", "test_size","exclude","n_repeats","top_features", "model_type_ml", "model_type","minimum_cell_count","n_estimators","preprocess", "remove_background", "normalize", "lower_percentile", "merge_pathogens", "batch_size", "filter", "save", "masks", "verbose", "randomize", "n_jobs"],
              "Miscellaneous": ["all_to_mip", "pick_slice", "skip_mode", "upscale", "upscale_factor"]
              }
@@ -1080,7 +1072,6 @@ def generate_fields(variables, scrollable_frame):
         "img_zoom": "(float) - Zoom factor for the images in plots.",
         "nuclei_limit": "(int) - Whether to include multinucleated cells in the analysis.",
         "pathogen_limit": "(int) - Whether to include multi-infected cells in the analysis.",
-        "uninfected": "(bool) - Whether to include non-infected cells in the analysis.",
         "uninfected": "(bool) - Whether to include uninfected cells in the analysis.",
         "init_weights": "(bool) - Whether to initialize weights for the model.",
         "src": "(str) - Path to the folder containing the images.",

spacr/submodules.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import seaborn as sns
-import os, random, sqlite3
+import os, random, sqlite3, re, shap
 import pandas as pd
 import numpy as np
 import cellpose
@@ -7,6 +7,9 @@ from skimage.measure import regionprops, label
 from cellpose import models as cp_models
 from cellpose import train as train_cp
 from IPython.display import display
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.inspection import permutation_importance
+from math import pi
 import matplotlib.pyplot as plt
 from natsort import natsorted
@@ -43,9 +46,8 @@ def analyze_recruitment(settings={}):
                                  tables=['cell', 'nucleus', 'pathogen','cytoplasm'],
                                  verbose=True,
                                  nuclei_limit=settings['nuclei_limit'],
-                                 pathogen_limit=settings['pathogen_limit'],
-                                 uninfected=settings['uninfected'])
+                                 pathogen_limit=settings['pathogen_limit'])
     df = annotate_conditions(df,
                              cells=settings['cell_types'],
                              cell_loc=settings['cell_plate_metadata'],
@@ -550,4 +552,296 @@ def compare_reads_to_scores(reads_csv, scores_csv, empirical_dict={'r1':(90,10),
     fig_1 = plot_line(df, x_column = 'pc_fraction', y_columns=y_columns, group_column=None, xlabel=None, ylabel='Fraction', title=None, figsize=(10, 6), save_path=save_paths[0])
     fig_2 = plot_line(df, x_column = 'nc_fraction', y_columns=y_columns, group_column=None, xlabel=None, ylabel='Fraction', title=None, figsize=(10, 6), save_path=save_paths[1])
-    return [fig_1, fig_2]
+    return [fig_1, fig_2]
+def interperate_vision_model(settings={}):
+    from .io import _read_and_merge_data
+    def generate_comparison_columns(df, compartments=['cell', 'nucleus', 'pathogen', 'cytoplasm']):
+        comparison_dict = {}
+        # Get columns by compartment
+        compartment_columns = {comp: [col for col in df.columns if col.startswith(comp)] for comp in compartments}
+        for comp0, comp0_columns in compartment_columns.items():
+            for comp0_col in comp0_columns:
+                related_cols = []
+                base_col_name = comp0_col.replace(comp0, '')  # Base feature name without compartment prefix
+                # Look for matching columns in other compartments
+                for prefix, prefix_columns in compartment_columns.items():
+                    if prefix == comp0:  # Skip same-compartment comparisons
+                        continue
+                    # Check if related column exists in other compartment
+                    related_col = prefix + base_col_name
+                    if related_col in df.columns:
+                        related_cols.append(related_col)
+                        new_col_name = f"{prefix}_{comp0}{base_col_name}"  # Format: prefix_comp0_base
+                        # Calculate ratio and handle infinite or NaN values
+                        df[new_col_name] = df[related_col] / df[comp0_col]
+                        df[new_col_name].replace([float('inf'), -float('inf')], pd.NA, inplace=True)  # Replace inf values with NA
+                        df[new_col_name].fillna(0, inplace=True)  # Replace NaN values with 0 for ease of further calculations
+                # Generate all-to-all comparisons
+                if related_cols:
+                    comparison_dict[comp0_col] = related_cols
+                    for i, rel_col_1 in enumerate(related_cols):
+                        for rel_col_2 in related_cols[i + 1:]:
+                            # Create a new column name for each pairwise comparison
+                            comp1, comp2 = rel_col_1.split('_')[0], rel_col_2.split('_')[0]
+                            new_col_name_all = f"{comp1}_{comp2}{base_col_name}"
+                            # Calculate pairwise ratio and handle infinite or NaN values
+                            df[new_col_name_all] = df[rel_col_1] / df[rel_col_2]
+                            df[new_col_name_all].replace([float('inf'), -float('inf')], pd.NA, inplace=True)  # Replace inf with NA
+                            df[new_col_name_all].fillna(0, inplace=True)  # Replace NaN with 0
+        return df, comparison_dict
+    def group_feature_class(df, feature_groups=['cell', 'cytoplasm', 'nucleus', 'pathogen'], name='compartment', include_all=False):
+        # Function to determine compartment based on multiple matches
+        def find_feature_class(feature, compartments):
+            matches = [compartment for compartment in compartments if re.search(compartment, feature)]
+            if len(matches) > 1:
+                return '-'.join(matches)
+            elif matches:
+                return matches[0]
+            else:
+                return None
+        from spacr.plot import spacrGraph
+        df[name] = df['feature'].apply(lambda x: find_feature_class(x, feature_groups))
+        if name == 'channel':
+            df['channel'].fillna('morphology', inplace=True)
+        # Create new DataFrame with summed importance for each compartment and channel
+        importance_sum = df.groupby(name)['importance'].sum().reset_index(name=f'{name}_importance_sum')
+        if include_all:
+            total_compartment_importance = importance_sum[f'{name}_importance_sum'].sum()
+            importance_sum = pd.concat(
+                [importance_sum,
+                 pd.DataFrame(
+                     [{name: 'all', f'{name}_importance_sum': total_compartment_importance}])]
+                , ignore_index=True)
+        return importance_sum
+    # Function to create radar plot for individual and combined values
+    def create_extended_radar_plot(values, labels, title):
+        values = list(values) + [values[0]]  # Close the loop for radar chart
+        angles = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
+        angles += angles[:1]
+        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
+        ax.plot(angles, values, linewidth=2, linestyle='solid')
+        ax.fill(angles, values, alpha=0.25)
+        ax.set_xticks(angles[:-1])
+        ax.set_xticklabels(labels, fontsize=10, rotation=45, ha='right')
+        plt.title(title, pad=20)
+        plt.show()
+    def extract_compartment_channel(feature_name):
+        # Identify compartment as the first part before an underscore
+        compartment = feature_name.split('_')[0]
+        if compartment == 'cells':
+            compartment = 'cell'
+        # Identify channels based on substring presence
+        channels = []
+        if 'channel_0' in feature_name:
+            channels.append('channel_0')
+        if 'channel_1' in feature_name:
+            channels.append('channel_1')
+        if 'channel_2' in feature_name:
+            channels.append('channel_2')
+        if 'channel_3' in feature_name:
+            channels.append('channel_3')
+        # If multiple channels are found, join them with a '+'
+        if channels:
+            channel = ' + '.join(channels)
+        else:
+            channel = 'morphology'  # Use 'morphology' if no channel identifier is found
+        return (compartment, channel)
+    def read_and_preprocess_data(settings):
+        df, _ = _read_and_merge_data(
+            locs=[settings['src']+'/measurements/measurements.db'],
+            tables=settings['tables'],
+            verbose=True,
+            nuclei_limit=settings['nuclei_limit'],
+            pathogen_limit=settings['pathogen_limit']
+        )
+        df, _dict = generate_comparison_columns(df, compartments=['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+        print(f"Expanded dataframe to {len(df.columns)} columns with relative features")
+        scores_df = pd.read_csv(settings['scores'])
+        # Clean and align columns for merging
+        df['object_label'] = df['object_label'].str.replace('o', '')
+        if 'row_name' not in scores_df.columns:
+            scores_df['row_name'] = scores_df['row']
+        if 'column_name' not in scores_df.columns:
+            scores_df['column_name'] = scores_df['col']
+        if 'object_label' not in scores_df.columns:
+            scores_df['object_label'] = scores_df['object']
+        # Remove the 'o' prefix from 'object_label' in df, ensuring it is a string type
+        df['object_label'] = df['object_label'].str.replace('o', '').astype(str)
+        # Ensure 'object_label' in scores_df is also a string
+        scores_df['object_label'] = scores_df['object'].astype(str)
+        # Ensure all join columns have the same data type in both DataFrames
+        df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+        scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+        # Select only the necessary columns from scores_df for merging
+        scores_df = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label', settings['score_column']]]
+        # Now merge DataFrames
+        merged_df = pd.merge(df, scores_df, on=['plate', 'row_name', 'column_name', 'field', 'object_label'], how='inner')
+        # Separate numerical features and the score column
+        X = merged_df.select_dtypes(include='number').drop(columns=[settings['score_column']])
+        y = merged_df[settings['score_column']]
+        return X, y, merged_df
+    X, y, merged_df = read_and_preprocess_data(settings)
+    output = {}
+    # Step 1: Feature Importance using Random Forest
+    if settings['feature_importance'] or settings['feature_importance']:
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X, y)
+        if settings['feature_importance']:
+            print(f"Feature Importance ...")
+            feature_importances = model.feature_importances_
+            feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
+            feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
+            top_feature_importance_df = feature_importance_df.head(settings['top_features'])
+            # Plot Feature Importance
+            plt.figure(figsize=(10, 6))
+            plt.barh(top_feature_importance_df['feature'], top_feature_importance_df['importance'])
+            plt.xlabel('Importance')
+            plt.title(f"Top {settings['top_features']} Features - Feature Importance")
+            plt.gca().invert_yaxis()
+            plt.show()
+        output['feature_importance'] = feature_importance_df
+        fi_compartment_df = group_feature_class(feature_importance_df, feature_groups=settings['tables'], name='compartment', include_all=settings['include_all'])
+        fi_channel_df = group_feature_class(feature_importance_df, feature_groups=settings['channels'], name='channel', include_all=settings['include_all'])
+        output['feature_importance_compartment'] = fi_compartment_df
+        output['feature_importance_channel'] = fi_channel_df
+    # Step 2: Permutation Importance
+    if settings['permutation_importance']:
+        print(f"Permutation Importance ...")
+        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=settings['n_jobs'])
+        perm_importance_df = pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
+        perm_importance_df = perm_importance_df.sort_values(by='importance', ascending=False)
+        top_perm_importance_df = perm_importance_df.head(settings['top_features'])
+        # Plot Permutation Importance
+        plt.figure(figsize=(10, 6))
+        plt.barh(top_perm_importance_df['feature'], top_perm_importance_df['importance'])
+        plt.xlabel('Importance')
+        plt.title(f"Top {settings['top_features']} Features - Permutation Importance")
+        plt.gca().invert_yaxis()
+        plt.show()
+        output['permutation_importance'] = perm_importance_df
+    # Step 3: SHAP Analysis
+    if settings['shap']:
+        print(f"SHAP Analysis ...")
+        # Select top N features based on Random Forest importance and fit the model on these features only
+        top_features = feature_importance_df.head(settings['top_features'])['feature']
+        X_top = X[top_features]
+        # Refit the model on this subset of features
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X_top, y)
+        # Sample a smaller subset of rows to speed up SHAP
+        if settings['shap_sample']:
+            sample = int(len(X_top) / 100)
+            X_sample = X_top.sample(min(sample, len(X_top)), random_state=42)
+        else:
+            X_sample = X_top
+        # Initialize SHAP explainer with the same subset of features
+        explainer = shap.Explainer(model.predict, X_sample)
+        shap_values = explainer(X_sample, max_evals=1500)
+        # Plot SHAP summary for the selected sample and top features
+        shap.summary_plot(shap_values, X_sample, max_display=settings['top_features'])
+        # Convert SHAP values to a DataFrame for easier manipulation
+        shap_df = pd.DataFrame(shap_values.values, columns=X_sample.columns)
+        # Apply the function to create MultiIndex columns with compartment and channel
+        shap_df.columns = pd.MultiIndex.from_tuples(
+            [extract_compartment_channel(feat) for feat in shap_df.columns],
+            names=['compartment', 'channel']
+        )
+        # Aggregate SHAP values by compartment and channel
+        compartment_mean = shap_df.abs().groupby(level='compartment', axis=1).mean().mean(axis=0)
+        channel_mean = shap_df.abs().groupby(level='channel', axis=1).mean().mean(axis=0)
+        # Calculate combined importance for each pair of compartments and channels
+        combined_compartment = {}
+        for i, comp1 in enumerate(compartment_mean.index):
+            for comp2 in compartment_mean.index[i+1:]:
+                combined_compartment[f"{comp1} + {comp2}"] = shap_df.loc[:, (comp1, slice(None))].abs().mean().mean() + \
+                                                              shap_df.loc[:, (comp2, slice(None))].abs().mean().mean()
+        combined_channel = {}
+        for i, chan1 in enumerate(channel_mean.index):
+            for chan2 in channel_mean.index[i+1:]:
+                combined_channel[f"{chan1} + {chan2}"] = shap_df.loc[:, (slice(None), chan1)].abs().mean().mean() + \
+                                                          shap_df.loc[:, (slice(None), chan2)].abs().mean().mean()
+        # Prepare values and labels for radar charts
+        all_compartment_importance = list(compartment_mean.values) + list(combined_compartment.values())
+        all_compartment_labels = list(compartment_mean.index) + list(combined_compartment.keys())
+        all_channel_importance = list(channel_mean.values) + list(combined_channel.values())
+        all_channel_labels = list(channel_mean.index) + list(combined_channel.keys())
+        # Create radar plots for compartments and channels
+        #create_extended_radar_plot(all_compartment_importance, all_compartment_labels, "SHAP Importance by Compartment (Individual and Combined)")
+        #create_extended_radar_plot(all_channel_importance, all_channel_labels, "SHAP Importance by Channel (Individual and Combined)")
+        output['shap'] = shap_df
+    if settings['save']:
+        dst = os.path.join(settings['src'], 'results')
+        os.makedirs(dst, exist_ok=True)
+        for key, df in output.items():
+            save_path = os.path.join(dst, f"{key}.csv")
+            df.to_csv(save_path)
+            print(f"Saved {save_path}")
+    return output

spacr 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl

spacr 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl