PyPI - spacr - Versions diffs - 0.3.61__py3-none-any.whl → 0.3.64__py3-none-any.whl - Mend

spacr 0.3.61py3-none-any.whl → 0.3.64py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

spacr/io.py +133 -3
spacr/ml.py +205 -0
spacr/plot.py +48 -0
spacr/settings.py +64 -0
spacr/submodules.py +298 -1
spacr/utils.py +58 -2
{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/METADATA +1 -1
{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/RECORD +12 -12
{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/LICENSE +0 -0
{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/WHEEL +0 -0
{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/entry_points.txt +0 -0
{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/top_level.txt +0 -0

spacr/io.py CHANGED Viewed

@@ -1777,7 +1777,7 @@ def _read_and_join_tables(db_path, table_names=['cell', 'cytoplasm', 'nucleus',
         png_list_df['cell_id'] = png_list_df['cell_id'].str[1:].astype(int)
         png_list_df.rename(columns={'cell_id': 'object_label'}, inplace=True)
         if 'cell' in dataframes:
-            join_cols = ['object_label', 'plate', 'row_name', 'column_name']
+            join_cols = ['object_label', 'plate', 'row_name', 'column_name','field']
             dataframes['cell'] = pd.merge(dataframes['cell'], png_list_df, on=join_cols, how='left')
         else:
             print("Cell table not found in database tables.")
@@ -2276,7 +2276,7 @@ def _read_db(db_loc, tables):
     conn.close() # Close the connection
     return dfs
-def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False):
+def _read_and_merge_data_v1(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False):
     from .utils import _split_data
@@ -2443,7 +2443,137 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathog
     if 'pathogen' in tables:
         obj_df_ls.append(pathogens)
-    return merged_df, obj_df_ls
+    return merged_df, obj_df_ls
+def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10):
+    from .io import _read_db
+    from .utils import _split_data
+    # Initialize an empty dictionary to store DataFrames by table name
+    data_dict = {table: [] for table in tables}
+    # Extract plate DataFrames
+    for loc in locs:
+        db_dfs = _read_db(loc, tables)
+        for table, df in zip(tables, db_dfs):
+            data_dict[table].append(df)
+    # Concatenate rows across locations for each table
+    for table, dfs in data_dict.items():
+        if dfs:
+            data_dict[table] = pd.concat(dfs, axis=0)
+        if verbose:
+            print(f"{table}: {len(data_dict[table])}")
+    # Initialize merged DataFrame with 'cells' if available
+    merged_df = pd.DataFrame()
+    # Process each table
+    if 'cell' in data_dict:
+        cells = data_dict['cell'].copy()
+        cells = cells.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
+        cells = cells.assign(prcfo=lambda x: x['prcf'] + '_' + x['object_label'])
+        cells_g_df, metadata = _split_data(cells, 'prcfo', 'object_label')
+        merged_df = cells_g_df.copy()
+        if verbose:
+            print(f'cells: {len(cells)}, cells grouped: {len(cells_g_df)}')
+    if 'cytoplasm' in data_dict:
+        cytoplasms = data_dict['cytoplasm'].copy()
+        cytoplasms = cytoplasms.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
+        cytoplasms = cytoplasms.assign(prcfo=lambda x: x['prcf'] + '_' + x['object_label'])
+        if not 'cell' in data_dict:
+            merged_df, metadata = _split_data(cytoplasms, 'prcfo', 'object_label')
+            if verbose:
+                print(f'nucleus: {len(cytoplasms)}, cytoplasms grouped: {len(merged_df)}')
+        else:
+            cytoplasms_g_df, _ = _split_data(cytoplasms, 'prcfo', 'object_label')
+            merged_df = merged_df.merge(cytoplasms_g_df, left_index=True, right_index=True)
+            if verbose:
+                print(f'cytoplasms: {len(cytoplasms)}, cytoplasms grouped: {len(cytoplasms_g_df)}')
+    if 'nucleus' in data_dict:
+        nucleus = data_dict['nucleus'].copy()
+        nucleus = nucleus.dropna(subset=['cell_id'])
+        nucleus = nucleus.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
+        nucleus = nucleus.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
+        nucleus = nucleus.assign(prcfo=lambda x: x['prcf'] + '_' + x['cell_id'])
+        nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
+        if not nuclei_limit:
+            nucleus = nucleus[nucleus['nucleus_prcfo_count'] == 1]
+        if all(key not in data_dict for key in ['cell', 'cytoplasm']):
+            merged_df, metadata = _split_data(nucleus, 'prcfo', 'cell_id')
+            if verbose:
+                print(f'nucleus: {len(nucleus)}, nucleus grouped: {len(merged_df)}')
+        else:
+            nucleus_g_df, _ = _split_data(nucleus, 'prcfo', 'cell_id')
+            merged_df = merged_df.merge(nucleus_g_df, left_index=True, right_index=True)
+            if verbose:
+                print(f'nucleus: {len(nucleus)}, nucleus grouped: {len(nucleus_g_df)}')
+    if 'pathogen' in data_dict:
+        pathogens = data_dict['pathogen'].copy()
+        pathogens = pathogens.dropna(subset=['cell_id'])
+        pathogens = pathogens.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
+        pathogens = pathogens.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
+        pathogens = pathogens.assign(prcfo=lambda x: x['prcf'] + '_' + x['cell_id'])
+        pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
+        if isinstance(pathogen_limit, bool) and not pathogen_limit:
+            pathogens = pathogens[pathogens['pathogen_prcfo_count'] <= 1]
+        elif isinstance(pathogen_limit, (float, int)):
+            pathogens = pathogens[pathogens['pathogen_prcfo_count'] <= int(pathogen_limit)]
+        if all(key not in data_dict for key in ['cell', 'cytoplasm', 'nucleus']):
+            merged_df, metadata = _split_data(pathogens, 'prcfo', 'cell_id')
+            if verbose:
+                print(f'pathogens: {len(pathogens)}, pathogens grouped: {len(merged_df)}')
+        else:
+            pathogens_g_df, _ = _split_data(pathogens, 'prcfo', 'cell_id')
+            merged_df = merged_df.merge(pathogens_g_df, left_index=True, right_index=True)
+            if verbose:
+                print(f'pathogens: {len(pathogens)}, pathogens grouped: {len(pathogens_g_df)}')
+    if 'png_list' in data_dict:
+        png_list = data_dict['png_list'].copy()
+        png_list_g_df_numeric, png_list_g_df_non_numeric = _split_data(png_list, 'prcfo', 'cell_id')
+        png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
+        if verbose:
+            print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
+            print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
+        merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
+        merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)
+    # Add prc (plate row column) and prcfo (plate row column field object) columns
+    metadata = metadata.assign(prc=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'])
+    cells_well = metadata.groupby('prc')['object_label'].nunique().reset_index(name='cells_per_well')
+    metadata = metadata.merge(cells_well, on='prc')
+    metadata = metadata.assign(prcfo=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'] + '_' + x['field'] + '_' + x['object_label'])
+    metadata.set_index('prcfo', inplace=True)
+    # Merge metadata with final merged DataFrame
+    #merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
+    merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
+    merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)
+    if verbose:
+        print(f'Generated dataframe with: {len(merged_df.columns)} columns and {len(merged_df)} rows')
+    # Prepare object DataFrames for output
+    obj_df_ls = [data_dict[table] for table in ['cell', 'cytoplasm', 'nucleus', 'pathogen'] if table in data_dict]
+    return merged_df, obj_df_ls
 def _read_mask(mask_path):
     mask = imageio2.imread(mask_path)

spacr/ml.py CHANGED Viewed

@@ -3,6 +3,7 @@ import pandas as pd
 import numpy as np
 from scipy import stats
 from scipy.stats import shapiro
+from math import pi
 from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
 from sklearn.metrics import mean_squared_error
@@ -1515,3 +1516,207 @@ def _calculate_similarity(df, features, col_to_compare, val1, val2):
     return df
+def interperate_vision_model(settings={}):
+    from .io import _read_and_merge_data, _results_to_csv
+    from .settings import set_interperate_vision_model_defaults
+    from .utils import save_settings
+    settings = set_interperate_vision_model_defaults(settings)
+    save_settings(settings, name='interperate_vision_model', show=True)
+    # Function to create radar plot for individual and combined values
+    def create_extended_radar_plot(values, labels, title):
+        values = list(values) + [values[0]]  # Close the loop for radar chart
+        angles = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
+        angles += angles[:1]
+        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
+        ax.plot(angles, values, linewidth=2, linestyle='solid')
+        ax.fill(angles, values, alpha=0.25)
+        ax.set_xticks(angles[:-1])
+        ax.set_xticklabels(labels, fontsize=10, rotation=45, ha='right')
+        plt.title(title, pad=20)
+        plt.show()
+    def extract_compartment_channel(feature_name):
+        # Identify compartment as the first part before an underscore
+        compartment = feature_name.split('_')[0]
+        if compartment == 'cells':
+            compartment = 'cell'
+        # Identify channels based on substring presence
+        channels = []
+        if 'channel_0' in feature_name:
+            channels.append('channel_0')
+        if 'channel_1' in feature_name:
+            channels.append('channel_1')
+        if 'channel_2' in feature_name:
+            channels.append('channel_2')
+        if 'channel_3' in feature_name:
+            channels.append('channel_3')
+        # If multiple channels are found, join them with a '+'
+        if channels:
+            channel = ' + '.join(channels)
+        else:
+            channel = 'morphology'  # Use 'morphology' if no channel identifier is found
+        return (compartment, channel)
+    def read_and_preprocess_data(settings):
+        df, _ = _read_and_merge_data(
+            locs=[settings['src']+'/measurements/measurements.db'],
+            tables=settings['tables'],
+            verbose=True,
+            nuclei_limit=settings['nuclei_limit'],
+            pathogen_limit=settings['pathogen_limit']
+        )
+        scores_df = pd.read_csv(settings['scores'])
+        # Clean and align columns for merging
+        df['object_label'] = df['object_label'].str.replace('o', '')
+        if 'row_name' not in scores_df.columns:
+            scores_df['row_name'] = scores_df['row']
+        if 'column_name' not in scores_df.columns:
+            scores_df['column_name'] = scores_df['col']
+        if 'object_label' not in scores_df.columns:
+            scores_df['object_label'] = scores_df['object']
+        # Remove the 'o' prefix from 'object_label' in df, ensuring it is a string type
+        df['object_label'] = df['object_label'].str.replace('o', '').astype(str)
+        # Ensure 'object_label' in scores_df is also a string
+        scores_df['object_label'] = scores_df['object'].astype(str)
+        # Ensure all join columns have the same data type in both DataFrames
+        df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+        scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+        # Select only the necessary columns from scores_df for merging
+        scores_df = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label', settings['score_column']]]
+        # Now merge DataFrames
+        merged_df = pd.merge(df, scores_df, on=['plate', 'row_name', 'column_name', 'field', 'object_label'], how='inner')
+        # Separate numerical features and the score column
+        X = merged_df.select_dtypes(include='number').drop(columns=[settings['score_column']])
+        y = merged_df[settings['score_column']]
+        return X, y, merged_df
+    X, y, merged_df = read_and_preprocess_data(settings)
+    # Step 1: Feature Importance using Random Forest
+    if settings['feature_importance'] or settings['feature_importance']:
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X, y)
+        if settings['feature_importance']:
+            print(f"Feature Importance ...")
+            feature_importances = model.feature_importances_
+            feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
+            feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
+            top_feature_importance_df = feature_importance_df.head(settings['top_features'])
+            # Plot Feature Importance
+            plt.figure(figsize=(10, 6))
+            plt.barh(top_feature_importance_df['feature'], top_feature_importance_df['importance'])
+            plt.xlabel('Importance')
+            plt.title(f"Top {settings['top_features']} Features - Feature Importance")
+            plt.gca().invert_yaxis()
+            plt.show()
+        if settings['save']:
+            _results_to_csv(feature_importance_df, filename='feature_importance.csv')
+    # Step 2: Permutation Importance
+    if settings['permutation_importance']:
+        print(f"Permutation Importance ...")
+        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=settings['n_jobs'])
+        perm_importance_df = pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
+        perm_importance_df = perm_importance_df.sort_values(by='importance', ascending=False)
+        top_perm_importance_df = perm_importance_df.head(settings['top_features'])
+        # Plot Permutation Importance
+        plt.figure(figsize=(10, 6))
+        plt.barh(top_perm_importance_df['feature'], top_perm_importance_df['importance'])
+        plt.xlabel('Importance')
+        plt.title(f"Top {settings['top_features']} Features - Permutation Importance")
+        plt.gca().invert_yaxis()
+        plt.show()
+        if settings['save']:
+            _results_to_csv(perm_importance_df, filename='permutation_importance.csv')
+    # Step 3: SHAP Analysis
+    if settings['shap']:
+        print(f"SHAP Analysis ...")
+        # Select top N features based on Random Forest importance and fit the model on these features only
+        top_features = feature_importance_df.head(settings['top_features'])['feature']
+        X_top = X[top_features]
+        # Refit the model on this subset of features
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X_top, y)
+        # Sample a smaller subset of rows to speed up SHAP
+        if settings['shap_sample']:
+            sample = int(len(X_top) / 100)
+            X_sample = X_top.sample(min(sample, len(X_top)), random_state=42)
+        else:
+            X_sample = X_top
+        # Initialize SHAP explainer with the same subset of features
+        explainer = shap.Explainer(model.predict, X_sample)
+        shap_values = explainer(X_sample, max_evals=1500)
+        # Plot SHAP summary for the selected sample and top features
+        shap.summary_plot(shap_values, X_sample, max_display=settings['top_features'])
+        # Convert SHAP values to a DataFrame for easier manipulation
+        shap_df = pd.DataFrame(shap_values.values, columns=X_sample.columns)
+        # Apply the function to create MultiIndex columns with compartment and channel
+        shap_df.columns = pd.MultiIndex.from_tuples(
+            [extract_compartment_channel(feat) for feat in shap_df.columns],
+            names=['compartment', 'channel']
+        )
+        # Aggregate SHAP values by compartment and channel
+        compartment_mean = shap_df.abs().groupby(level='compartment', axis=1).mean().mean(axis=0)
+        channel_mean = shap_df.abs().groupby(level='channel', axis=1).mean().mean(axis=0)
+        # Calculate combined importance for each pair of compartments and channels
+        combined_compartment = {}
+        for i, comp1 in enumerate(compartment_mean.index):
+            for comp2 in compartment_mean.index[i+1:]:
+                combined_compartment[f"{comp1} + {comp2}"] = shap_df.loc[:, (comp1, slice(None))].abs().mean().mean() + \
+                                                              shap_df.loc[:, (comp2, slice(None))].abs().mean().mean()
+        combined_channel = {}
+        for i, chan1 in enumerate(channel_mean.index):
+            for chan2 in channel_mean.index[i+1:]:
+                combined_channel[f"{chan1} + {chan2}"] = shap_df.loc[:, (slice(None), chan1)].abs().mean().mean() + \
+                                                          shap_df.loc[:, (slice(None), chan2)].abs().mean().mean()
+        # Prepare values and labels for radar charts
+        all_compartment_importance = list(compartment_mean.values) + list(combined_compartment.values())
+        all_compartment_labels = list(compartment_mean.index) + list(combined_compartment.keys())
+        all_channel_importance = list(channel_mean.values) + list(combined_channel.values())
+        all_channel_labels = list(channel_mean.index) + list(combined_channel.keys())
+        # Create radar plots for compartments and channels
+        create_extended_radar_plot(all_compartment_importance, all_compartment_labels, "SHAP Importance by Compartment (Individual and Combined)")
+        create_extended_radar_plot(all_channel_importance, all_channel_labels, "SHAP Importance by Channel (Individual and Combined)")
+    return merged_df

spacr/plot.py CHANGED Viewed

@@ -3688,3 +3688,51 @@ def overlay_masks_on_images(img_folder, normalize=True, resize=True, save=False,
             plt.axis('off')
             plt.show()
+def graph_importance(settings):
+    from .settings import set_graph_importance_defaults
+    from .utils import save_settings
+    if not isinstance(settings['csvs'], list):
+        settings['csvs'] = settings['csvs']
+    settings['src'] = os.path.dirname(settings['csvs'][0])
+    settings = set_graph_importance_defaults(settings)
+    save_settings(settings, name='graph_importance')
+    dfs = []
+    for path in settings['csvs']:
+        dft = pd.read_csv(path)
+        dfs.append(dft)
+    df = pd.concat(dfs)
+    if not all(col in df.columns for col in (settings['grouping_column'], settings['data_column'])):
+        print(f"grouping {settings['grouping_column']} and data {settings['data_column']} columns must be in {df.columns.to_list()}")
+        return
+    output_dir = os.path.dirname(settings['csvs'][0])
+    spacr_graph = spacrGraph(
+        df=df,
+        grouping_column=settings['grouping_column'],
+        data_column=settings['data_column'],
+        graph_type=settings['graph_type'],
+        graph_name=settings['grouping_column'],
+        summary_func='mean',
+        colors=None,
+        output_dir=output_dir,
+        save=settings['save'],
+        y_lim=None,
+        error_bar_type='std',
+        representation='object',
+        theme='muted',
+    )
+    # Create the plot
+    spacr_graph.create_plot()
+    # Get the figure object if needed
+    fig = spacr_graph.get_figure()
+    plt.show()

spacr/settings.py CHANGED Viewed

@@ -1370,4 +1370,68 @@ def get_analyze_plaque_settings(settings):
     settings.setdefault('rescale', False)
     settings.setdefault('resample', False)
     settings.setdefault('fill_in', True)
+    return settings
+def set_graph_importance_defaults(settings):
+    settings.setdefault('csvs','list of paths')
+    settings.setdefault('grouping_column','compartment')
+    settings.setdefault('data_column','compartment_importance_sum')
+    settings.setdefault('graph_type','jitter_bar')
+    settings.setdefault('save',False)
+    return settings
+def set_interperate_vision_model_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('scores','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen','cytoplasm'])
+    settings.setdefault('feature_importance',True)
+    settings.setdefault('permutation_importance',False)
+    settings.setdefault('shap',True)
+    settings.setdefault('save',False)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('top_features',30)
+    settings.setdefault('shap_sample',True)
+    settings.setdefault('n_jobs',-1)
+    settings.setdefault('shap_approximate',True)
+    settings.setdefault('score_column','cv_predictions')
+    return settings
+def set_analyze_endodyogeny_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc', 'pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'], ['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('min_area_bin',500)
+    settings.setdefault('group_column','pathogen')
+    settings.setdefault('compartment','pathogen')
+    settings.setdefault('pathogen_limit',1)
+    settings.setdefault('nuclei_limit',10)
+    settings.setdefault('level','object')
+    settings.setdefault('um_per_px',0.1)
+    settings.setdefault('max_bins',None)
+    settings.setdefault('save',False)
+    settings.setdefault('verbose',False)
+    return settings
+def set_analyze_class_proportion_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc','pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'],['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('group_column','condition')
+    settings.setdefault('class_column','test')
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('level','well')
+    settings.setdefault('save',False)
+    settings.setdefault('verbose', False)
     return settings

spacr/submodules.py CHANGED Viewed

@@ -10,6 +10,7 @@ from IPython.display import display
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.inspection import permutation_importance
 from math import pi
+from scipy.stats import chi2_contingency
 import matplotlib.pyplot as plt
 from natsort import natsorted
@@ -844,4 +845,300 @@ def interperate_vision_model(settings={}):
             df.to_csv(save_path)
             print(f"Saved {save_path}")
-    return output
+    return output
+def analyze_endodyogeny(settings):
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_endodyogeny_defaults
+    def _calculate_volume_bins(df, compartment='pathogen', min_area_bin=500, max_bins=None, verbose=False):
+        area_column = f'{compartment}_area'
+        df[f'{compartment}_volume'] = df[area_column] ** 1.5
+        min_volume_bin = min_area_bin ** 1.5
+        max_volume = df[f'{compartment}_volume'].max()
+        # Generate bin edges as floats, and filter out any duplicate edges
+        bins = [min_volume_bin * (2 ** i) for i in range(int(np.ceil(np.log2(max_volume / min_volume_bin)) + 1))]
+        bins = sorted(set(bins))  # Ensure bin edges are unique
+        # Create bin labels as ranges with decimal precision for float values (e.g., "500.0-1000.0")
+        bin_labels = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]
+        if verbose:
+            print('Volume bins:', bins)
+            print('Volume bin labels:', bin_labels)
+        # Apply the bins to create a new column with the binned labels
+        df[f'{compartment}_volume_bin'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=bin_labels, right=False)
+        # Create a bin index column (numeric version of bins)
+        df['bin_index'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=range(1, len(bins)), right=False).astype(int)
+        # Adjust bin indices and labels based on max_bins
+        if max_bins is not None:
+            df.loc[df['bin_index'] > max_bins, 'bin_index'] = max_bins
+            # Update bin labels to reflect capped bins
+            bin_labels = bin_labels[:max_bins - 1] + [f">{bins[max_bins - 1]:.2f}"]
+            df[f'{compartment}_volume_bin'] = df['bin_index'].map(
+                {i + 1: label for i, label in enumerate(bin_labels)}
+            )
+        if verbose:
+            print(df[[f'{compartment}_volume', f'{compartment}_volume_bin', 'bin_index']].head())
+        return df
+    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+        # Always calculate chi-squared on raw data
+        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+        chi2, p, dof, expected = chi2_contingency(raw_counts)
+        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+        print(f"p-value (raw data): {p:.4e}")
+        # Extract bin labels and indices for formatting the legend in the correct order
+        bin_labels = df[bin_column].cat.categories if pd.api.types.is_categorical_dtype(df[bin_column]) else sorted(df[bin_column].unique())
+        bin_indices = range(1, len(bin_labels) + 1)
+        legend_labels = [f"{index}: {label}" for index, label in zip(bin_indices, bin_labels)]
+        # Plot based on level setting
+        if level == 'well':
+            # Aggregate by well for mean ± SD visualization
+            well_proportions = (
+                df.groupby([group_column, prc_column, bin_column])
+                .size()
+                .groupby(level=[0, 1])
+                .apply(lambda x: x / x.sum())
+                .unstack(fill_value=0)
+            )
+            mean_proportions = well_proportions.groupby(group_column).mean()
+            std_proportions = well_proportions.groupby(group_column).std()
+            ax = mean_proportions.plot(
+                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+            )
+            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+        else:
+            # Object-level plotting without aggregation
+            group_counts = df.groupby([group_column, bin_column]).size()
+            group_totals = group_counts.groupby(level=0).sum()
+            proportions = group_counts / group_totals
+            proportion_df = proportions.unstack(fill_value=0)
+            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+            plt.title('Proportion of Volume Bins by Group')
+        plt.xlabel('Group')
+        plt.ylabel('Proportion')
+        # Update legend with formatted labels, maintaining correct order
+        volume_unit = "px³" if settings['um_per_px'] is None else "µm³"
+        plt.legend(legend_labels, title=f'Volume Range ({volume_unit})', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.ylim(0, 1)
+        fig = plt.gcf()
+        return chi2, p, dof, expected, raw_counts, fig
+    settings = set_analyze_endodyogeny_defaults(settings)
+    save_settings(settings, name='analyze_endodyogeny', show=True)
+    output = {}
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+    if not settings['um_per_px'] is None:
+        df[f"{settings['compartment']}_area"] = df[f"{settings['compartment']}_area"] * (settings['um_per_px'] ** 2)
+        settings['min_area_bin'] = settings['min_area_bin'] * (settings['um_per_px'] ** 2)
+    df = df[df[f"{settings['compartment']}_area"] >= settings['min_area_bin']]
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+    df = df.dropna(subset=[settings['group_column']])
+    df = _calculate_volume_bins(df, settings['compartment'], settings['min_area_bin'], settings['max_bins'], settings['verbose'])
+    output['data'] = df
+    # Perform chi-squared test and plot
+    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(settings, df, settings['group_column'], bin_column=f"{settings['compartment']}_volume_bin", level=settings['level']
+    )
+    # Create a DataFrame with chi-squared test results and raw counts
+    results_df = pd.DataFrame({
+        'chi_squared_stat': [chi2],
+        'p_value': [p],
+        'degrees_of_freedom': [dof]
+    })
+    # Flatten and add expected counts to results_df
+    expected_df = pd.DataFrame(expected, index=raw_counts.index, columns=raw_counts.columns)
+    expected_flat = expected_df.stack().reset_index()
+    expected_flat.columns = [settings['group_column'], f"{settings['compartment']}_volume_bin", 'expected_count']
+    results_df = results_df.merge(expected_flat, how="cross")
+    output['chi_squared'] = results_df
+    if settings['save']:
+        # Save DataFrame to CSV
+        output_dir = os.path.join(settings['src'][0], 'results')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, 'chi_squared_results.csv')
+        output_path_fig = os.path.join(output_dir, 'chi_squared_results.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path, index=False)
+        print(f"Chi-squared results saved to {output_path}")
+    plt.show()
+    return output
+def analyze_class_proportion(settings):
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_class_proportion_defaults
+    from .plot import plot_plates
+    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+        # Always calculate chi-squared on raw data
+        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+        chi2, p, dof, expected = chi2_contingency(raw_counts)
+        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+        print(f"p-value (raw data): {p:.4e}")
+        # Plot based on level setting
+        if level == 'well':
+            # Aggregate by well for mean ± SD visualization
+            well_proportions = (
+                df.groupby([group_column, prc_column, bin_column])
+                .size()
+                .groupby(level=[0, 1])
+                .apply(lambda x: x / x.sum())
+                .unstack(fill_value=0)
+            )
+            mean_proportions = well_proportions.groupby(group_column).mean()
+            std_proportions = well_proportions.groupby(group_column).std()
+            ax = mean_proportions.plot(
+                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+            )
+            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+        else:
+            # Object-level plotting without aggregation
+            group_counts = df.groupby([group_column, bin_column]).size()
+            group_totals = group_counts.groupby(level=0).sum()
+            proportions = group_counts / group_totals
+            proportion_df = proportions.unstack(fill_value=0)
+            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+            plt.title('Proportion of Volume Bins by Group')
+        plt.xlabel('Group')
+        plt.ylabel('Proportion')
+        # Update legend with formatted labels, maintaining correct order
+        plt.legend(title=f'Classes', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.ylim(0, 1)
+        fig = plt.gcf()
+        return chi2, p, dof, expected, raw_counts, fig
+    settings = set_analyze_class_proportion_defaults(settings)
+    save_settings(settings, name='analyze_class_proportion', show=True)
+    output = {}
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+    if 'png_list' not in settings['tables']:
+        settings['tables'] = settings['tables'] + ['png_list']
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+    df[settings['class_column']] = df[settings['class_column']].fillna(0)
+    output['data'] = df
+    # Perform chi-squared test and plot
+    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(settings, df, settings['group_column'], bin_column=settings['class_column'], level=settings['level'])
+    # Create a DataFrame with chi-squared test results and raw counts
+    results_df = pd.DataFrame({
+        'chi_squared_stat': [chi2],
+        'p_value': [p],
+        'degrees_of_freedom': [dof]
+    })
+    output['chi_squared'] = results_df
+    if settings['save']:
+        output_dir = os.path.join(settings['src'][0], 'results')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path_chi = os.path.join(output_dir, 'class_chi_squared_results.csv')
+        output_path_data = os.path.join(output_dir, 'class_chi_squared_data.csv')
+        output_path_fig = os.path.join(output_dir, 'class_chi_squared.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path_chi, index=False)
+        df.to_csv(output_path_data, index=False)
+        print(f"Chi-squared results saved to {output_path_chi}")
+        print(f"Annotated data saved to {output_path_data}")
+    plt.show()
+    fig2 = plot_plates(df, variable=settings['class_column'], grouping='mean', min_max='allq', cmap='viridis', min_count=0, verbose=True, dst=None)
+    if settings['save']:
+        output_path_fig2 = os.path.join(output_dir, 'class_heatmap.pdf')
+        fig2.savefig(output_path_fig2, dpi=300, bbox_inches='tight')
+    plt.show()
+    return output

spacr/utils.py CHANGED Viewed

@@ -1371,7 +1371,7 @@ def annotate_conditions(df, cells=None, cell_loc=None, pathogens=None, pathogen_
     return df
-def _split_data(df, group_by, object_type):
+def _split_data_v1(df, group_by, object_type):
     """
     Splits the input dataframe into numeric and non-numeric parts, groups them by the specified column,
     and returns the grouped dataframes.
@@ -1385,16 +1385,72 @@ def _split_data(df, group_by, object_type):
     grouped_numeric (pandas.DataFrame): The grouped dataframe containing numeric columns.
     grouped_non_numeric (pandas.DataFrame): The grouped dataframe containing non-numeric columns.
     """
+    if 'prcf' not in df.columns:
+        try:
+            df['prcf'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str) + '_' + df['field'].astype(str)
+        except Exception as e:
+            print(e)
     df['prcfo'] = df['prcf'] + '_' + df[object_type]
     df = df.set_index(group_by, inplace=False)
     df_numeric = df.select_dtypes(include=np.number)
     df_non_numeric = df.select_dtypes(exclude=np.number)
+    []
     grouped_numeric = df_numeric.groupby(df_numeric.index).mean()
     grouped_non_numeric = df_non_numeric.groupby(df_non_numeric.index).first()
     return pd.DataFrame(grouped_numeric), pd.DataFrame(grouped_non_numeric)
+def _split_data(df, group_by, object_type):
+    """
+    Splits the input dataframe into numeric and non-numeric parts, groups them by the specified column,
+    and returns the grouped dataframes with conditional aggregation.
+    Parameters:
+    df (pandas.DataFrame): The input dataframe.
+    group_by (str): The column name to group the dataframes by.
+    object_type (str): The column name to concatenate with 'prcf' to create a new column 'prcfo'.
+    Returns:
+    grouped_numeric (pandas.DataFrame): The grouped dataframe containing numeric columns with conditional aggregation.
+    grouped_non_numeric (pandas.DataFrame): The grouped dataframe containing non-numeric columns.
+    """
+    # Ensure 'prcf' column exists by concatenating specific columns
+    if 'prcf' not in df.columns:
+        try:
+            df['prcf'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str) + '_' + df['field'].astype(str)
+        except Exception as e:
+            print(e)
+    # Create the 'prcfo' column
+    df['prcfo'] = df['prcf'] + '_' + df[object_type]
+    df = df.set_index(group_by, inplace=False)
+    # Split the DataFrame into numeric and non-numeric parts
+    df_numeric = df.select_dtypes(include=np.number)
+    df_non_numeric = df.select_dtypes(exclude=np.number)
+    # Define keywords for columns to be summed instead of averaged
+    sum_keywords = ['area', 'perimeter', 'convex_area', 'bbox_area', 'filled_area', 'major_axis_length', 'minor_axis_length', 'equivalent_diameter']
+    # Create a dictionary for custom aggregation
+    agg_dict = {}
+    for column in df_numeric.columns:
+        if any(keyword in column for keyword in sum_keywords):
+            agg_dict[column] = 'sum'
+        else:
+            agg_dict[column] = 'mean'
+    # Apply custom aggregation
+    grouped_numeric = df_numeric.groupby(df_numeric.index).agg(agg_dict)
+    grouped_non_numeric = df_non_numeric.groupby(df_non_numeric.index).first()
+    return pd.DataFrame(grouped_numeric), pd.DataFrame(grouped_non_numeric)
 def _calculate_recruitment(df, channel):
     """
@@ -5184,7 +5240,7 @@ def group_feature_class(df, feature_groups=['cell', 'cytoplasm', 'nucleus', 'pat
         else:
             return None
-    from spacr.plot import spacrGraph
+    from .plot import spacrGraph
     df[name] = df['feature'].apply(lambda x: find_feature_class(x, feature_groups))

{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: spacr
-Version: 0.3.61
+Version: 0.3.64
 Summary: Spatial phenotype analysis of crisp screens (SpaCr)
 Home-page: https://github.com/EinarOlafsson/spacr
 Author: Einar Birnir Olafsson

{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/RECORD RENAMED Viewed

@@ -15,20 +15,20 @@ spacr/gui.py,sha256=ARyn9Q_g8HoP-cXh1nzMLVFCKqthY4v2u9yORyaQqQE,8230
 spacr/gui_core.py,sha256=N7R7yvfK_dJhOReM_kW3Ci8Bokhi1OzsxeKqvSGdvV4,41460
 spacr/gui_elements.py,sha256=EKlvEg_4_je7jciEdR3NTgPrcTraowa2e2RUt-xqd6M,138254
 spacr/gui_utils.py,sha256=u9RoIOWpAXFEOnUlLpMQZrc1pWSg6omZsJMIhJdRv_g,41211
-spacr/io.py,sha256=p-ky3yjtoSSvdsktPXVy_dx8dHgMeWqUZOtOwwfrk2o,136108
+spacr/io.py,sha256=YlJAT6H8l4ipunMyKzjqoPcf-1AXgUmSyR1YN9WxmDI,142857
 spacr/logger.py,sha256=lJhTqt-_wfAunCPl93xE65Wr9Y1oIHJWaZMjunHUeIw,1538
 spacr/measure.py,sha256=2lK-ZcTxLM-MpXV1oZnucRD9iz5aprwahRKw9IEqshg,55085
 spacr/mediar.py,sha256=FwLvbLQW5LQzPgvJZG8Lw7GniA2vbZx6Jv6vIKu7I5c,14743
-spacr/ml.py,sha256=aLDeeaAl0d4-RP1CzFHPqz5br2HrFbJhvPexEm9lvSI,68198
+spacr/ml.py,sha256=GOQJH8jdTrJQwiLlDrcc9-yCxLFaMx4YD4OJs0-R5YI,77947
 spacr/openai.py,sha256=5vBZ3Jl2llYcW3oaTEXgdyCB2aJujMUIO5K038z7w_A,1246
-spacr/plot.py,sha256=zITe54dzQRz-gk_ZT0qJyARuUWJivIBKW8V4rjUH8SE,160320
+spacr/plot.py,sha256=0fne2Msy6niN80oiuwt9ZYw1QwXVnghaUmrwvEZN9-8,161992
 spacr/sequencing.py,sha256=ClUfwPPK6rNUbUuiEkzcwakzVyDKKUMv9ricrxT8qQY,25227
-spacr/settings.py,sha256=zANLspVmllDZeYjQWIfrHN3VkVgicnYGTduv30MmQ18,77257
+spacr/settings.py,sha256=LSoDNuz1m7rySh7MWXEL1xlUU4rFiCRVlGvZCSCOqzU,80085
 spacr/sim.py,sha256=1xKhXimNU3ukzIw-3l9cF3Znc_brW8h20yv8fSTzvss,71173
-spacr/submodules.py,sha256=Xq4gjvooHN8S7cTk5PIAkd7XD2c7CMVqNpeo8GCvtHc,42489
+spacr/submodules.py,sha256=X1OI0Dsc1qU4lqKFdF2EnloNkLkDzA1hDn7CYbkBmFc,55473
 spacr/timelapse.py,sha256=KGfG4L4-QnFfgbF7L6C5wL_3gd_rqr05Foje6RsoTBg,39603
 spacr/toxo.py,sha256=z2nT5aAze3NUIlwnBQcnkARihDwoPfqOgQIVoUluyK0,25087
-spacr/utils.py,sha256=tqIKiSc30xEX0IlfSpoctFJQDVnGHDAX7l1VakRCBuY,220601
+spacr/utils.py,sha256=vvciLh1gH0nsrCWQw3taUcDjxP59wme3gqrejeNO05w,222943
 spacr/version.py,sha256=axH5tnGwtgSnJHb5IDhiu4Zjk5GhLyAEDRe-rnaoFOA,409
 spacr/resources/MEDIAR/.gitignore,sha256=Ff1q9Nme14JUd-4Q3jZ65aeQ5X4uttptssVDgBVHYo8,152
 spacr/resources/MEDIAR/LICENSE,sha256=yEj_TRDLUfDpHDNM0StALXIt6mLqSgaV2hcCwa6_TcY,1065
@@ -151,9 +151,9 @@ spacr/resources/icons/umap.png,sha256=dOLF3DeLYy9k0nkUybiZMe1wzHQwLJFRmgccppw-8b
 spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif,sha256=Tl0ZUfZ_AYAbu0up_nO0tPRtF1BxXhWQ3T3pURBCCRo,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif,sha256=m8N-V71rA1TT4dFlENNg8s0Q0YEXXs8slIn7yObmZJQ,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif,sha256=Pbhk7xn-KUP6RSIhJsxQcrHFImBm3GEpLkzx7WOc-5M,7958528
-spacr-0.3.61.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
-spacr-0.3.61.dist-info/METADATA,sha256=2jlzT9lkaXx01IWlYMYrpf24p48qDHvrRLZm-YUUl-0,6032
-spacr-0.3.61.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
-spacr-0.3.61.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
-spacr-0.3.61.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
-spacr-0.3.61.dist-info/RECORD,,
+spacr-0.3.64.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
+spacr-0.3.64.dist-info/METADATA,sha256=_07fLYI8eMAYJzOEcAVOemN4TFJAuzAvUrdX1T136T0,6032
+spacr-0.3.64.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+spacr-0.3.64.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
+spacr-0.3.64.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
+spacr-0.3.64.dist-info/RECORD,,

{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/LICENSE RENAMED Viewed

File without changes

{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/WHEEL RENAMED Viewed

File without changes

{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/top_level.txt RENAMED Viewed

File without changes

spacr 0.3.61__py3-none-any.whl → 0.3.64__py3-none-any.whl

spacr 0.3.61py3-none-any.whl → 0.3.64py3-none-any.whl