spacr 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/io.py CHANGED
@@ -2551,6 +2551,7 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
2551
2551
  png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
2552
2552
  if verbose:
2553
2553
  print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
2554
+ print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
2554
2555
  merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
2555
2556
  merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)
2556
2557
 
@@ -2562,7 +2563,8 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
2562
2563
  metadata.set_index('prcfo', inplace=True)
2563
2564
 
2564
2565
  # Merge metadata with final merged DataFrame
2565
- merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
2566
+ #merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
2567
+ merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
2566
2568
  merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)
2567
2569
 
2568
2570
  if verbose:
spacr/ml.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
3
3
  import numpy as np
4
4
  from scipy import stats
5
5
  from scipy.stats import shapiro
6
+ from math import pi
6
7
 
7
8
  from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
8
9
  from sklearn.metrics import mean_squared_error
@@ -1515,3 +1516,207 @@ def _calculate_similarity(df, features, col_to_compare, val1, val2):
1515
1516
 
1516
1517
  return df
1517
1518
 
1519
def interperate_vision_model(settings=None):
    """Rank measurement features by how well they explain vision-model scores.

    Reads the measurements database under ``settings['src']``, inner-joins it
    to the score table at ``settings['scores']`` on
    plate/row_name/column_name/field/object_label, and then runs whichever
    analyses are switched on in ``settings``: random-forest feature importance,
    permutation importance, and SHAP (with radar charts aggregated by
    compartment and channel).

    Args:
        settings: Dict of options; missing keys are filled in by
            ``set_interperate_vision_model_defaults``. ``None`` is treated as
            an empty dict.

    Returns:
        The merged measurements/scores DataFrame.
    """
    from .io import _read_and_merge_data, _results_to_csv
    from .settings import set_interperate_vision_model_defaults
    from .utils import save_settings

    # NOTE(review): plt, shap, RandomForestClassifier and permutation_importance
    # are taken from module scope — confirm this module imports them.

    # Use a fresh dict per call: the defaults helper fills the dict in place, so
    # a shared mutable default argument would carry values over between calls.
    settings = {} if settings is None else settings
    settings = set_interperate_vision_model_defaults(settings)
    save_settings(settings, name='interperate_vision_model', show=True)

    def create_extended_radar_plot(values, labels, title):
        """Draw one closed radar chart of ``values`` against ``labels``."""
        values = list(values) + [values[0]]  # Close the loop for radar chart
        angles = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
        angles += angles[:1]

        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
        ax.plot(angles, values, linewidth=2, linestyle='solid')
        ax.fill(angles, values, alpha=0.25)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels, fontsize=10, rotation=45, ha='right')
        plt.title(title, pad=20)
        plt.show()

    def extract_compartment_channel(feature_name):
        """Map a feature name to a ``(compartment, channel)`` tuple.

        The compartment is the text before the first underscore ('cells' is
        normalised to 'cell'). The channel is every ``channel_0``..``channel_3``
        substring found, joined with ' + ', or 'morphology' when none is found.
        """
        compartment = feature_name.split('_')[0]
        if compartment == 'cells':
            compartment = 'cell'

        channels = [f'channel_{i}' for i in range(4) if f'channel_{i}' in feature_name]
        channel = ' + '.join(channels) if channels else 'morphology'
        return (compartment, channel)

    def read_and_preprocess_data(settings):
        """Load measurements and scores, join them, and split into X / y."""
        df, _ = _read_and_merge_data(
            locs=[settings['src']+'/measurements/measurements.db'],
            tables=settings['tables'],
            verbose=True,
            nuclei_limit=settings['nuclei_limit'],
            pathogen_limit=settings['pathogen_limit']
        )

        scores_df = pd.read_csv(settings['scores'])
        join_cols = ['plate', 'row_name', 'column_name', 'field', 'object_label']
        score_column = settings['score_column']

        # Strip the 'o' prefix so object labels match the ids in the score table.
        df['object_label'] = df['object_label'].str.replace('o', '', regex=False)

        if 'row_name' not in scores_df.columns:
            scores_df['row_name'] = scores_df['row']

        if 'column_name' not in scores_df.columns:
            scores_df['column_name'] = scores_df['col']

        # Prefer the raw 'object' column when present, but do not fail when the
        # score table only carries 'object_label'.
        if 'object' in scores_df.columns:
            scores_df['object_label'] = scores_df['object']

        # Join keys must share a dtype on both sides for the merge to match.
        df[join_cols] = df[join_cols].astype(str)
        scores_df[join_cols] = scores_df[join_cols].astype(str)

        scores_df = scores_df[join_cols + [score_column]]
        merged_df = pd.merge(df, scores_df, on=join_cols, how='inner')

        # errors='ignore': a non-numeric score column is already excluded by
        # select_dtypes, and dropping it again would raise.
        X = merged_df.select_dtypes(include='number').drop(columns=[score_column], errors='ignore')
        y = merged_df[score_column]

        return X, y, merged_df

    X, y, merged_df = read_and_preprocess_data(settings)

    # Step 1: Feature Importance using Random Forest.
    # Permutation importance needs the fitted model and SHAP needs the feature
    # ranking, so fit whenever any of the three analyses is requested.
    model = None
    feature_importance_df = None
    if settings['feature_importance'] or settings['permutation_importance'] or settings['shap']:
        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
        model.fit(X, y)
        feature_importance_df = pd.DataFrame(
            {'feature': X.columns, 'importance': model.feature_importances_}
        ).sort_values(by='importance', ascending=False)

    if settings['feature_importance']:
        print("Feature Importance ...")
        top_feature_importance_df = feature_importance_df.head(settings['top_features'])

        # Plot Feature Importance
        plt.figure(figsize=(10, 6))
        plt.barh(top_feature_importance_df['feature'], top_feature_importance_df['importance'])
        plt.xlabel('Importance')
        plt.title(f"Top {settings['top_features']} Features - Feature Importance")
        plt.gca().invert_yaxis()
        plt.show()

        if settings['save']:
            _results_to_csv(feature_importance_df, filename='feature_importance.csv')

    # Step 2: Permutation Importance
    if settings['permutation_importance']:
        print("Permutation Importance ...")
        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=settings['n_jobs'])
        perm_importance_df = pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
        perm_importance_df = perm_importance_df.sort_values(by='importance', ascending=False)
        top_perm_importance_df = perm_importance_df.head(settings['top_features'])

        # Plot Permutation Importance
        plt.figure(figsize=(10, 6))
        plt.barh(top_perm_importance_df['feature'], top_perm_importance_df['importance'])
        plt.xlabel('Importance')
        plt.title(f"Top {settings['top_features']} Features - Permutation Importance")
        plt.gca().invert_yaxis()
        plt.show()

        if settings['save']:
            _results_to_csv(perm_importance_df, filename='permutation_importance.csv')

    # Step 3: SHAP Analysis
    if settings['shap']:
        print("SHAP Analysis ...")

        # Restrict to the top-N random-forest features and refit on that subset.
        top_features = feature_importance_df.head(settings['top_features'])['feature']
        X_top = X[top_features]

        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
        model.fit(X_top, y)

        # Sample ~1% of rows to speed up SHAP; keep at least one row so small
        # tables do not produce an empty sample.
        if settings['shap_sample']:
            sample = max(1, len(X_top) // 100)
            X_sample = X_top.sample(min(sample, len(X_top)), random_state=42)
        else:
            X_sample = X_top

        explainer = shap.Explainer(model.predict, X_sample)
        shap_values = explainer(X_sample, max_evals=1500)

        shap.summary_plot(shap_values, X_sample, max_display=settings['top_features'])

        # Re-label SHAP columns as (compartment, channel) pairs for aggregation.
        shap_df = pd.DataFrame(shap_values.values, columns=X_sample.columns)
        shap_df.columns = pd.MultiIndex.from_tuples(
            [extract_compartment_channel(feat) for feat in shap_df.columns],
            names=['compartment', 'channel']
        )
        abs_shap = shap_df.abs()

        # Group on the transposed frame: groupby(axis=1) is deprecated in pandas.
        compartment_mean = abs_shap.T.groupby(level='compartment').mean().mean(axis=1)
        channel_mean = abs_shap.T.groupby(level='channel').mean().mean(axis=1)

        def _mean_abs_shap(level, key):
            """Mean absolute SHAP over all columns whose ``level`` equals ``key``."""
            mask = abs_shap.columns.get_level_values(level) == key
            return abs_shap.loc[:, mask].mean().mean()

        # Combined importance for each unordered pair of compartments / channels.
        combined_compartment = {}
        for i, comp1 in enumerate(compartment_mean.index):
            for comp2 in compartment_mean.index[i+1:]:
                combined_compartment[f"{comp1} + {comp2}"] = (
                    _mean_abs_shap('compartment', comp1) + _mean_abs_shap('compartment', comp2)
                )

        combined_channel = {}
        for i, chan1 in enumerate(channel_mean.index):
            for chan2 in channel_mean.index[i+1:]:
                combined_channel[f"{chan1} + {chan2}"] = (
                    _mean_abs_shap('channel', chan1) + _mean_abs_shap('channel', chan2)
                )

        # Prepare values and labels for radar charts
        all_compartment_importance = list(compartment_mean.values) + list(combined_compartment.values())
        all_compartment_labels = list(compartment_mean.index) + list(combined_compartment.keys())

        all_channel_importance = list(channel_mean.values) + list(combined_channel.values())
        all_channel_labels = list(channel_mean.index) + list(combined_channel.keys())

        create_extended_radar_plot(all_compartment_importance, all_compartment_labels, "SHAP Importance by Compartment (Individual and Combined)")
        create_extended_radar_plot(all_channel_importance, all_channel_labels, "SHAP Importance by Channel (Individual and Combined)")

    return merged_df
spacr/plot.py CHANGED
@@ -3688,3 +3688,51 @@ def overlay_masks_on_images(img_folder, normalize=True, resize=True, save=False,
3688
3688
  plt.axis('off')
3689
3689
  plt.show()
3690
3690
 
3691
def graph_importance(settings):
    """Plot an importance column from one or more CSV files, grouped by a column.

    The CSVs listed in ``settings['csvs']`` are concatenated and handed to
    ``spacrGraph``; the directory of the first CSV is used as the output
    directory and recorded as ``settings['src']``.

    Args:
        settings: Dict of options. ``settings['csvs']`` must be present and may
            be a single path or a list of paths; other missing keys are filled
            in by ``set_graph_importance_defaults``.

    Returns:
        None. If the grouping or data column is missing from the concatenated
        data, a message is printed and nothing is plotted.
    """
    from .settings import set_graph_importance_defaults
    from .utils import save_settings

    # Promote a single path to a one-element list so the indexing and the loop
    # below treat it as one file rather than as a sequence of characters.
    if not isinstance(settings['csvs'], list):
        settings['csvs'] = [settings['csvs']]

    settings['src'] = os.path.dirname(settings['csvs'][0])

    settings = set_graph_importance_defaults(settings)
    save_settings(settings, name='graph_importance')

    df = pd.concat([pd.read_csv(path) for path in settings['csvs']])

    required = (settings['grouping_column'], settings['data_column'])
    if not all(col in df.columns for col in required):
        print(f"grouping {settings['grouping_column']} and data {settings['data_column']} columns must be in {df.columns.to_list()}")
        return

    output_dir = os.path.dirname(settings['csvs'][0])

    spacr_graph = spacrGraph(
        df=df,
        grouping_column=settings['grouping_column'],
        data_column=settings['data_column'],
        graph_type=settings['graph_type'],
        graph_name=settings['grouping_column'],
        summary_func='mean',
        colors=None,
        output_dir=output_dir,
        save=settings['save'],
        y_lim=None,
        error_bar_type='std',
        representation='object',
        theme='muted',
    )

    # Create the plot
    spacr_graph.create_plot()

    # The figure was fetched but never used in the original; the call is kept in
    # case get_figure() has side effects (not visible from here).
    spacr_graph.get_figure()
    plt.show()
spacr/settings.py CHANGED
@@ -1370,4 +1370,68 @@ def get_analyze_plaque_settings(settings):
1370
1370
  settings.setdefault('rescale', False)
1371
1371
  settings.setdefault('resample', False)
1372
1372
  settings.setdefault('fill_in', True)
1373
+ return settings
1374
+
1375
def set_graph_importance_defaults(settings):
    """Fill in missing ``graph_importance`` options in place and return the dict.

    Keys already present in ``settings`` are left untouched.
    """
    fallbacks = {
        'csvs': 'list of paths',
        'grouping_column': 'compartment',
        'data_column': 'compartment_importance_sum',
        'graph_type': 'jitter_bar',
        'save': False,
    }
    for option, value in fallbacks.items():
        settings.setdefault(option, value)
    return settings
1382
+
1383
def set_interperate_vision_model_defaults(settings):
    """Fill in missing ``interperate_vision_model`` options in place and return the dict.

    Keys already present in ``settings`` are left untouched.
    """
    # Built inside the function so each call gets its own 'tables' list.
    fallbacks = {
        'src': 'path',
        'scores': 'path',
        'tables': ['cell', 'nucleus', 'pathogen', 'cytoplasm'],
        'feature_importance': True,
        'permutation_importance': False,
        'shap': True,
        'save': False,
        'nuclei_limit': 1000,
        'pathogen_limit': 1000,
        'top_features': 30,
        'shap_sample': True,
        'n_jobs': -1,
        'shap_approximate': True,
        'score_column': 'cv_predictions',
    }
    for option, value in fallbacks.items():
        settings.setdefault(option, value)
    return settings
1399
+
1400
def set_analyze_endodyogeny_defaults(settings):
    """Fill in missing ``analyze_endodyogeny`` options in place and return the dict.

    Keys already present in ``settings`` are left untouched.
    """
    # Built inside the function so each call gets its own list objects.
    fallbacks = {
        'src': 'path',
        'tables': ['cell', 'nucleus', 'pathogen', 'cytoplasm'],
        'cell_types': ['Hela'],
        'cell_plate_metadata': None,
        'pathogen_types': ['nc', 'pc'],
        'pathogen_plate_metadata': [['c1'], ['c2']],
        'treatments': None,
        'treatment_plate_metadata': None,
        'min_area_bin': 500,
        'group_column': 'pathogen',
        'compartment': 'pathogen',
        'pathogen_limit': 1,
        'nuclei_limit': 10,
        'level': 'object',
        'um_per_px': 0.1,
        'max_bins': None,
        'save': False,
        'verbose': False,
    }
    for option, value in fallbacks.items():
        settings.setdefault(option, value)
    return settings
1420
+
1421
def set_analyze_class_proportion_defaults(settings):
    """Fill in missing ``analyze_class_proportion`` options in place and return the dict.

    Keys already present in ``settings`` are left untouched.
    """
    # Built inside the function so each call gets its own list objects.
    fallbacks = {
        'src': 'path',
        'tables': ['cell', 'nucleus', 'pathogen', 'cytoplasm'],
        'cell_types': ['Hela'],
        'cell_plate_metadata': None,
        'pathogen_types': ['nc', 'pc'],
        'pathogen_plate_metadata': [['c1'], ['c2']],
        'treatments': None,
        'treatment_plate_metadata': None,
        'group_column': 'condition',
        'class_column': 'test',
        'pathogen_limit': 1000,
        'nuclei_limit': 1000,
        'level': 'well',
        'save': False,
        'verbose': False,
    }
    for option, value in fallbacks.items():
        settings.setdefault(option, value)
    return settings
spacr/submodules.py CHANGED
@@ -10,6 +10,7 @@ from IPython.display import display
10
10
  from sklearn.ensemble import RandomForestClassifier
11
11
  from sklearn.inspection import permutation_importance
12
12
  from math import pi
13
+ from scipy.stats import chi2_contingency
13
14
 
14
15
  import matplotlib.pyplot as plt
15
16
  from natsort import natsorted
@@ -844,4 +845,300 @@ def interperate_vision_model(settings={}):
844
845
  df.to_csv(save_path)
845
846
  print(f"Saved {save_path}")
846
847
 
847
- return output
848
+ return output
849
+
850
def analyze_endodyogeny(settings):
    """Bin objects by estimated volume and compare bin proportions between groups.

    Loads the measurements databases under ``settings['src']``, keeps objects
    whose ``<compartment>_area`` is at least ``min_area_bin``, annotates
    conditions, assigns each object to a doubling volume bin (volume is taken
    as area ** 1.5), runs a chi-squared test on the group x bin counts and draws
    a stacked-bar plot of the proportions.

    Args:
        settings: Dict of options; missing keys are filled in by
            ``set_analyze_endodyogeny_defaults``. ``settings['src']`` may be a
            single path or a list of paths (it is normalised to a list in place).

    Returns:
        Dict with ``'data'`` (the binned DataFrame) and ``'chi_squared'`` (test
        statistics cross-joined with the expected counts).

    Raises:
        KeyError: If ``settings['group_column']`` is not a column of the
            annotated data.
    """
    from .utils import annotate_conditions, save_settings
    from .io import _read_and_merge_data
    from .settings import set_analyze_endodyogeny_defaults

    def _calculate_volume_bins(df, compartment='pathogen', min_area_bin=500, max_bins=None, verbose=False):
        """Add ``<compartment>_volume``, ``<compartment>_volume_bin`` and ``bin_index`` columns."""
        area_column = f'{compartment}_area'
        volume_column = f'{compartment}_volume'
        bin_column = f'{compartment}_volume_bin'

        # Work on a copy: the caller passes a filtered slice, and assigning
        # columns on a slice is ambiguous in pandas.
        df = df.copy()
        df[volume_column] = df[area_column] ** 1.5
        min_volume_bin = min_area_bin ** 1.5
        max_volume = df[volume_column].max()

        # Bin edges double from the minimum volume; the set removes duplicates.
        n_edges = int(np.ceil(np.log2(max_volume / min_volume_bin)) + 1)
        bins = sorted({min_volume_bin * (2 ** i) for i in range(n_edges)})

        # Bins are left-closed/right-open, so a value equal to the top edge
        # would fall outside every bin (NaN) and break the int cast below.
        if bins[-1] <= max_volume:
            bins.append(bins[-1] * 2)

        # Create bin labels as ranges with two-decimal precision
        bin_labels = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]
        if verbose:
            print('Volume bins:', bins)
            print('Volume bin labels:', bin_labels)

        df[bin_column] = pd.cut(df[volume_column], bins=bins, labels=bin_labels, right=False)

        # Numeric (1-based) version of the bin assignment
        df['bin_index'] = pd.cut(df[volume_column], bins=bins, labels=range(1, len(bins)), right=False).astype(int)

        # Collapse everything above max_bins into one open-ended top bin. When
        # max_bins exceeds the number of bins there is nothing to collapse (and
        # indexing `bins` would fail), so the step is skipped.
        if max_bins is not None and max_bins <= len(bin_labels):
            df.loc[df['bin_index'] > max_bins, 'bin_index'] = max_bins

            bin_labels = bin_labels[:max_bins - 1] + [f">{bins[max_bins - 1]:.2f}"]
            df[bin_column] = df['bin_index'].map(
                {i + 1: label for i, label in enumerate(bin_labels)}
            )

        if verbose:
            print(df[[volume_column, bin_column, 'bin_index']].head())

        return df

    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
        """Run the chi-squared test on raw counts and draw the stacked-bar plot."""
        # Always calculate chi-squared on raw data
        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
        chi2, p, dof, expected = chi2_contingency(raw_counts)
        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
        print(f"p-value (raw data): {p:.4e}")

        # Legend entries follow category order when the bin column is categorical.
        if isinstance(df[bin_column].dtype, pd.CategoricalDtype):
            bin_labels = df[bin_column].cat.categories
        else:
            bin_labels = sorted(df[bin_column].unique())
        legend_labels = [f"{index}: {label}" for index, label in enumerate(bin_labels, start=1)]

        if level == 'well':
            # Per-well proportions, then mean ± SD across wells in each group.
            # transform('sum') keeps the original index; on pandas >= 2.0 the
            # equivalent groupby.apply prepends the group keys as extra levels.
            counts = df.groupby([group_column, prc_column, bin_column]).size()
            well_totals = counts.groupby(level=[0, 1]).transform('sum')
            well_proportions = (counts / well_totals).unstack(fill_value=0)

            mean_proportions = well_proportions.groupby(group_column).mean()
            std_proportions = well_proportions.groupby(group_column).std()

            mean_proportions.plot(
                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
            )
            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
        else:
            # Object-level plotting without aggregation
            group_counts = df.groupby([group_column, bin_column]).size()
            group_totals = group_counts.groupby(level=0).sum()
            proportion_df = (group_counts / group_totals).unstack(fill_value=0)

            proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
            plt.title('Proportion of Volume Bins by Group')

        plt.xlabel('Group')
        plt.ylabel('Proportion')

        volume_unit = "px³" if settings['um_per_px'] is None else "µm³"
        plt.legend(legend_labels, title=f'Volume Range ({volume_unit})', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.ylim(0, 1)
        fig = plt.gcf()
        return chi2, p, dof, expected, raw_counts, fig

    settings = set_analyze_endodyogeny_defaults(settings)
    save_settings(settings, name='analyze_endodyogeny', show=True)
    output = {}

    # Process data
    if not isinstance(settings['src'], list):
        settings['src'] = [settings['src']]

    locs = [os.path.join(s, 'measurements/measurements.db') for s in settings['src']]

    df, _ = _read_and_merge_data(
        locs,
        tables=settings['tables'],
        verbose=settings['verbose'],
        nuclei_limit=settings['nuclei_limit'],
        pathogen_limit=settings['pathogen_limit']
    )

    compartment = settings['compartment']
    group_column = settings['group_column']
    area_column = f"{compartment}_area"
    bin_column = f"{compartment}_volume_bin"

    # Convert to physical units locally. The scaled threshold is NOT written
    # back to settings, so reusing the same settings dict does not compound it.
    min_area_bin = settings['min_area_bin']
    if settings['um_per_px'] is not None:
        area_scale = settings['um_per_px'] ** 2
        df[area_column] = df[area_column] * area_scale
        min_area_bin = min_area_bin * area_scale

    df = df[df[area_column] >= min_area_bin]

    df = annotate_conditions(
        df=df,
        cells=settings['cell_types'],
        cell_loc=settings['cell_plate_metadata'],
        pathogens=settings['pathogen_types'],
        pathogen_loc=settings['pathogen_plate_metadata'],
        treatments=settings['treatments'],
        treatment_loc=settings['treatment_plate_metadata']
    )

    if group_column not in df.columns:
        print(f"{group_column} not found in DataFrame, please choose from:")
        for col in df.columns:
            print(col)
        # Stop here with a clear message; continuing would fail with the same
        # exception type inside dropna below.
        raise KeyError(f"group_column '{group_column}' not found in DataFrame")

    df = df.dropna(subset=[group_column])
    df = _calculate_volume_bins(df, compartment, min_area_bin, settings['max_bins'], settings['verbose'])
    output['data'] = df

    # Perform chi-squared test and plot
    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(
        settings, df, group_column, bin_column=bin_column, level=settings['level']
    )

    # Create a DataFrame with chi-squared test results
    results_df = pd.DataFrame({
        'chi_squared_stat': [chi2],
        'p_value': [p],
        'degrees_of_freedom': [dof]
    })

    # Flatten expected counts and attach the test statistics to every row
    expected_df = pd.DataFrame(expected, index=raw_counts.index, columns=raw_counts.columns)
    expected_flat = expected_df.stack().reset_index()
    expected_flat.columns = [group_column, bin_column, 'expected_count']
    results_df = results_df.merge(expected_flat, how="cross")
    output['chi_squared'] = results_df

    if settings['save']:
        output_dir = os.path.join(settings['src'][0], 'results')
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, 'chi_squared_results.csv')
        output_path_fig = os.path.join(output_dir, 'chi_squared_results.pdf')
        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
        results_df.to_csv(output_path, index=False)
        print(f"Chi-squared results saved to {output_path}")

    plt.show()

    return output
1018
+
1019
def analyze_class_proportion(settings):
    """Compare the proportion of each class between groups and plot the result.

    Loads the measurements databases under ``settings['src']`` (adding the
    ``png_list`` table if it was not requested), annotates conditions, runs a
    chi-squared test on the group x class counts, draws a stacked-bar plot of
    class proportions and a plate heatmap of the class column.

    Args:
        settings: Dict of options; missing keys are filled in by
            ``set_analyze_class_proportion_defaults``. ``settings['src']`` may
            be a single path or a list of paths (it is normalised to a list in
            place).

    Returns:
        Dict with ``'data'`` (the annotated DataFrame) and ``'chi_squared'``
        (a one-row DataFrame of test statistics).

    Raises:
        KeyError: If ``settings['group_column']`` or ``settings['class_column']``
            is not a column of the annotated data.
    """
    from .utils import annotate_conditions, save_settings
    from .io import _read_and_merge_data
    from .settings import set_analyze_class_proportion_defaults
    from .plot import plot_plates

    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
        """Run the chi-squared test on raw counts and draw the stacked-bar plot."""
        # Always calculate chi-squared on raw data
        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
        chi2, p, dof, expected = chi2_contingency(raw_counts)
        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
        print(f"p-value (raw data): {p:.4e}")

        if level == 'well':
            # Per-well proportions, then mean ± SD across wells in each group.
            # transform('sum') keeps the original index; on pandas >= 2.0 the
            # equivalent groupby.apply prepends the group keys as extra levels.
            counts = df.groupby([group_column, prc_column, bin_column]).size()
            well_totals = counts.groupby(level=[0, 1]).transform('sum')
            well_proportions = (counts / well_totals).unstack(fill_value=0)

            mean_proportions = well_proportions.groupby(group_column).mean()
            std_proportions = well_proportions.groupby(group_column).std()

            mean_proportions.plot(
                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
            )
            # Titles name classes: this plot shows class labels, not volume bins.
            plt.title('Proportion of Classes by Group (Mean ± SD across wells)')
        else:
            # Object-level plotting without aggregation
            group_counts = df.groupby([group_column, bin_column]).size()
            group_totals = group_counts.groupby(level=0).sum()
            proportion_df = (group_counts / group_totals).unstack(fill_value=0)

            proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
            plt.title('Proportion of Classes by Group')

        plt.xlabel('Group')
        plt.ylabel('Proportion')

        plt.legend(title='Classes', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.ylim(0, 1)
        fig = plt.gcf()
        return chi2, p, dof, expected, raw_counts, fig

    settings = set_analyze_class_proportion_defaults(settings)
    save_settings(settings, name='analyze_class_proportion', show=True)
    output = {}

    # Process data
    if not isinstance(settings['src'], list):
        settings['src'] = [settings['src']]

    locs = [os.path.join(s, 'measurements/measurements.db') for s in settings['src']]

    # Build a new list rather than appending, so a caller-owned list is not mutated.
    if 'png_list' not in settings['tables']:
        settings['tables'] = settings['tables'] + ['png_list']

    df, _ = _read_and_merge_data(
        locs,
        tables=settings['tables'],
        verbose=settings['verbose'],
        nuclei_limit=settings['nuclei_limit'],
        pathogen_limit=settings['pathogen_limit']
    )

    df = annotate_conditions(
        df=df,
        cells=settings['cell_types'],
        cell_loc=settings['cell_plate_metadata'],
        pathogens=settings['pathogen_types'],
        pathogen_loc=settings['pathogen_plate_metadata'],
        treatments=settings['treatments'],
        treatment_loc=settings['treatment_plate_metadata']
    )

    group_column = settings['group_column']
    class_column = settings['class_column']

    # Stop with a clear message when a required column is missing; continuing
    # would fail with the same exception type further down.
    for role, column in (('group_column', group_column), ('class_column', class_column)):
        if column not in df.columns:
            print(f"{column} not found in DataFrame, please choose from:")
            for col in df.columns:
                print(col)
            raise KeyError(f"{role} '{column}' not found in DataFrame")

    # Objects without a class label are counted as class 0.
    df[class_column] = df[class_column].fillna(0)
    output['data'] = df

    # Perform chi-squared test and plot
    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(
        settings, df, group_column, bin_column=class_column, level=settings['level']
    )

    # Create a DataFrame with chi-squared test results
    results_df = pd.DataFrame({
        'chi_squared_stat': [chi2],
        'p_value': [p],
        'degrees_of_freedom': [dof]
    })

    output['chi_squared'] = results_df

    # Resolved once so both save steps below share it.
    output_dir = os.path.join(settings['src'][0], 'results')

    if settings['save']:
        os.makedirs(output_dir, exist_ok=True)
        output_path_chi = os.path.join(output_dir, 'class_chi_squared_results.csv')
        output_path_data = os.path.join(output_dir, 'class_chi_squared_data.csv')
        output_path_fig = os.path.join(output_dir, 'class_chi_squared.pdf')
        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
        results_df.to_csv(output_path_chi, index=False)
        df.to_csv(output_path_data, index=False)
        print(f"Chi-squared results saved to {output_path_chi}")
        print(f"Annotated data saved to {output_path_data}")

    plt.show()

    # NOTE(review): assumes plot_plates returns a figure object — confirm.
    fig2 = plot_plates(df, variable=class_column, grouping='mean', min_max='allq', cmap='viridis', min_count=0, verbose=True, dst=None)
    if settings['save']:
        output_path_fig2 = os.path.join(output_dir, 'class_heatmap.pdf')
        fig2.savefig(output_path_fig2, dpi=300, bbox_inches='tight')

    plt.show()
    return output
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: spacr
3
- Version: 0.3.62
3
+ Version: 0.3.64
4
4
  Summary: Spatial phenotype analysis of crisp screens (SpaCr)
5
5
  Home-page: https://github.com/EinarOlafsson/spacr
6
6
  Author: Einar Birnir Olafsson
@@ -15,17 +15,17 @@ spacr/gui.py,sha256=ARyn9Q_g8HoP-cXh1nzMLVFCKqthY4v2u9yORyaQqQE,8230
15
15
  spacr/gui_core.py,sha256=N7R7yvfK_dJhOReM_kW3Ci8Bokhi1OzsxeKqvSGdvV4,41460
16
16
  spacr/gui_elements.py,sha256=EKlvEg_4_je7jciEdR3NTgPrcTraowa2e2RUt-xqd6M,138254
17
17
  spacr/gui_utils.py,sha256=u9RoIOWpAXFEOnUlLpMQZrc1pWSg6omZsJMIhJdRv_g,41211
18
- spacr/io.py,sha256=0cBVmhqMaPkdEXib5Vhp19FC_1qfaK_NgtoImuDuwGU,142664
18
+ spacr/io.py,sha256=YlJAT6H8l4ipunMyKzjqoPcf-1AXgUmSyR1YN9WxmDI,142857
19
19
  spacr/logger.py,sha256=lJhTqt-_wfAunCPl93xE65Wr9Y1oIHJWaZMjunHUeIw,1538
20
20
  spacr/measure.py,sha256=2lK-ZcTxLM-MpXV1oZnucRD9iz5aprwahRKw9IEqshg,55085
21
21
  spacr/mediar.py,sha256=FwLvbLQW5LQzPgvJZG8Lw7GniA2vbZx6Jv6vIKu7I5c,14743
22
- spacr/ml.py,sha256=aLDeeaAl0d4-RP1CzFHPqz5br2HrFbJhvPexEm9lvSI,68198
22
+ spacr/ml.py,sha256=GOQJH8jdTrJQwiLlDrcc9-yCxLFaMx4YD4OJs0-R5YI,77947
23
23
  spacr/openai.py,sha256=5vBZ3Jl2llYcW3oaTEXgdyCB2aJujMUIO5K038z7w_A,1246
24
- spacr/plot.py,sha256=zITe54dzQRz-gk_ZT0qJyARuUWJivIBKW8V4rjUH8SE,160320
24
+ spacr/plot.py,sha256=0fne2Msy6niN80oiuwt9ZYw1QwXVnghaUmrwvEZN9-8,161992
25
25
  spacr/sequencing.py,sha256=ClUfwPPK6rNUbUuiEkzcwakzVyDKKUMv9ricrxT8qQY,25227
26
- spacr/settings.py,sha256=zANLspVmllDZeYjQWIfrHN3VkVgicnYGTduv30MmQ18,77257
26
+ spacr/settings.py,sha256=LSoDNuz1m7rySh7MWXEL1xlUU4rFiCRVlGvZCSCOqzU,80085
27
27
  spacr/sim.py,sha256=1xKhXimNU3ukzIw-3l9cF3Znc_brW8h20yv8fSTzvss,71173
28
- spacr/submodules.py,sha256=Xq4gjvooHN8S7cTk5PIAkd7XD2c7CMVqNpeo8GCvtHc,42489
28
+ spacr/submodules.py,sha256=X1OI0Dsc1qU4lqKFdF2EnloNkLkDzA1hDn7CYbkBmFc,55473
29
29
  spacr/timelapse.py,sha256=KGfG4L4-QnFfgbF7L6C5wL_3gd_rqr05Foje6RsoTBg,39603
30
30
  spacr/toxo.py,sha256=z2nT5aAze3NUIlwnBQcnkARihDwoPfqOgQIVoUluyK0,25087
31
31
  spacr/utils.py,sha256=vvciLh1gH0nsrCWQw3taUcDjxP59wme3gqrejeNO05w,222943
@@ -151,9 +151,9 @@ spacr/resources/icons/umap.png,sha256=dOLF3DeLYy9k0nkUybiZMe1wzHQwLJFRmgccppw-8b
151
151
  spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif,sha256=Tl0ZUfZ_AYAbu0up_nO0tPRtF1BxXhWQ3T3pURBCCRo,7958528
152
152
  spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif,sha256=m8N-V71rA1TT4dFlENNg8s0Q0YEXXs8slIn7yObmZJQ,7958528
153
153
  spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif,sha256=Pbhk7xn-KUP6RSIhJsxQcrHFImBm3GEpLkzx7WOc-5M,7958528
154
- spacr-0.3.62.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
155
- spacr-0.3.62.dist-info/METADATA,sha256=Ox14lWGxbXuMW36MriYHppKcZDqD_4HopfbcLAi8dLc,6032
156
- spacr-0.3.62.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
157
- spacr-0.3.62.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
158
- spacr-0.3.62.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
159
- spacr-0.3.62.dist-info/RECORD,,
154
+ spacr-0.3.64.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
155
+ spacr-0.3.64.dist-info/METADATA,sha256=_07fLYI8eMAYJzOEcAVOemN4TFJAuzAvUrdX1T136T0,6032
156
+ spacr-0.3.64.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
157
+ spacr-0.3.64.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
158
+ spacr-0.3.64.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
159
+ spacr-0.3.64.dist-info/RECORD,,
File without changes