spacr 0.3.62__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +2 -0
- spacr/io.py +3 -1
- spacr/ml.py +205 -0
- spacr/plot.py +126 -2
- spacr/settings.py +64 -0
- spacr/stats.py +221 -0
- spacr/submodules.py +275 -1
- spacr/utils.py +0 -23
- {spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/METADATA +1 -1
- {spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/RECORD +14 -13
- {spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/LICENSE +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/WHEEL +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/top_level.txt +0 -0
spacr/__init__.py
CHANGED
@@ -27,6 +27,7 @@ from . import openai
 from . import ml
 from . import toxo
 from . import cellpose
+from . import stats
 from . import logger

 __all__ = [
@@ -57,6 +58,7 @@ __all__ = [
     "ml",
     "toxo",
     "cellpose",
+    "stats",
     "logger"
 ]

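The practical effect of this change is that the new stats module becomes importable from the package root. A minimal sketch, assuming spacr 0.3.65 is installed:

import spacr

# choose_p_adjust_method is pure logic, so it runs without any project data:
# 4 groups -> 6 pairwise comparisons, 10 points per group -> 'holm'
print(spacr.stats.choose_p_adjust_method(num_groups=4, num_data_points=10))
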
spacr/io.py
CHANGED
@@ -2551,6 +2551,7 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
         png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
         if verbose:
             print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
+            print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
         merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
         merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)

@@ -2562,7 +2563,8 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
     metadata.set_index('prcfo', inplace=True)

     # Merge metadata with final merged DataFrame
-    merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
+    #merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
+    merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
     merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)

     if verbose:
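The behavioural consequence of dropping `.dropna(axis=1)` is that measurement columns containing any NaN are no longer silently discarded during the metadata merge. A standalone sketch of the difference, on toy data rather than spacr's real schema:

import numpy as np
import pandas as pd

metadata = pd.DataFrame({'condition': ['pc', 'nc']}, index=['o1', 'o2'])
measurements = pd.DataFrame(
    {'cell_area': [410.0, 395.0], 'pathogen_area': [120.0, np.nan]},
    index=['o1', 'o2'],
)

merged = metadata.merge(measurements, left_index=True, right_index=True)
print(merged.columns.tolist())                 # new behaviour keeps 'pathogen_area'
print(merged.dropna(axis=1).columns.tolist())  # old behaviour dropped the whole column
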
spacr/ml.py
CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import numpy as np
 from scipy import stats
 from scipy.stats import shapiro
+from math import pi

 from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
 from sklearn.metrics import mean_squared_error
@@ -1515,3 +1516,207 @@ def _calculate_similarity(df, features, col_to_compare, val1, val2):

     return df

+def interperate_vision_model(settings={}):
+
+    from .io import _read_and_merge_data, _results_to_csv
+    from .settings import set_interperate_vision_model_defaults
+    from .utils import save_settings
+
+    settings = set_interperate_vision_model_defaults(settings)
+    save_settings(settings, name='interperate_vision_model', show=True)
+
+    # Function to create radar plot for individual and combined values
+    def create_extended_radar_plot(values, labels, title):
+        values = list(values) + [values[0]]  # Close the loop for radar chart
+        angles = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
+        angles += angles[:1]
+
+        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
+        ax.plot(angles, values, linewidth=2, linestyle='solid')
+        ax.fill(angles, values, alpha=0.25)
+
+        ax.set_xticks(angles[:-1])
+        ax.set_xticklabels(labels, fontsize=10, rotation=45, ha='right')
+        plt.title(title, pad=20)
+        plt.show()
+
+    def extract_compartment_channel(feature_name):
+        # Identify compartment as the first part before an underscore
+        compartment = feature_name.split('_')[0]
+
+        if compartment == 'cells':
+            compartment = 'cell'
+
+        # Identify channels based on substring presence
+        channels = []
+        if 'channel_0' in feature_name:
+            channels.append('channel_0')
+        if 'channel_1' in feature_name:
+            channels.append('channel_1')
+        if 'channel_2' in feature_name:
+            channels.append('channel_2')
+        if 'channel_3' in feature_name:
+            channels.append('channel_3')
+
+        # If multiple channels are found, join them with a '+'
+        if channels:
+            channel = ' + '.join(channels)
+        else:
+            channel = 'morphology'  # Use 'morphology' if no channel identifier is found
+
+        return (compartment, channel)
+
+    def read_and_preprocess_data(settings):
+
+        df, _ = _read_and_merge_data(
+            locs=[settings['src']+'/measurements/measurements.db'],
+            tables=settings['tables'],
+            verbose=True,
+            nuclei_limit=settings['nuclei_limit'],
+            pathogen_limit=settings['pathogen_limit']
+        )
+
+        scores_df = pd.read_csv(settings['scores'])
+
+        # Clean and align columns for merging
+        df['object_label'] = df['object_label'].str.replace('o', '')
+
+        if 'row_name' not in scores_df.columns:
+            scores_df['row_name'] = scores_df['row']
+
+        if 'column_name' not in scores_df.columns:
+            scores_df['column_name'] = scores_df['col']
+
+        if 'object_label' not in scores_df.columns:
+            scores_df['object_label'] = scores_df['object']
+
+        # Remove the 'o' prefix from 'object_label' in df, ensuring it is a string type
+        df['object_label'] = df['object_label'].str.replace('o', '').astype(str)
+
+        # Ensure 'object_label' in scores_df is also a string
+        scores_df['object_label'] = scores_df['object'].astype(str)
+
+        # Ensure all join columns have the same data type in both DataFrames
+        df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+        scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+
+        # Select only the necessary columns from scores_df for merging
+        scores_df = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label', settings['score_column']]]
+
+        # Now merge DataFrames
+        merged_df = pd.merge(df, scores_df, on=['plate', 'row_name', 'column_name', 'field', 'object_label'], how='inner')
+
+        # Separate numerical features and the score column
+        X = merged_df.select_dtypes(include='number').drop(columns=[settings['score_column']])
+        y = merged_df[settings['score_column']]
+
+        return X, y, merged_df
+
+    X, y, merged_df = read_and_preprocess_data(settings)
+
+    # Step 1: Feature Importance using Random Forest
+    if settings['feature_importance'] or settings['feature_importance']:
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X, y)
+
+        if settings['feature_importance']:
+            print(f"Feature Importance ...")
+            feature_importances = model.feature_importances_
+            feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
+            feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
+            top_feature_importance_df = feature_importance_df.head(settings['top_features'])
+
+            # Plot Feature Importance
+            plt.figure(figsize=(10, 6))
+            plt.barh(top_feature_importance_df['feature'], top_feature_importance_df['importance'])
+            plt.xlabel('Importance')
+            plt.title(f"Top {settings['top_features']} Features - Feature Importance")
+            plt.gca().invert_yaxis()
+            plt.show()
+
+            if settings['save']:
+                _results_to_csv(feature_importance_df, filename='feature_importance.csv')
+
+    # Step 2: Permutation Importance
+    if settings['permutation_importance']:
+        print(f"Permutation Importance ...")
+        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=settings['n_jobs'])
+        perm_importance_df = pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
+        perm_importance_df = perm_importance_df.sort_values(by='importance', ascending=False)
+        top_perm_importance_df = perm_importance_df.head(settings['top_features'])
+
+        # Plot Permutation Importance
+        plt.figure(figsize=(10, 6))
+        plt.barh(top_perm_importance_df['feature'], top_perm_importance_df['importance'])
+        plt.xlabel('Importance')
+        plt.title(f"Top {settings['top_features']} Features - Permutation Importance")
+        plt.gca().invert_yaxis()
+        plt.show()
+
+        if settings['save']:
+            _results_to_csv(perm_importance_df, filename='permutation_importance.csv')
+
+    # Step 3: SHAP Analysis
+    if settings['shap']:
+        print(f"SHAP Analysis ...")
+
+        # Select top N features based on Random Forest importance and fit the model on these features only
+        top_features = feature_importance_df.head(settings['top_features'])['feature']
+        X_top = X[top_features]
+
+        # Refit the model on this subset of features
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X_top, y)
+
+        # Sample a smaller subset of rows to speed up SHAP
+        if settings['shap_sample']:
+            sample = int(len(X_top) / 100)
+            X_sample = X_top.sample(min(sample, len(X_top)), random_state=42)
+        else:
+            X_sample = X_top
+
+        # Initialize SHAP explainer with the same subset of features
+        explainer = shap.Explainer(model.predict, X_sample)
+        shap_values = explainer(X_sample, max_evals=1500)
+
+        # Plot SHAP summary for the selected sample and top features
+        shap.summary_plot(shap_values, X_sample, max_display=settings['top_features'])
+
+        # Convert SHAP values to a DataFrame for easier manipulation
+        shap_df = pd.DataFrame(shap_values.values, columns=X_sample.columns)
+
+        # Apply the function to create MultiIndex columns with compartment and channel
+        shap_df.columns = pd.MultiIndex.from_tuples(
+            [extract_compartment_channel(feat) for feat in shap_df.columns],
+            names=['compartment', 'channel']
+        )
+
+        # Aggregate SHAP values by compartment and channel
+        compartment_mean = shap_df.abs().groupby(level='compartment', axis=1).mean().mean(axis=0)
+        channel_mean = shap_df.abs().groupby(level='channel', axis=1).mean().mean(axis=0)
+
+        # Calculate combined importance for each pair of compartments and channels
+        combined_compartment = {}
+        for i, comp1 in enumerate(compartment_mean.index):
+            for comp2 in compartment_mean.index[i+1:]:
+                combined_compartment[f"{comp1} + {comp2}"] = shap_df.loc[:, (comp1, slice(None))].abs().mean().mean() + \
+                    shap_df.loc[:, (comp2, slice(None))].abs().mean().mean()
+
+        combined_channel = {}
+        for i, chan1 in enumerate(channel_mean.index):
+            for chan2 in channel_mean.index[i+1:]:
+                combined_channel[f"{chan1} + {chan2}"] = shap_df.loc[:, (slice(None), chan1)].abs().mean().mean() + \
+                    shap_df.loc[:, (slice(None), chan2)].abs().mean().mean()
+
+        # Prepare values and labels for radar charts
+        all_compartment_importance = list(compartment_mean.values) + list(combined_compartment.values())
+        all_compartment_labels = list(compartment_mean.index) + list(combined_compartment.keys())
+
+        all_channel_importance = list(channel_mean.values) + list(combined_channel.values())
+        all_channel_labels = list(channel_mean.index) + list(combined_channel.keys())
+
+        # Create radar plots for compartments and channels
+        create_extended_radar_plot(all_compartment_importance, all_compartment_labels, "SHAP Importance by Compartment (Individual and Combined)")
+        create_extended_radar_plot(all_channel_importance, all_channel_labels, "SHAP Importance by Channel (Individual and Combined)")
+
+    return merged_df
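A sketch of how this new entry point might be invoked, assuming a standard spacr project layout; the paths are placeholders, and the scores CSV is assumed to carry the join columns (plate, row/row_name, col/column_name, field, object/object_label) plus the score column:

from spacr.ml import interperate_vision_model

settings = {
    'src': '/data/plate1',                # expects /data/plate1/measurements/measurements.db
    'scores': '/data/plate1/scores.csv',  # per-object model scores (placeholder path)
    'score_column': 'cv_predictions',
    'feature_importance': True,
    'permutation_importance': True,
    'shap': True,
    'shap_sample': True,
    'top_features': 30,
    'n_jobs': -1,
    'save': False,
}
merged_df = interperate_vision_model(settings)
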
spacr/plot.py
CHANGED
@@ -17,7 +17,7 @@ from skimage.measure import find_contours, label, regionprops
 from skimage.segmentation import mark_boundaries
 from skimage.transform import resize as sk_resize
 import scikit_posthocs as sp
-
+from scipy.stats import chi2_contingency
 import tifffile as tiff

 from scipy.stats import normaltest, ttest_ind, mannwhitneyu, f_oneway, kruskal
@@ -2609,7 +2609,7 @@ class spacrGraph:
     def perform_posthoc_tests(self, is_normal, unique_groups):
         """Perform post-hoc tests for multiple groups based on all_to_all flag."""

-        from .utils import choose_p_adjust_method
+        from .stats import choose_p_adjust_method

         posthoc_results = []
         if is_normal and len(unique_groups) > 2 and self.all_to_all:
@@ -3688,3 +3688,127 @@ def overlay_masks_on_images(img_folder, normalize=True, resize=True, save=False,
     plt.axis('off')
     plt.show()

+def graph_importance(settings):
+
+    from .settings import set_graph_importance_defaults
+    from .utils import save_settings
+
+    if not isinstance(settings['csvs'], list):
+        settings['csvs'] = settings['csvs']
+
+    settings['src'] = os.path.dirname(settings['csvs'][0])
+
+    settings = set_graph_importance_defaults(settings)
+    save_settings(settings, name='graph_importance')
+
+    dfs = []
+    for path in settings['csvs']:
+        dft = pd.read_csv(path)
+        dfs.append(dft)
+
+    df = pd.concat(dfs)
+
+    if not all(col in df.columns for col in (settings['grouping_column'], settings['data_column'])):
+        print(f"grouping {settings['grouping_column']} and data {settings['data_column']} columns must be in {df.columns.to_list()}")
+        return
+
+    output_dir = os.path.dirname(settings['csvs'][0])
+
+    spacr_graph = spacrGraph(
+        df=df,
+        grouping_column=settings['grouping_column'],
+        data_column=settings['data_column'],
+        graph_type=settings['graph_type'],
+        graph_name=settings['grouping_column'],
+        summary_func='mean',
+        colors=None,
+        output_dir=output_dir,
+        save=settings['save'],
+        y_lim=None,
+        error_bar_type='std',
+        representation='object',
+        theme='muted',
+    )
+
+    # Create the plot
+    spacr_graph.create_plot()
+
+    # Get the figure object if needed
+    fig = spacr_graph.get_figure()
+    plt.show()
+
+def plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+    """
+    Generate a stacked bar plot for proportions and perform chi-squared and pairwise tests.
+
+    Parameters:
+    - settings (dict): Analysis settings.
+    - df (DataFrame): Input data.
+    - group_column (str): Column indicating the groups.
+    - bin_column (str): Column indicating the categories.
+    - prc_column (str): Optional; column for additional stratification.
+    - level (str): Level of aggregation ('well' or 'object').
+
+    Returns:
+    - chi2 (float): Chi-squared statistic for the overall test.
+    - p (float): p-value for the overall chi-squared test.
+    - dof (int): Degrees of freedom for the overall chi-squared test.
+    - expected (ndarray): Expected frequencies for the overall chi-squared test.
+    - raw_counts (DataFrame): Contingency table of observed counts.
+    - fig (Figure): The generated plot.
+    - pairwise_results (list): Pairwise test results from `chi_pairwise`.
+    """
+
+    from .stats import chi_pairwise
+
+    # Calculate contingency table for overall chi-squared test
+    raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+    chi2, p, dof, expected = chi2_contingency(raw_counts)
+    print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+    print(f"p-value (raw data): {p:.4e}")
+
+    # Perform pairwise comparisons
+    pairwise_results = chi_pairwise(raw_counts, verbose=settings.get('verbose', False))
+
+    # Plot based on level setting
+    if level == 'well':
+        # Aggregate by well for mean ± SD visualization
+        well_proportions = (
+            df.groupby([group_column, prc_column, bin_column])
+            .size()
+            .groupby(level=[0, 1])
+            .apply(lambda x: x / x.sum())
+            .unstack(fill_value=0)
+        )
+        mean_proportions = well_proportions.groupby(group_column).mean()
+        std_proportions = well_proportions.groupby(group_column).std()
+
+        ax = mean_proportions.plot(
+            kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+        )
+        plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+    else:
+        # Object-level plotting without aggregation
+        group_counts = df.groupby([group_column, bin_column]).size()
+        group_totals = group_counts.groupby(level=0).sum()
+        proportions = group_counts / group_totals
+        proportion_df = proportions.unstack(fill_value=0)
+
+        ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+        plt.title('Proportion of Volume Bins by Group')
+
+    plt.xlabel('Group')
+    plt.ylabel('Proportion')
+
+    # Update legend with formatted labels, maintaining correct order
+    plt.legend(title=f'Classes', bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.ylim(0, 1)
+    fig = plt.gcf()
+
+    results_df = pd.DataFrame({
+        'chi_squared_stat': [chi2],
+        'p_value': [p],
+        'degrees_of_freedom': [dof]
+    })
+
+    return results_df, pairwise_results, fig
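A self-contained sketch of the new proportion plot on toy data; only the single settings key the function actually reads ('verbose') is supplied, and the column names are made up:

import pandas as pd
from spacr.plot import plot_proportion_stacked_bars

df = pd.DataFrame({
    'condition': ['nc'] * 40 + ['pc'] * 40,
    'class': ([0] * 30 + [1] * 10) + ([0] * 12 + [1] * 28),
})
# Returns the overall chi-squared summary, the pairwise (Fisher/chi-square)
# table from chi_pairwise, and the stacked-bar figure.
results_df, pairwise_df, fig = plot_proportion_stacked_bars(
    {'verbose': True}, df, group_column='condition', bin_column='class', level='object'
)
print(results_df)
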
spacr/settings.py
CHANGED
@@ -1370,4 +1370,68 @@ def get_analyze_plaque_settings(settings):
     settings.setdefault('rescale', False)
     settings.setdefault('resample', False)
     settings.setdefault('fill_in', True)
+    return settings
+
+def set_graph_importance_defaults(settings):
+    settings.setdefault('csvs','list of paths')
+    settings.setdefault('grouping_column','compartment')
+    settings.setdefault('data_column','compartment_importance_sum')
+    settings.setdefault('graph_type','jitter_bar')
+    settings.setdefault('save',False)
+    return settings
+
+def set_interperate_vision_model_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('scores','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen','cytoplasm'])
+    settings.setdefault('feature_importance',True)
+    settings.setdefault('permutation_importance',False)
+    settings.setdefault('shap',True)
+    settings.setdefault('save',False)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('top_features',30)
+    settings.setdefault('shap_sample',True)
+    settings.setdefault('n_jobs',-1)
+    settings.setdefault('shap_approximate',True)
+    settings.setdefault('score_column','cv_predictions')
+    return settings
+
+def set_analyze_endodyogeny_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc', 'pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'], ['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('min_area_bin',500)
+    settings.setdefault('group_column','pathogen')
+    settings.setdefault('compartment','pathogen')
+    settings.setdefault('pathogen_limit',1)
+    settings.setdefault('nuclei_limit',10)
+    settings.setdefault('level','object')
+    settings.setdefault('um_per_px',0.1)
+    settings.setdefault('max_bins',None)
+    settings.setdefault('save',False)
+    settings.setdefault('verbose',False)
+    return settings
+
+def set_analyze_class_proportion_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc','pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'],['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('group_column','condition')
+    settings.setdefault('class_column','test')
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('level','well')
+    settings.setdefault('save',False)
+    settings.setdefault('verbose', False)
     return settings
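All four new helpers follow the same pattern as the rest of settings.py: dict.setdefault fills in missing keys while leaving caller-supplied values untouched. A two-line illustration:

settings = {'save': True}
settings.setdefault('graph_type', 'jitter_bar')  # missing key: default is added
settings.setdefault('save', False)               # existing key: caller's True wins
print(settings)  # {'save': True, 'graph_type': 'jitter_bar'}
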
spacr/stats.py
ADDED
@@ -0,0 +1,221 @@
+from scipy.stats import shapiro, normaltest, levene, ttest_ind, mannwhitneyu, kruskal, f_oneway
+from statsmodels.stats.multicomp import pairwise_tukeyhsd
+import scikit_posthocs as sp
+import numpy as np
+import pandas as pd
+from scipy.stats import chi2_contingency, fisher_exact
+import itertools
+from statsmodels.stats.multitest import multipletests
+
+
+def choose_p_adjust_method(num_groups, num_data_points):
+    """
+    Selects the most appropriate p-value adjustment method based on data characteristics.
+
+    Parameters:
+    - num_groups: Number of unique groups being compared
+    - num_data_points: Number of data points per group (assuming balanced groups)
+
+    Returns:
+    - A string representing the recommended p-adjustment method
+    """
+    num_comparisons = (num_groups * (num_groups - 1)) // 2  # Number of pairwise comparisons
+
+    # Decision logic for choosing the adjustment method
+    if num_comparisons <= 10 and num_data_points > 5:
+        return 'holm'  # Balanced between power and Type I error control
+    elif num_comparisons > 10 and num_data_points <= 5:
+        return 'fdr_bh'  # FDR control for large number of comparisons and small sample size
+    elif num_comparisons <= 10:
+        return 'sidak'  # Less conservative than Bonferroni, good for independent comparisons
+    else:
+        return 'bonferroni'  # Very conservative, use for strict control of Type I errors
+
+def perform_normality_tests(df, grouping_column, data_columns):
+    """Perform normality tests for each group and data column."""
+    unique_groups = df[grouping_column].unique()
+    normality_results = []
+
+    for column in data_columns:
+        for group in unique_groups:
+            data = df.loc[df[grouping_column] == group, column].dropna()
+            n_samples = len(data)
+
+            if n_samples < 3:
+                # Skip test if there aren't enough data points
+                print(f"Skipping normality test for group '{group}' on column '{column}' - Not enough data.")
+                normality_results.append({
+                    'Comparison': f'Normality test for {group} on {column}',
+                    'Test Statistic': None,
+                    'p-value': None,
+                    'Test Name': 'Skipped',
+                    'Column': column,
+                    'n': n_samples
+                })
+                continue
+
+            # Choose the appropriate normality test based on the sample size
+            if n_samples >= 8:
+                stat, p_value = normaltest(data)
+                test_name = "D'Agostino-Pearson test"
+            else:
+                stat, p_value = shapiro(data)
+                test_name = "Shapiro-Wilk test"
+
+            normality_results.append({
+                'Comparison': f'Normality test for {group} on {column}',
+                'Test Statistic': stat,
+                'p-value': p_value,
+                'Test Name': test_name,
+                'Column': column,
+                'n': n_samples
+            })
+
+        # Check if all groups are normally distributed (p > 0.05)
+        normal_p_values = [result['p-value'] for result in normality_results if result['Column'] == column and result['p-value'] is not None]
+        is_normal = all(p > 0.05 for p in normal_p_values)
+
+    return is_normal, normality_results
+
+
+def perform_levene_test(df, grouping_column, data_column):
+    """Perform Levene's test for equal variance."""
+    unique_groups = df[grouping_column].unique()
+    grouped_data = [df.loc[df[grouping_column] == group, data_column].dropna() for group in unique_groups]
+    stat, p_value = levene(*grouped_data)
+    return stat, p_value
+
+def perform_statistical_tests(df, grouping_column, data_columns, paired=False):
+    """Perform statistical tests for each data column."""
+    unique_groups = df[grouping_column].unique()
+    test_results = []
+
+    for column in data_columns:
+        grouped_data = [df.loc[df[grouping_column] == group, column].dropna() for group in unique_groups]
+        if len(unique_groups) == 2:  # For two groups
+            if paired:
+                print("Performing paired tests (not implemented in this template).")
+                continue  # Extend as needed
+            else:
+                # Check normality for two groups
+                is_normal, _ = perform_normality_tests(df, grouping_column, [column])
+                if is_normal:
+                    stat, p = ttest_ind(grouped_data[0], grouped_data[1])
+                    test_name = 'T-test'
+                else:
+                    stat, p = mannwhitneyu(grouped_data[0], grouped_data[1])
+                    test_name = 'Mann-Whitney U test'
+        else:
+            # Check normality for multiple groups
+            is_normal, _ = perform_normality_tests(df, grouping_column, [column])
+            if is_normal:
+                stat, p = f_oneway(*grouped_data)
+                test_name = 'One-way ANOVA'
+            else:
+                stat, p = kruskal(*grouped_data)
+                test_name = 'Kruskal-Wallis test'
+
+        test_results.append({
+            'Column': column,
+            'Test Name': test_name,
+            'Test Statistic': stat,
+            'p-value': p,
+            'Groups': len(unique_groups)
+        })
+
+    return test_results
+
+
+def perform_posthoc_tests(df, grouping_column, data_column, is_normal):
+    """Perform post-hoc tests for multiple groups with both original and adjusted p-values."""
+    unique_groups = df[grouping_column].unique()
+    posthoc_results = []
+
+    if len(unique_groups) > 2:
+        num_groups = len(unique_groups)
+        num_data_points = len(df[data_column].dropna()) // num_groups  # Assuming roughly equal data points per group
+        p_adjust_method = choose_p_adjust_method(num_groups, num_data_points)
+
+        if is_normal:
+            # Tukey's HSD automatically adjusts p-values
+            tukey_result = pairwise_tukeyhsd(df[data_column], df[grouping_column], alpha=0.05)
+            for comparison, p_value in zip(tukey_result._results_table.data[1:], tukey_result.pvalues):
+                posthoc_results.append({
+                    'Comparison': f"{comparison[0]} vs {comparison[1]}",
+                    'Original p-value': None,  # Tukey HSD does not provide raw p-values
+                    'Adjusted p-value': p_value,
+                    'Adjusted Method': 'Tukey HSD',
+                    'Test Name': 'Tukey HSD'
+                })
+        else:
+            # Dunn's test with p-value adjustment
+            raw_dunn_result = sp.posthoc_dunn(df, val_col=data_column, group_col=grouping_column, p_adjust=None)
+            adjusted_dunn_result = sp.posthoc_dunn(df, val_col=data_column, group_col=grouping_column, p_adjust=p_adjust_method)
+            for i, group_a in enumerate(adjusted_dunn_result.index):
+                for j, group_b in enumerate(adjusted_dunn_result.columns):
+                    if i < j:  # Only consider unique pairs
+                        posthoc_results.append({
+                            'Comparison': f"{group_a} vs {group_b}",
+                            'Original p-value': raw_dunn_result.iloc[i, j],
+                            'Adjusted p-value': adjusted_dunn_result.iloc[i, j],
+                            'Adjusted Method': p_adjust_method,
+                            'Test Name': "Dunn's Post-hoc"
+                        })
+
+    return posthoc_results
+
+def chi_pairwise(raw_counts, verbose=False):
+    """
+    Perform pairwise chi-square or Fisher's exact tests between all unique group pairs
+    and apply p-value correction.
+
+    Parameters:
+    - raw_counts (DataFrame): Contingency table with group-wise counts.
+    - verbose (bool): Whether to print results for each pair.
+
+    Returns:
+    - pairwise_df (DataFrame): DataFrame with pairwise test results, including corrected p-values.
+    """
+    pairwise_results = []
+    groups = raw_counts.index.unique()  # Use index from raw_counts for group pairs
+    raw_p_values = []  # Store raw p-values for correction later
+
+    # Calculate the number of groups and average number of data points per group
+    num_groups = len(groups)
+    num_data_points = raw_counts.sum(axis=1).mean()  # Average total data points per group
+    p_adjust_method = choose_p_adjust_method(num_groups, num_data_points)
+
+    for group1, group2 in itertools.combinations(groups, 2):
+        contingency_table = raw_counts.loc[[group1, group2]].values
+        if contingency_table.shape[1] == 2:  # Fisher's Exact Test for 2x2 tables
+            oddsratio, p_value = fisher_exact(contingency_table)
+            test_name = "Fisher's Exact Test"
+        else:  # Chi-Square Test for larger tables
+            chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
+            test_name = 'Pairwise Chi-Square Test'
+
+        pairwise_results.append({
+            'Group 1': group1,
+            'Group 2': group2,
+            'Test Name': test_name,
+            'p-value': p_value
+        })
+        raw_p_values.append(p_value)
+
+    # Apply p-value correction
+    corrected_p_values = multipletests(raw_p_values, method=p_adjust_method)[1]
+
+    # Add corrected p-values to results
+    for i, result in enumerate(pairwise_results):
+        result['p-value_adj'] = corrected_p_values[i]
+
+    pairwise_df = pd.DataFrame(pairwise_results)
+
+    pairwise_df['adj'] = p_adjust_method
+
+    if verbose:
+        # Print pairwise results
+        print("\nPairwise Frequency Analysis Results:")
+        print(pairwise_df.to_string(index=False))
+
+    return pairwise_df
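The new module is usable on its own; a small sketch exercising the adjustment-method heuristic and the pairwise frequency test on a made-up contingency table:

import pandas as pd
from spacr.stats import choose_p_adjust_method, chi_pairwise

# 3 groups -> 3 pairwise comparisons, >5 points per group -> 'holm'
print(choose_p_adjust_method(num_groups=3, num_data_points=20))

raw_counts = pd.DataFrame(
    {'bin_1': [30, 12, 18], 'bin_2': [10, 25, 22], 'bin_3': [5, 8, 15]},
    index=pd.Index(['nc', 'pc', 'treated'], name='condition'),
)
# 3 columns, so each pair uses the chi-square branch; 'holm' correction applied
pairwise_df = chi_pairwise(raw_counts, verbose=True)
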
spacr/submodules.py
CHANGED
@@ -10,6 +10,7 @@ from IPython.display import display
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.inspection import permutation_importance
 from math import pi
+from scipy.stats import chi2_contingency

 import matplotlib.pyplot as plt
 from natsort import natsorted
@@ -844,4 +845,277 @@ def interperate_vision_model(settings={}):
     df.to_csv(save_path)
     print(f"Saved {save_path}")

-    return output
+    return output
+
+def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+    # Always calculate chi-squared on raw data
+    raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+    chi2, p, dof, expected = chi2_contingency(raw_counts)
+    print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+    print(f"p-value (raw data): {p:.4e}")
+
+    # Extract bin labels and indices for formatting the legend in the correct order
+    bin_labels = df[bin_column].cat.categories if pd.api.types.is_categorical_dtype(df[bin_column]) else sorted(df[bin_column].unique())
+    bin_indices = range(1, len(bin_labels) + 1)
+    legend_labels = [f"{index}: {label}" for index, label in zip(bin_indices, bin_labels)]
+
+    # Plot based on level setting
+    if level == 'well':
+        # Aggregate by well for mean ± SD visualization
+        well_proportions = (
+            df.groupby([group_column, prc_column, bin_column])
+            .size()
+            .groupby(level=[0, 1])
+            .apply(lambda x: x / x.sum())
+            .unstack(fill_value=0)
+        )
+        mean_proportions = well_proportions.groupby(group_column).mean()
+        std_proportions = well_proportions.groupby(group_column).std()
+
+        ax = mean_proportions.plot(
+            kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+        )
+        plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+    else:
+        # Object-level plotting without aggregation
+        group_counts = df.groupby([group_column, bin_column]).size()
+        group_totals = group_counts.groupby(level=0).sum()
+        proportions = group_counts / group_totals
+        proportion_df = proportions.unstack(fill_value=0)
+
+        ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+        plt.title('Proportion of Volume Bins by Group')
+
+    plt.xlabel('Group')
+    plt.ylabel('Proportion')
+
+    # Update legend with formatted labels, maintaining correct order
+    volume_unit = "px³" if settings['um_per_px'] is None else "µm³"
+    plt.legend(legend_labels, title=f'Volume Range ({volume_unit})', bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.ylim(0, 1)
+    fig = plt.gcf()
+    return chi2, p, dof, expected, raw_counts, fig
+
+def analyze_endodyogeny(settings):
+
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_endodyogeny_defaults
+    from .plot import plot_proportion_stacked_bars
+
+    def _calculate_volume_bins(df, compartment='pathogen', min_area_bin=500, max_bins=None, verbose=False):
+        area_column = f'{compartment}_area'
+        df[f'{compartment}_volume'] = df[area_column] ** 1.5
+        min_volume_bin = min_area_bin ** 1.5
+        max_volume = df[f'{compartment}_volume'].max()
+
+        # Generate bin edges as floats, and filter out any duplicate edges
+        bins = [min_volume_bin * (2 ** i) for i in range(int(np.ceil(np.log2(max_volume / min_volume_bin)) + 1))]
+        bins = sorted(set(bins))  # Ensure bin edges are unique
+
+        # Create bin labels as ranges with decimal precision for float values (e.g., "500.0-1000.0")
+        bin_labels = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]
+        if verbose:
+            print('Volume bins:', bins)
+            print('Volume bin labels:', bin_labels)
+
+        # Apply the bins to create a new column with the binned labels
+        df[f'{compartment}_volume_bin'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=bin_labels, right=False)
+
+        # Create a bin index column (numeric version of bins)
+        df['bin_index'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=range(1, len(bins)), right=False).astype(int)
+
+        # Adjust bin indices and labels based on max_bins
+        if max_bins is not None:
+            df.loc[df['bin_index'] > max_bins, 'bin_index'] = max_bins
+
+            # Update bin labels to reflect capped bins
+            bin_labels = bin_labels[:max_bins - 1] + [f">{bins[max_bins - 1]:.2f}"]
+            df[f'{compartment}_volume_bin'] = df['bin_index'].map(
+                {i + 1: label for i, label in enumerate(bin_labels)}
+            )
+
+        if verbose:
+            print(df[[f'{compartment}_volume', f'{compartment}_volume_bin', 'bin_index']].head())
+
+        return df
+
+    settings = set_analyze_endodyogeny_defaults(settings)
+    save_settings(settings, name='analyze_endodyogeny', show=True)
+    output = {}
+
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+
+    if not settings['um_per_px'] is None:
+        df[f"{settings['compartment']}_area"] = df[f"{settings['compartment']}_area"] * (settings['um_per_px'] ** 2)
+        settings['min_area_bin'] = settings['min_area_bin'] * (settings['um_per_px'] ** 2)
+
+    df = df[df[f"{settings['compartment']}_area"] >= settings['min_area_bin']]
+
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+
+    df = df.dropna(subset=[settings['group_column']])
+    df = _calculate_volume_bins(df, settings['compartment'], settings['min_area_bin'], settings['max_bins'], settings['verbose'])
+    output['data'] = df
+    # Perform chi-squared test and plot
+    results_df, pairwise_results_df, fig = plot_proportion_stacked_bars(settings, df, settings['group_column'], bin_column=f"{settings['compartment']}_volume_bin", level=settings['level'])
+
+    # Extract bin labels and indices for formatting the legend in the correct order
+    bin_labels = df[f"{settings['compartment']}_volume_bin"].cat.categories if pd.api.types.is_categorical_dtype(df[f"{settings['compartment']}_volume_bin"]) else sorted(df[f"{settings['compartment']}_volume_bin"].unique())
+    bin_indices = range(1, len(bin_labels) + 1)
+    legend_labels = [f"{index}: {label}" for index, label in zip(bin_indices, bin_labels)]
+
+    # Update legend with formatted labels, maintaining correct order
+    volume_unit = "px³" if settings['um_per_px'] is None else "µm³"
+    plt.legend(legend_labels, title=f'Volume Range ({volume_unit})', bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.ylim(0, 1)
+
+    output['chi_squared'] = results_df
+
+    if settings['save']:
+        # Save DataFrame to CSV
+        output_dir = os.path.join(settings['src'][0], 'results', 'analyze_endodyogeny')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, 'chi_squared_results.csv')
+        output_path_pairwise = os.path.join(output_dir, 'chi_squared_results.csv')
+        output_path_fig = os.path.join(output_dir, 'chi_squared_results.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path, index=False)
+        pairwise_results_df.to_csv(output_path_pairwise, index=False)
+        print(f"Chi-squared results saved to {output_path}")
+
+    plt.show()
+
+    return output
+
+def analyze_class_proportion(settings):
+
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_class_proportion_defaults
+    from .plot import plot_plates, plot_proportion_stacked_bars
+    from .stats import perform_normality_tests, perform_levene_test, perform_statistical_tests, perform_posthoc_tests
+
+    settings = set_analyze_class_proportion_defaults(settings)
+    save_settings(settings, name='analyze_class_proportion', show=True)
+    output = {}
+
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+
+    if 'png_list' not in settings['tables']:
+        settings['tables'] = settings['tables'] + ['png_list']
+
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+
+    df[settings['class_column']] = df[settings['class_column']].fillna(0)
+    output['data'] = df
+
+    # Perform chi-squared test and plot
+    results_df, pairwise_results, fig = plot_proportion_stacked_bars(settings, df, settings['group_column'], bin_column=settings['class_column'], level=settings['level'])
+
+    output['chi_squared'] = results_df
+
+    if settings['save']:
+        output_dir = os.path.join(settings['src'][0], 'results', 'analyze_class_proportion')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path_chi = os.path.join(output_dir, 'class_chi_squared_results.csv')
+        output_path_chi_pairwise = os.path.join(output_dir, 'class_frequency_test.csv')
+        output_path_data = os.path.join(output_dir, 'class_chi_squared_data.csv')
+        output_path_fig = os.path.join(output_dir, 'class_chi_squared.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path_chi, index=False)
+        pairwise_results.to_csv(output_path_chi_pairwise, index=False)
+        df.to_csv(output_path_data, index=False)
+        print(f"Chi-squared results saved to {output_path_chi}")
+        print(f"Annotated data saved to {output_path_data}")
+
+    plt.show()
+
+    fig2 = plot_plates(df, variable=settings['class_column'], grouping='mean', min_max='allq', cmap='viridis', min_count=0, verbose=True, dst=None)
+    if settings['save']:
+        output_path_fig2 = os.path.join(output_dir, 'class_heatmap.pdf')
+        fig2.savefig(output_path_fig2, dpi=300, bbox_inches='tight')
+
+    plt.show()
+
+    # Perform normality, variance, and statistical tests
+    is_normal, normality_results = perform_normality_tests(df, settings['group_column'], [settings['class_column']])
+    variance_stat, variance_p = perform_levene_test(df, settings['group_column'], settings['class_column'])
+
+    print(f"Levene's test statistic: {variance_stat:.4f}, p-value: {variance_p:.4e}")
+    variance_results = {
+        'Test Statistic': variance_stat,
+        'p-value': variance_p,
+        'Test Name': "Levene's Test"
+    }
+
+    test_results = perform_statistical_tests(df, settings['group_column'], [settings['class_column']])
+    posthoc_results = perform_posthoc_tests(
+        df, settings['group_column'], settings['class_column'], is_normal=is_normal
+    )
+
+    # Save additional results
+    if settings['save']:
+        pd.DataFrame(normality_results).to_csv(os.path.join(output_dir, 'normality_results.csv'), index=False)
+        pd.DataFrame([variance_results]).to_csv(os.path.join(output_dir, 'variance_results.csv'), index=False)
+        pd.DataFrame(test_results).to_csv(os.path.join(output_dir, 'statistical_test_results.csv'), index=False)
+        pd.DataFrame(posthoc_results).to_csv(os.path.join(output_dir, 'posthoc_results.csv'), index=False)
+        print("Statistical analysis results saved.")
+
+    return output
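The binning scheme inside `_calculate_volume_bins` approximates volume as area**1.5 and doubles the bin edge at every step. The core arithmetic, reproduced standalone (the maximum volume here is an assumed example value, not spacr data):

import numpy as np

min_area_bin = 500
min_volume_bin = min_area_bin ** 1.5            # 500 * sqrt(500), about 11180.34
max_volume = 1_000_000.0                        # assumed largest observed volume
n_edges = int(np.ceil(np.log2(max_volume / min_volume_bin)) + 1)
bins = sorted({min_volume_bin * (2 ** i) for i in range(n_edges)})
labels = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]
print(labels)  # doubling bins: 11180.34-22360.68, 22360.68-44721.36, ...
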
spacr/utils.py
CHANGED
@@ -5156,29 +5156,6 @@ def control_filelist(folder, mode='column', values=['01','02']):
     if mode is 'row_name':
         filtered_files = [file for file in files if file.split('_')[1][:1] in values]
     return filtered_files
-
-def choose_p_adjust_method(num_groups, num_data_points):
-    """
-    Selects the most appropriate p-value adjustment method based on data characteristics.
-
-    Parameters:
-    - num_groups: Number of unique groups being compared
-    - num_data_points: Number of data points per group (assuming balanced groups)
-
-    Returns:
-    - A string representing the recommended p-adjustment method
-    """
-    num_comparisons = (num_groups * (num_groups - 1)) // 2  # Number of pairwise comparisons
-
-    # Decision logic for choosing the adjustment method
-    if num_comparisons <= 10 and num_data_points > 5:
-        return 'holm'  # Balanced between power and Type I error control
-    elif num_comparisons > 10 and num_data_points <= 5:
-        return 'fdr_bh'  # FDR control for large number of comparisons and small sample size
-    elif num_comparisons <= 10:
-        return 'sidak'  # Less conservative than Bonferroni, good for independent comparisons
-    else:
-        return 'bonferroni'  # Very conservative, use for strict control of Type I errors

 def rename_columns_in_db(db_path):
     with sqlite3.connect(db_path) as conn:
{spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-spacr/__init__.py,sha256=
+spacr/__init__.py,sha256=fvk5JfLpOqUA1W0yPcsVZnS9qbpXFOceFk09LKolVfw,1627
 spacr/__main__.py,sha256=bkAJJD2kjIqOP-u1kLvct9jQQCeUXzlEjdgitwi1Lm8,75
 spacr/app_annotate.py,sha256=W9eLPa_LZIvXsXx_-0iDFEU938LBDvRy6prXo0qF4KQ,2533
 spacr/app_classify.py,sha256=urTP_wlZ58hSyM5a19slYlBxN0PdC-9-ga0hvq8CGWc,165
@@ -15,20 +15,21 @@ spacr/gui.py,sha256=ARyn9Q_g8HoP-cXh1nzMLVFCKqthY4v2u9yORyaQqQE,8230
 spacr/gui_core.py,sha256=N7R7yvfK_dJhOReM_kW3Ci8Bokhi1OzsxeKqvSGdvV4,41460
 spacr/gui_elements.py,sha256=EKlvEg_4_je7jciEdR3NTgPrcTraowa2e2RUt-xqd6M,138254
 spacr/gui_utils.py,sha256=u9RoIOWpAXFEOnUlLpMQZrc1pWSg6omZsJMIhJdRv_g,41211
-spacr/io.py,sha256=
+spacr/io.py,sha256=YlJAT6H8l4ipunMyKzjqoPcf-1AXgUmSyR1YN9WxmDI,142857
 spacr/logger.py,sha256=lJhTqt-_wfAunCPl93xE65Wr9Y1oIHJWaZMjunHUeIw,1538
 spacr/measure.py,sha256=2lK-ZcTxLM-MpXV1oZnucRD9iz5aprwahRKw9IEqshg,55085
 spacr/mediar.py,sha256=FwLvbLQW5LQzPgvJZG8Lw7GniA2vbZx6Jv6vIKu7I5c,14743
-spacr/ml.py,sha256=
+spacr/ml.py,sha256=GOQJH8jdTrJQwiLlDrcc9-yCxLFaMx4YD4OJs0-R5YI,77947
 spacr/openai.py,sha256=5vBZ3Jl2llYcW3oaTEXgdyCB2aJujMUIO5K038z7w_A,1246
-spacr/plot.py,sha256=
+spacr/plot.py,sha256=LApfosnN9gaF6eGRrPGt3uZIwSwAT7kgRbMnUDuxx0Y,165160
 spacr/sequencing.py,sha256=ClUfwPPK6rNUbUuiEkzcwakzVyDKKUMv9ricrxT8qQY,25227
-spacr/settings.py,sha256=
+spacr/settings.py,sha256=LSoDNuz1m7rySh7MWXEL1xlUU4rFiCRVlGvZCSCOqzU,80085
 spacr/sim.py,sha256=1xKhXimNU3ukzIw-3l9cF3Znc_brW8h20yv8fSTzvss,71173
-spacr/submodules.py,sha256=
+spacr/stats.py,sha256=mbhwsyIqt5upsSD346qGjdCw7CFBa0tIS7zHU9e0jNI,9536
+spacr/submodules.py,sha256=3hgY8MWQTfajJbUIYmHMzYNd42d80L_0aN6bpoTUnu0,55059
 spacr/timelapse.py,sha256=KGfG4L4-QnFfgbF7L6C5wL_3gd_rqr05Foje6RsoTBg,39603
 spacr/toxo.py,sha256=z2nT5aAze3NUIlwnBQcnkARihDwoPfqOgQIVoUluyK0,25087
-spacr/utils.py,sha256=
+spacr/utils.py,sha256=zojZlZtGwwDVDY0fgRt5XViVuJLuxadRO1IYctWm_SQ,221885
 spacr/version.py,sha256=axH5tnGwtgSnJHb5IDhiu4Zjk5GhLyAEDRe-rnaoFOA,409
 spacr/resources/MEDIAR/.gitignore,sha256=Ff1q9Nme14JUd-4Q3jZ65aeQ5X4uttptssVDgBVHYo8,152
 spacr/resources/MEDIAR/LICENSE,sha256=yEj_TRDLUfDpHDNM0StALXIt6mLqSgaV2hcCwa6_TcY,1065
@@ -151,9 +152,9 @@ spacr/resources/icons/umap.png,sha256=dOLF3DeLYy9k0nkUybiZMe1wzHQwLJFRmgccppw-8b
 spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif,sha256=Tl0ZUfZ_AYAbu0up_nO0tPRtF1BxXhWQ3T3pURBCCRo,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif,sha256=m8N-V71rA1TT4dFlENNg8s0Q0YEXXs8slIn7yObmZJQ,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif,sha256=Pbhk7xn-KUP6RSIhJsxQcrHFImBm3GEpLkzx7WOc-5M,7958528
-spacr-0.3.62.dist-info/LICENSE,sha256=
-spacr-0.3.62.dist-info/METADATA,sha256=
-spacr-0.3.62.dist-info/WHEEL,sha256=
-spacr-0.3.62.dist-info/entry_points.txt,sha256=
-spacr-0.3.62.dist-info/top_level.txt,sha256=
-spacr-0.3.62.dist-info/RECORD,,
+spacr-0.3.65.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
+spacr-0.3.65.dist-info/METADATA,sha256=FHAKN1FrIXWI6vqz43lT8VPSPzBpEwRIC54aQaL0Mr8,6032
+spacr-0.3.65.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+spacr-0.3.65.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
+spacr-0.3.65.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
+spacr-0.3.65.dist-info/RECORD,,
{spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/LICENSE
File without changes
{spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/WHEEL
File without changes
{spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/entry_points.txt
File without changes
{spacr-0.3.62.dist-info → spacr-0.3.65.dist-info}/top_level.txt
File without changes