spacr 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff shows the changes between package versions publicly released to a supported registry, as they appear in their respective public registries. It is provided for informational purposes only.
- spacr/io.py +3 -1
- spacr/ml.py +205 -0
- spacr/plot.py +48 -0
- spacr/settings.py +64 -0
- spacr/submodules.py +298 -1
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/METADATA +1 -1
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/RECORD +11 -11
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/LICENSE +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/WHEEL +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/top_level.txt +0 -0
spacr/io.py
CHANGED
@@ -2551,6 +2551,7 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
     png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
     if verbose:
         print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
+        print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
     merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
     merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)

@@ -2562,7 +2563,8 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
     metadata.set_index('prcfo', inplace=True)

     # Merge metadata with final merged DataFrame
-    merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
+    #merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
+    merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
     merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)

     if verbose:
spacr/ml.py
CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import numpy as np
 from scipy import stats
 from scipy.stats import shapiro
+from math import pi

 from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
 from sklearn.metrics import mean_squared_error
@@ -1515,3 +1516,207 @@ def _calculate_similarity(df, features, col_to_compare, val1, val2):

     return df

+def interperate_vision_model(settings={}):
+
+    from .io import _read_and_merge_data, _results_to_csv
+    from .settings import set_interperate_vision_model_defaults
+    from .utils import save_settings
+
+    settings = set_interperate_vision_model_defaults(settings)
+    save_settings(settings, name='interperate_vision_model', show=True)
+
+    # Function to create radar plot for individual and combined values
+    def create_extended_radar_plot(values, labels, title):
+        values = list(values) + [values[0]]  # Close the loop for radar chart
+        angles = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
+        angles += angles[:1]
+
+        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
+        ax.plot(angles, values, linewidth=2, linestyle='solid')
+        ax.fill(angles, values, alpha=0.25)
+
+        ax.set_xticks(angles[:-1])
+        ax.set_xticklabels(labels, fontsize=10, rotation=45, ha='right')
+        plt.title(title, pad=20)
+        plt.show()
+
+    def extract_compartment_channel(feature_name):
+        # Identify compartment as the first part before an underscore
+        compartment = feature_name.split('_')[0]
+
+        if compartment == 'cells':
+            compartment = 'cell'
+
+        # Identify channels based on substring presence
+        channels = []
+        if 'channel_0' in feature_name:
+            channels.append('channel_0')
+        if 'channel_1' in feature_name:
+            channels.append('channel_1')
+        if 'channel_2' in feature_name:
+            channels.append('channel_2')
+        if 'channel_3' in feature_name:
+            channels.append('channel_3')
+
+        # If multiple channels are found, join them with a '+'
+        if channels:
+            channel = ' + '.join(channels)
+        else:
+            channel = 'morphology'  # Use 'morphology' if no channel identifier is found
+
+        return (compartment, channel)
+
+    def read_and_preprocess_data(settings):
+
+        df, _ = _read_and_merge_data(
+            locs=[settings['src']+'/measurements/measurements.db'],
+            tables=settings['tables'],
+            verbose=True,
+            nuclei_limit=settings['nuclei_limit'],
+            pathogen_limit=settings['pathogen_limit']
+        )
+
+        scores_df = pd.read_csv(settings['scores'])
+
+        # Clean and align columns for merging
+        df['object_label'] = df['object_label'].str.replace('o', '')
+
+        if 'row_name' not in scores_df.columns:
+            scores_df['row_name'] = scores_df['row']
+
+        if 'column_name' not in scores_df.columns:
+            scores_df['column_name'] = scores_df['col']
+
+        if 'object_label' not in scores_df.columns:
+            scores_df['object_label'] = scores_df['object']
+
+        # Remove the 'o' prefix from 'object_label' in df, ensuring it is a string type
+        df['object_label'] = df['object_label'].str.replace('o', '').astype(str)
+
+        # Ensure 'object_label' in scores_df is also a string
+        scores_df['object_label'] = scores_df['object'].astype(str)
+
+        # Ensure all join columns have the same data type in both DataFrames
+        df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+        scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']] = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label']].astype(str)
+
+        # Select only the necessary columns from scores_df for merging
+        scores_df = scores_df[['plate', 'row_name', 'column_name', 'field', 'object_label', settings['score_column']]]
+
+        # Now merge DataFrames
+        merged_df = pd.merge(df, scores_df, on=['plate', 'row_name', 'column_name', 'field', 'object_label'], how='inner')
+
+        # Separate numerical features and the score column
+        X = merged_df.select_dtypes(include='number').drop(columns=[settings['score_column']])
+        y = merged_df[settings['score_column']]
+
+        return X, y, merged_df
+
+    X, y, merged_df = read_and_preprocess_data(settings)
+
+    # Step 1: Feature Importance using Random Forest
+    if settings['feature_importance'] or settings['permutation_importance']:
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X, y)
+
+        if settings['feature_importance']:
+            print(f"Feature Importance ...")
+            feature_importances = model.feature_importances_
+            feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
+            feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
+            top_feature_importance_df = feature_importance_df.head(settings['top_features'])
+
+            # Plot Feature Importance
+            plt.figure(figsize=(10, 6))
+            plt.barh(top_feature_importance_df['feature'], top_feature_importance_df['importance'])
+            plt.xlabel('Importance')
+            plt.title(f"Top {settings['top_features']} Features - Feature Importance")
+            plt.gca().invert_yaxis()
+            plt.show()
+
+            if settings['save']:
+                _results_to_csv(feature_importance_df, filename='feature_importance.csv')
+
+    # Step 2: Permutation Importance
+    if settings['permutation_importance']:
+        print(f"Permutation Importance ...")
+        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=settings['n_jobs'])
+        perm_importance_df = pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
+        perm_importance_df = perm_importance_df.sort_values(by='importance', ascending=False)
+        top_perm_importance_df = perm_importance_df.head(settings['top_features'])
+
+        # Plot Permutation Importance
+        plt.figure(figsize=(10, 6))
+        plt.barh(top_perm_importance_df['feature'], top_perm_importance_df['importance'])
+        plt.xlabel('Importance')
+        plt.title(f"Top {settings['top_features']} Features - Permutation Importance")
+        plt.gca().invert_yaxis()
+        plt.show()
+
+        if settings['save']:
+            _results_to_csv(perm_importance_df, filename='permutation_importance.csv')
+
+    # Step 3: SHAP Analysis
+    if settings['shap']:
+        print(f"SHAP Analysis ...")
+
+        # Select top N features based on Random Forest importance and fit the model on these features only
+        top_features = feature_importance_df.head(settings['top_features'])['feature']
+        X_top = X[top_features]
+
+        # Refit the model on this subset of features
+        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
+        model.fit(X_top, y)
+
+        # Sample a smaller subset of rows to speed up SHAP
+        if settings['shap_sample']:
+            sample = int(len(X_top) / 100)
+            X_sample = X_top.sample(min(sample, len(X_top)), random_state=42)
+        else:
+            X_sample = X_top
+
+        # Initialize SHAP explainer with the same subset of features
+        explainer = shap.Explainer(model.predict, X_sample)
+        shap_values = explainer(X_sample, max_evals=1500)
+
+        # Plot SHAP summary for the selected sample and top features
+        shap.summary_plot(shap_values, X_sample, max_display=settings['top_features'])
+
+        # Convert SHAP values to a DataFrame for easier manipulation
+        shap_df = pd.DataFrame(shap_values.values, columns=X_sample.columns)
+
+        # Apply the function to create MultiIndex columns with compartment and channel
+        shap_df.columns = pd.MultiIndex.from_tuples(
+            [extract_compartment_channel(feat) for feat in shap_df.columns],
+            names=['compartment', 'channel']
+        )
+
+        # Aggregate SHAP values by compartment and channel
+        compartment_mean = shap_df.abs().groupby(level='compartment', axis=1).mean().mean(axis=0)
+        channel_mean = shap_df.abs().groupby(level='channel', axis=1).mean().mean(axis=0)
+
+        # Calculate combined importance for each pair of compartments and channels
+        combined_compartment = {}
+        for i, comp1 in enumerate(compartment_mean.index):
+            for comp2 in compartment_mean.index[i+1:]:
+                combined_compartment[f"{comp1} + {comp2}"] = shap_df.loc[:, (comp1, slice(None))].abs().mean().mean() + \
+                                                             shap_df.loc[:, (comp2, slice(None))].abs().mean().mean()
+
+        combined_channel = {}
+        for i, chan1 in enumerate(channel_mean.index):
+            for chan2 in channel_mean.index[i+1:]:
+                combined_channel[f"{chan1} + {chan2}"] = shap_df.loc[:, (slice(None), chan1)].abs().mean().mean() + \
+                                                         shap_df.loc[:, (slice(None), chan2)].abs().mean().mean()
+
+        # Prepare values and labels for radar charts
+        all_compartment_importance = list(compartment_mean.values) + list(combined_compartment.values())
+        all_compartment_labels = list(compartment_mean.index) + list(combined_compartment.keys())
+
+        all_channel_importance = list(channel_mean.values) + list(combined_channel.values())
+        all_channel_labels = list(channel_mean.index) + list(combined_channel.keys())
+
+        # Create radar plots for compartments and channels
+        create_extended_radar_plot(all_compartment_importance, all_compartment_labels, "SHAP Importance by Compartment (Individual and Combined)")
+        create_extended_radar_plot(all_channel_importance, all_channel_labels, "SHAP Importance by Channel (Individual and Combined)")
+
+    return merged_df
spacr/plot.py
CHANGED
@@ -3688,3 +3688,51 @@ def overlay_masks_on_images(img_folder, normalize=True, resize=True, save=False,
     plt.axis('off')
     plt.show()

+def graph_importance(settings):
+
+    from .settings import set_graph_importance_defaults
+    from .utils import save_settings
+
+    if not isinstance(settings['csvs'], list):
+        settings['csvs'] = [settings['csvs']]
+
+    settings['src'] = os.path.dirname(settings['csvs'][0])
+
+    settings = set_graph_importance_defaults(settings)
+    save_settings(settings, name='graph_importance')
+
+    dfs = []
+    for path in settings['csvs']:
+        dft = pd.read_csv(path)
+        dfs.append(dft)
+
+    df = pd.concat(dfs)
+
+    if not all(col in df.columns for col in (settings['grouping_column'], settings['data_column'])):
+        print(f"grouping {settings['grouping_column']} and data {settings['data_column']} columns must be in {df.columns.to_list()}")
+        return
+
+    output_dir = os.path.dirname(settings['csvs'][0])
+
+    spacr_graph = spacrGraph(
+        df=df,
+        grouping_column=settings['grouping_column'],
+        data_column=settings['data_column'],
+        graph_type=settings['graph_type'],
+        graph_name=settings['grouping_column'],
+        summary_func='mean',
+        colors=None,
+        output_dir=output_dir,
+        save=settings['save'],
+        y_lim=None,
+        error_bar_type='std',
+        representation='object',
+        theme='muted',
+    )
+
+    # Create the plot
+    spacr_graph.create_plot()
+
+    # Get the figure object if needed
+    fig = spacr_graph.get_figure()
+    plt.show()
spacr/settings.py
CHANGED
@@ -1370,4 +1370,68 @@ def get_analyze_plaque_settings(settings):
     settings.setdefault('rescale', False)
     settings.setdefault('resample', False)
     settings.setdefault('fill_in', True)
+    return settings
+
+def set_graph_importance_defaults(settings):
+    settings.setdefault('csvs','list of paths')
+    settings.setdefault('grouping_column','compartment')
+    settings.setdefault('data_column','compartment_importance_sum')
+    settings.setdefault('graph_type','jitter_bar')
+    settings.setdefault('save',False)
+    return settings
+
+def set_interperate_vision_model_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('scores','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen','cytoplasm'])
+    settings.setdefault('feature_importance',True)
+    settings.setdefault('permutation_importance',False)
+    settings.setdefault('shap',True)
+    settings.setdefault('save',False)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('top_features',30)
+    settings.setdefault('shap_sample',True)
+    settings.setdefault('n_jobs',-1)
+    settings.setdefault('shap_approximate',True)
+    settings.setdefault('score_column','cv_predictions')
+    return settings
+
+def set_analyze_endodyogeny_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc', 'pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'], ['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('min_area_bin',500)
+    settings.setdefault('group_column','pathogen')
+    settings.setdefault('compartment','pathogen')
+    settings.setdefault('pathogen_limit',1)
+    settings.setdefault('nuclei_limit',10)
+    settings.setdefault('level','object')
+    settings.setdefault('um_per_px',0.1)
+    settings.setdefault('max_bins',None)
+    settings.setdefault('save',False)
+    settings.setdefault('verbose',False)
+    return settings
+
+def set_analyze_class_proportion_defaults(settings):
+    settings.setdefault('src','path')
+    settings.setdefault('tables',['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    settings.setdefault('cell_types',['Hela'])
+    settings.setdefault('cell_plate_metadata',None)
+    settings.setdefault('pathogen_types',['nc','pc'])
+    settings.setdefault('pathogen_plate_metadata',[['c1'],['c2']])
+    settings.setdefault('treatments',None)
+    settings.setdefault('treatment_plate_metadata',None)
+    settings.setdefault('group_column','condition')
+    settings.setdefault('class_column','test')
+    settings.setdefault('pathogen_limit',1000)
+    settings.setdefault('nuclei_limit',1000)
+    settings.setdefault('level','well')
+    settings.setdefault('save',False)
+    settings.setdefault('verbose', False)
     return settings
spacr/submodules.py
CHANGED
@@ -10,6 +10,7 @@ from IPython.display import display
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.inspection import permutation_importance
 from math import pi
+from scipy.stats import chi2_contingency

 import matplotlib.pyplot as plt
 from natsort import natsorted
@@ -844,4 +845,300 @@ def interperate_vision_model(settings={}):
     df.to_csv(save_path)
     print(f"Saved {save_path}")

-    return output
+    return output
+
+def analyze_endodyogeny(settings):
+
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_endodyogeny_defaults
+
+    def _calculate_volume_bins(df, compartment='pathogen', min_area_bin=500, max_bins=None, verbose=False):
+        area_column = f'{compartment}_area'
+        df[f'{compartment}_volume'] = df[area_column] ** 1.5
+        min_volume_bin = min_area_bin ** 1.5
+        max_volume = df[f'{compartment}_volume'].max()
+
+        # Generate bin edges as floats, and filter out any duplicate edges
+        bins = [min_volume_bin * (2 ** i) for i in range(int(np.ceil(np.log2(max_volume / min_volume_bin)) + 1))]
+        bins = sorted(set(bins))  # Ensure bin edges are unique
+
+        # Create bin labels as ranges with decimal precision for float values (e.g., "500.0-1000.0")
+        bin_labels = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]
+        if verbose:
+            print('Volume bins:', bins)
+            print('Volume bin labels:', bin_labels)
+
+        # Apply the bins to create a new column with the binned labels
+        df[f'{compartment}_volume_bin'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=bin_labels, right=False)
+
+        # Create a bin index column (numeric version of bins)
+        df['bin_index'] = pd.cut(df[f'{compartment}_volume'], bins=bins, labels=range(1, len(bins)), right=False).astype(int)
+
+        # Adjust bin indices and labels based on max_bins
+        if max_bins is not None:
+            df.loc[df['bin_index'] > max_bins, 'bin_index'] = max_bins
+
+            # Update bin labels to reflect capped bins
+            bin_labels = bin_labels[:max_bins - 1] + [f">{bins[max_bins - 1]:.2f}"]
+            df[f'{compartment}_volume_bin'] = df['bin_index'].map(
+                {i + 1: label for i, label in enumerate(bin_labels)}
+            )
+
+        if verbose:
+            print(df[[f'{compartment}_volume', f'{compartment}_volume_bin', 'bin_index']].head())
+
+        return df
+
+    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+        # Always calculate chi-squared on raw data
+        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+        chi2, p, dof, expected = chi2_contingency(raw_counts)
+        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+        print(f"p-value (raw data): {p:.4e}")
+
+        # Extract bin labels and indices for formatting the legend in the correct order
+        bin_labels = df[bin_column].cat.categories if pd.api.types.is_categorical_dtype(df[bin_column]) else sorted(df[bin_column].unique())
+        bin_indices = range(1, len(bin_labels) + 1)
+        legend_labels = [f"{index}: {label}" for index, label in zip(bin_indices, bin_labels)]
+
+        # Plot based on level setting
+        if level == 'well':
+            # Aggregate by well for mean ± SD visualization
+            well_proportions = (
+                df.groupby([group_column, prc_column, bin_column])
+                .size()
+                .groupby(level=[0, 1])
+                .apply(lambda x: x / x.sum())
+                .unstack(fill_value=0)
+            )
+            mean_proportions = well_proportions.groupby(group_column).mean()
+            std_proportions = well_proportions.groupby(group_column).std()
+
+            ax = mean_proportions.plot(
+                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+            )
+            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+        else:
+            # Object-level plotting without aggregation
+            group_counts = df.groupby([group_column, bin_column]).size()
+            group_totals = group_counts.groupby(level=0).sum()
+            proportions = group_counts / group_totals
+            proportion_df = proportions.unstack(fill_value=0)
+
+            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+            plt.title('Proportion of Volume Bins by Group')
+
+        plt.xlabel('Group')
+        plt.ylabel('Proportion')
+
+        # Update legend with formatted labels, maintaining correct order
+        volume_unit = "px³" if settings['um_per_px'] is None else "µm³"
+        plt.legend(legend_labels, title=f'Volume Range ({volume_unit})', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.ylim(0, 1)
+        fig = plt.gcf()
+        return chi2, p, dof, expected, raw_counts, fig
+
+    settings = set_analyze_endodyogeny_defaults(settings)
+    save_settings(settings, name='analyze_endodyogeny', show=True)
+    output = {}
+
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+
+    if not settings['um_per_px'] is None:
+        df[f"{settings['compartment']}_area"] = df[f"{settings['compartment']}_area"] * (settings['um_per_px'] ** 2)
+        settings['min_area_bin'] = settings['min_area_bin'] * (settings['um_per_px'] ** 2)
+
+    df = df[df[f"{settings['compartment']}_area"] >= settings['min_area_bin']]
+
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+
+    df = df.dropna(subset=[settings['group_column']])
+    df = _calculate_volume_bins(df, settings['compartment'], settings['min_area_bin'], settings['max_bins'], settings['verbose'])
+    output['data'] = df
+
+    # Perform chi-squared test and plot
+    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(
+        settings, df, settings['group_column'], bin_column=f"{settings['compartment']}_volume_bin", level=settings['level']
+    )
+
+    # Create a DataFrame with chi-squared test results and raw counts
+    results_df = pd.DataFrame({
+        'chi_squared_stat': [chi2],
+        'p_value': [p],
+        'degrees_of_freedom': [dof]
+    })
+
+    # Flatten and add expected counts to results_df
+    expected_df = pd.DataFrame(expected, index=raw_counts.index, columns=raw_counts.columns)
+    expected_flat = expected_df.stack().reset_index()
+    expected_flat.columns = [settings['group_column'], f"{settings['compartment']}_volume_bin", 'expected_count']
+    results_df = results_df.merge(expected_flat, how="cross")
+    output['chi_squared'] = results_df
+
+    if settings['save']:
+        # Save DataFrame to CSV
+        output_dir = os.path.join(settings['src'][0], 'results')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, 'chi_squared_results.csv')
+        output_path_fig = os.path.join(output_dir, 'chi_squared_results.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path, index=False)
+        print(f"Chi-squared results saved to {output_path}")
+
+    plt.show()
+
+    return output
+
+def analyze_class_proportion(settings):
+
+    from .utils import annotate_conditions, save_settings
+    from .io import _read_and_merge_data
+    from .settings import set_analyze_class_proportion_defaults
+    from .plot import plot_plates
+
+
+    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
+        # Always calculate chi-squared on raw data
+        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
+        chi2, p, dof, expected = chi2_contingency(raw_counts)
+        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
+        print(f"p-value (raw data): {p:.4e}")
+
+        # Plot based on level setting
+        if level == 'well':
+            # Aggregate by well for mean ± SD visualization
+            well_proportions = (
+                df.groupby([group_column, prc_column, bin_column])
+                .size()
+                .groupby(level=[0, 1])
+                .apply(lambda x: x / x.sum())
+                .unstack(fill_value=0)
+            )
+            mean_proportions = well_proportions.groupby(group_column).mean()
+            std_proportions = well_proportions.groupby(group_column).std()
+
+            ax = mean_proportions.plot(
+                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
+            )
+            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
+        else:
+            # Object-level plotting without aggregation
+            group_counts = df.groupby([group_column, bin_column]).size()
+            group_totals = group_counts.groupby(level=0).sum()
+            proportions = group_counts / group_totals
+            proportion_df = proportions.unstack(fill_value=0)
+
+            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
+            plt.title('Proportion of Volume Bins by Group')
+
+        plt.xlabel('Group')
+        plt.ylabel('Proportion')
+
+        # Update legend with formatted labels, maintaining correct order
+        plt.legend(title=f'Classes', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.ylim(0, 1)
+        fig = plt.gcf()
+        return chi2, p, dof, expected, raw_counts, fig
+
+    settings = set_analyze_class_proportion_defaults(settings)
+    save_settings(settings, name='analyze_class_proportion', show=True)
+    output = {}
+
+    # Process data
+    if not isinstance(settings['src'], list):
+        settings['src'] = [settings['src']]
+
+    locs = []
+    for s in settings['src']:
+        loc = os.path.join(s, 'measurements/measurements.db')
+        locs.append(loc)
+
+    if 'png_list' not in settings['tables']:
+        settings['tables'] = settings['tables'] + ['png_list']
+
+    df, _ = _read_and_merge_data(
+        locs,
+        tables=settings['tables'],
+        verbose=settings['verbose'],
+        nuclei_limit=settings['nuclei_limit'],
+        pathogen_limit=settings['pathogen_limit']
+    )
+
+    df = annotate_conditions(
+        df=df,
+        cells=settings['cell_types'],
+        cell_loc=settings['cell_plate_metadata'],
+        pathogens=settings['pathogen_types'],
+        pathogen_loc=settings['pathogen_plate_metadata'],
+        treatments=settings['treatments'],
+        treatment_loc=settings['treatment_plate_metadata']
+    )
+
+    if settings['group_column'] not in df.columns:
+        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
+        for col in df.columns:
+            print(col)
+
+    df[settings['class_column']] = df[settings['class_column']].fillna(0)
+    output['data'] = df
+
+    # Perform chi-squared test and plot
+    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(settings, df, settings['group_column'], bin_column=settings['class_column'], level=settings['level'])
+
+    # Create a DataFrame with chi-squared test results and raw counts
+    results_df = pd.DataFrame({
+        'chi_squared_stat': [chi2],
+        'p_value': [p],
+        'degrees_of_freedom': [dof]
+    })
+
+    output['chi_squared'] = results_df
+
+    if settings['save']:
+        output_dir = os.path.join(settings['src'][0], 'results')
+        os.makedirs(output_dir, exist_ok=True)
+        output_path_chi = os.path.join(output_dir, 'class_chi_squared_results.csv')
+        output_path_data = os.path.join(output_dir, 'class_chi_squared_data.csv')
+        output_path_fig = os.path.join(output_dir, 'class_chi_squared.pdf')
+        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
+        results_df.to_csv(output_path_chi, index=False)
+        df.to_csv(output_path_data, index=False)
+        print(f"Chi-squared results saved to {output_path_chi}")
+        print(f"Annotated data saved to {output_path_data}")
+
+    plt.show()
+
+    fig2 = plot_plates(df, variable=settings['class_column'], grouping='mean', min_max='allq', cmap='viridis', min_count=0, verbose=True, dst=None)
+    if settings['save']:
+        output_path_fig2 = os.path.join(output_dir, 'class_heatmap.pdf')
+        fig2.savefig(output_path_fig2, dpi=300, bbox_inches='tight')
+
+    plt.show()
+    return output
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/RECORD
CHANGED
@@ -15,17 +15,17 @@ spacr/gui.py,sha256=ARyn9Q_g8HoP-cXh1nzMLVFCKqthY4v2u9yORyaQqQE,8230
 spacr/gui_core.py,sha256=N7R7yvfK_dJhOReM_kW3Ci8Bokhi1OzsxeKqvSGdvV4,41460
 spacr/gui_elements.py,sha256=EKlvEg_4_je7jciEdR3NTgPrcTraowa2e2RUt-xqd6M,138254
 spacr/gui_utils.py,sha256=u9RoIOWpAXFEOnUlLpMQZrc1pWSg6omZsJMIhJdRv_g,41211
-spacr/io.py,sha256=
+spacr/io.py,sha256=YlJAT6H8l4ipunMyKzjqoPcf-1AXgUmSyR1YN9WxmDI,142857
 spacr/logger.py,sha256=lJhTqt-_wfAunCPl93xE65Wr9Y1oIHJWaZMjunHUeIw,1538
 spacr/measure.py,sha256=2lK-ZcTxLM-MpXV1oZnucRD9iz5aprwahRKw9IEqshg,55085
 spacr/mediar.py,sha256=FwLvbLQW5LQzPgvJZG8Lw7GniA2vbZx6Jv6vIKu7I5c,14743
-spacr/ml.py,sha256=
+spacr/ml.py,sha256=GOQJH8jdTrJQwiLlDrcc9-yCxLFaMx4YD4OJs0-R5YI,77947
 spacr/openai.py,sha256=5vBZ3Jl2llYcW3oaTEXgdyCB2aJujMUIO5K038z7w_A,1246
-spacr/plot.py,sha256=
+spacr/plot.py,sha256=0fne2Msy6niN80oiuwt9ZYw1QwXVnghaUmrwvEZN9-8,161992
 spacr/sequencing.py,sha256=ClUfwPPK6rNUbUuiEkzcwakzVyDKKUMv9ricrxT8qQY,25227
-spacr/settings.py,sha256=
+spacr/settings.py,sha256=LSoDNuz1m7rySh7MWXEL1xlUU4rFiCRVlGvZCSCOqzU,80085
 spacr/sim.py,sha256=1xKhXimNU3ukzIw-3l9cF3Znc_brW8h20yv8fSTzvss,71173
-spacr/submodules.py,sha256=
+spacr/submodules.py,sha256=X1OI0Dsc1qU4lqKFdF2EnloNkLkDzA1hDn7CYbkBmFc,55473
 spacr/timelapse.py,sha256=KGfG4L4-QnFfgbF7L6C5wL_3gd_rqr05Foje6RsoTBg,39603
 spacr/toxo.py,sha256=z2nT5aAze3NUIlwnBQcnkARihDwoPfqOgQIVoUluyK0,25087
 spacr/utils.py,sha256=vvciLh1gH0nsrCWQw3taUcDjxP59wme3gqrejeNO05w,222943
@@ -151,9 +151,9 @@ spacr/resources/icons/umap.png,sha256=dOLF3DeLYy9k0nkUybiZMe1wzHQwLJFRmgccppw-8b
 spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif,sha256=Tl0ZUfZ_AYAbu0up_nO0tPRtF1BxXhWQ3T3pURBCCRo,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif,sha256=m8N-V71rA1TT4dFlENNg8s0Q0YEXXs8slIn7yObmZJQ,7958528
 spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif,sha256=Pbhk7xn-KUP6RSIhJsxQcrHFImBm3GEpLkzx7WOc-5M,7958528
-spacr-0.3.62.dist-info/LICENSE,sha256=
-spacr-0.3.62.dist-info/METADATA,sha256=
-spacr-0.3.62.dist-info/WHEEL,sha256=
-spacr-0.3.62.dist-info/entry_points.txt,sha256=
-spacr-0.3.62.dist-info/top_level.txt,sha256=
-spacr-0.3.62.dist-info/RECORD,,
+spacr-0.3.64.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
+spacr-0.3.64.dist-info/METADATA,sha256=_07fLYI8eMAYJzOEcAVOemN4TFJAuzAvUrdX1T136T0,6032
+spacr-0.3.64.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+spacr-0.3.64.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
+spacr-0.3.64.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
+spacr-0.3.64.dist-info/RECORD,,
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/LICENSE: file without changes
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/WHEEL: file without changes
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/entry_points.txt: file without changes
{spacr-0.3.62.dist-info → spacr-0.3.64.dist-info}/top_level.txt: file without changes