spacr 0.3.61__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/io.py +133 -3
- spacr/ml.py +205 -0
- spacr/plot.py +48 -0
- spacr/settings.py +64 -0
- spacr/submodules.py +298 -1
- spacr/utils.py +58 -2
- {spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/METADATA +1 -1
- {spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/RECORD +12 -12
- {spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/LICENSE +0 -0
- {spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/WHEEL +0 -0
- {spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.61.dist-info → spacr-0.3.64.dist-info}/top_level.txt +0 -0
spacr/io.py
CHANGED
@@ -1777,7 +1777,7 @@ def _read_and_join_tables(db_path, table_names=['cell', 'cytoplasm', 'nucleus',
|
|
1777
1777
|
png_list_df['cell_id'] = png_list_df['cell_id'].str[1:].astype(int)
|
1778
1778
|
png_list_df.rename(columns={'cell_id': 'object_label'}, inplace=True)
|
1779
1779
|
if 'cell' in dataframes:
|
1780
|
-
join_cols = ['object_label', 'plate', 'row_name', 'column_name']
|
1780
|
+
join_cols = ['object_label', 'plate', 'row_name', 'column_name','field']
|
1781
1781
|
dataframes['cell'] = pd.merge(dataframes['cell'], png_list_df, on=join_cols, how='left')
|
1782
1782
|
else:
|
1783
1783
|
print("Cell table not found in database tables.")
|
@@ -2276,7 +2276,7 @@ def _read_db(db_loc, tables):
|
|
2276
2276
|
conn.close() # Close the connection
|
2277
2277
|
return dfs
|
2278
2278
|
|
2279
|
-
def
|
2279
|
+
def _read_and_merge_data_v1(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False):
|
2280
2280
|
|
2281
2281
|
from .utils import _split_data
|
2282
2282
|
|
@@ -2443,7 +2443,137 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathog
|
|
2443
2443
|
if 'pathogen' in tables:
|
2444
2444
|
obj_df_ls.append(pathogens)
|
2445
2445
|
|
2446
|
-
return merged_df, obj_df_ls
|
2446
|
+
return merged_df, obj_df_ls
|
2447
|
+
|
2448
|
+
def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10):
    """Read object tables from several databases and merge them per cell.

    Each table in ``tables`` is read from every location in ``locs`` with
    ``_read_db`` and concatenated row-wise. The object tables are then keyed
    by a ``prcfo`` string (``prcf`` + ``'_'`` + object/cell label), reduced
    with ``_split_data`` and joined on that index. The first available table
    among cell, cytoplasm, nucleus and pathogen supplies the metadata frame.

    Args:
        locs: Iterable of database locations, each passed to ``_read_db``.
        tables: Table names to read. 'cell', 'cytoplasm', 'nucleus',
            'pathogen' and 'png_list' receive dedicated handling.
        verbose: Print row counts while processing.
        nuclei_limit: ``False`` keeps only cells with a single nucleus,
            ``True`` keeps all, a number keeps cells with at most that many.
        pathogen_limit: Same semantics as ``nuclei_limit`` for pathogens.

    Returns:
        tuple: ``(merged_df, obj_df_ls)`` where ``merged_df`` is the metadata
        joined with the merged measurements (indexed by ``prcfo``) and
        ``obj_df_ls`` lists the concatenated cell, cytoplasm, nucleus and
        pathogen tables that were requested, in that order.

    Raises:
        ValueError: If a handled table yielded no data, or if none of the
            cell, cytoplasm, nucleus or pathogen tables was requested.
    """
    from .io import _read_db
    from .utils import _split_data

    def _prefixed(labels):
        # Labels are stored numerically; the merged index uses 'o<int>'.
        return 'o' + labels.astype(int).astype(str)

    def _apply_count_limit(df, count_column, limit):
        # bool must be tested before int: True/False are ints in Python and
        # would otherwise be treated as the numeric limits 1/0.
        if isinstance(limit, bool):
            return df if limit else df[df[count_column] <= 1]
        if isinstance(limit, (int, float)):
            return df[df[count_column] <= int(limit)]
        return df

    # Collect the per-database frames for every requested table.
    data_dict = {table: [] for table in tables}
    for loc in locs:
        for table, df in zip(tables, _read_db(loc, tables)):
            data_dict[table].append(df)

    # Concatenate rows across locations for each table.
    for table, dfs in data_dict.items():
        if dfs:
            data_dict[table] = pd.concat(dfs, axis=0)
        if verbose:
            print(f"{table}: {len(data_dict[table])}")

    def _get_table(name):
        table_df = data_dict[name]
        if isinstance(table_df, list):
            # Nothing was read for this table (e.g. ``locs`` was empty).
            raise ValueError(f"No data was read for table '{name}'.")
        return table_df.copy()

    merged_df = pd.DataFrame()
    metadata = None

    def _merge_objects(obj_df, label_column, name):
        # The first table processed seeds merged_df and the metadata; later
        # tables are inner-joined on the shared prcfo index.
        nonlocal merged_df, metadata
        grouped, meta = _split_data(obj_df, 'prcfo', label_column)
        if metadata is None:
            merged_df, metadata = grouped, meta
        else:
            merged_df = merged_df.merge(grouped, left_index=True, right_index=True)
        if verbose:
            print(f'{name}: {len(obj_df)}, {name} grouped: {len(grouped)}')

    if 'cell' in data_dict:
        cells = _get_table('cell')
        cells = cells.assign(object_label=_prefixed(cells['object_label']))
        cells = cells.assign(prcfo=cells['prcf'] + '_' + cells['object_label'])
        _merge_objects(cells, 'object_label', 'cells')

    if 'cytoplasm' in data_dict:
        cytoplasms = _get_table('cytoplasm')
        cytoplasms = cytoplasms.assign(object_label=_prefixed(cytoplasms['object_label']))
        cytoplasms = cytoplasms.assign(prcfo=cytoplasms['prcf'] + '_' + cytoplasms['object_label'])
        _merge_objects(cytoplasms, 'object_label', 'cytoplasms')

    if 'nucleus' in data_dict:
        nucleus = _get_table('nucleus').dropna(subset=['cell_id'])
        nucleus = nucleus.assign(
            object_label=_prefixed(nucleus['object_label']),
            cell_id=_prefixed(nucleus['cell_id']),
        )
        # Nuclei are keyed by the cell they belong to, not by their own label.
        nucleus = nucleus.assign(prcfo=nucleus['prcf'] + '_' + nucleus['cell_id'])
        nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
        nucleus = _apply_count_limit(nucleus, 'nucleus_prcfo_count', nuclei_limit)
        _merge_objects(nucleus, 'cell_id', 'nucleus')

    if 'pathogen' in data_dict:
        pathogens = _get_table('pathogen').dropna(subset=['cell_id'])
        pathogens = pathogens.assign(
            object_label=_prefixed(pathogens['object_label']),
            cell_id=_prefixed(pathogens['cell_id']),
        )
        pathogens = pathogens.assign(prcfo=pathogens['prcf'] + '_' + pathogens['cell_id'])
        pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
        pathogens = _apply_count_limit(pathogens, 'pathogen_prcfo_count', pathogen_limit)
        _merge_objects(pathogens, 'cell_id', 'pathogens')

    if metadata is None:
        raise ValueError(
            "tables must include at least one of 'cell', 'cytoplasm', 'nucleus' or 'pathogen'."
        )

    if 'png_list' in data_dict:
        png_list = _get_table('png_list')
        png_list_g_df_numeric, png_list_g_df_non_numeric = _split_data(png_list, 'prcfo', 'cell_id')
        png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
        if verbose:
            print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
            print(f"Added png_list columns: {png_list_g_df_numeric.columns}, {png_list_g_df_non_numeric.columns}")
        merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
        merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)

    # Add prc (plate row column) and prcfo (plate row column field object) columns.
    metadata = metadata.assign(prc=metadata['plate'] + '_' + metadata['row_name'] + '_' + metadata['column_name'])
    cells_well = metadata.groupby('prc')['object_label'].nunique().reset_index(name='cells_per_well')
    metadata = metadata.merge(cells_well, on='prc')
    metadata = metadata.assign(
        prcfo=metadata['plate'] + '_' + metadata['row_name'] + '_' + metadata['column_name']
        + '_' + metadata['field'] + '_' + metadata['object_label']
    )
    metadata.set_index('prcfo', inplace=True)

    # Merge metadata with the final merged DataFrame.
    merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
    merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)

    if verbose:
        print(f'Generated dataframe with: {len(merged_df.columns)} columns and {len(merged_df)} rows')

    # Raw (concatenated, unfiltered) object tables for the caller.
    obj_df_ls = [data_dict[table] for table in ['cell', 'cytoplasm', 'nucleus', 'pathogen'] if table in data_dict]

    return merged_df, obj_df_ls
|
2447
2577
|
|
2448
2578
|
def _read_mask(mask_path):
|
2449
2579
|
mask = imageio2.imread(mask_path)
|
spacr/ml.py
CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
|
|
3
3
|
import numpy as np
|
4
4
|
from scipy import stats
|
5
5
|
from scipy.stats import shapiro
|
6
|
+
from math import pi
|
6
7
|
|
7
8
|
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
|
8
9
|
from sklearn.metrics import mean_squared_error
|
@@ -1515,3 +1516,207 @@ def _calculate_similarity(df, features, col_to_compare, val1, val2):
|
|
1515
1516
|
|
1516
1517
|
return df
|
1517
1518
|
|
1519
|
+
def interperate_vision_model(settings={}):
    """Relate per-object measurements to vision-model scores.

    Measurements are read from ``<src>/measurements/measurements.db`` and
    inner-joined with the scores CSV on plate, row, column, field and object
    label. A random forest is fitted on the numeric features and, depending
    on the settings, feature importance, permutation importance and SHAP
    summaries (including radar plots per compartment and channel) are shown.

    Args:
        settings: Dict of options; missing keys are filled in by
            ``set_interperate_vision_model_defaults``. Keys read here:
            'src', 'scores', 'tables', 'nuclei_limit', 'pathogen_limit',
            'score_column', 'feature_importance', 'permutation_importance',
            'shap', 'shap_sample', 'top_features', 'n_jobs', 'save'.

    Returns:
        pandas.DataFrame: The measurements merged with the score column.
    """
    from itertools import combinations

    from .io import _read_and_merge_data, _results_to_csv
    from .settings import set_interperate_vision_model_defaults
    from .utils import save_settings

    settings = set_interperate_vision_model_defaults(settings)
    save_settings(settings, name='interperate_vision_model', show=True)

    def create_extended_radar_plot(values, labels, title):
        # Radar plot for individual and combined values.
        values = list(values) + [values[0]]  # Close the loop for radar chart
        angles = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
        angles += angles[:1]

        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
        ax.plot(angles, values, linewidth=2, linestyle='solid')
        ax.fill(angles, values, alpha=0.25)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels, fontsize=10, rotation=45, ha='right')
        plt.title(title, pad=20)
        plt.show()

    def extract_compartment_channel(feature_name):
        # The compartment is the part of the name before the first underscore.
        compartment = feature_name.split('_')[0]
        if compartment == 'cells':
            compartment = 'cell'

        # Channels are identified by substring presence.
        channels = [f'channel_{i}' for i in range(4) if f'channel_{i}' in feature_name]

        # Features without a channel identifier are treated as morphology.
        channel = ' + '.join(channels) if channels else 'morphology'
        return (compartment, channel)

    def read_and_preprocess_data(settings):
        join_cols = ['plate', 'row_name', 'column_name', 'field', 'object_label']
        score_column = settings['score_column']

        df, _ = _read_and_merge_data(
            locs=[settings['src']+'/measurements/measurements.db'],
            tables=settings['tables'],
            verbose=True,
            nuclei_limit=settings['nuclei_limit'],
            pathogen_limit=settings['pathogen_limit']
        )

        scores_df = pd.read_csv(settings['scores'])

        # Remove the 'o' prefix from 'object_label' so it matches the scores.
        df['object_label'] = df['object_label'].str.replace('o', '', regex=False)

        # Accept the short column names used by some score files.
        if 'row_name' not in scores_df.columns:
            scores_df['row_name'] = scores_df['row']
        if 'column_name' not in scores_df.columns:
            scores_df['column_name'] = scores_df['col']
        if 'object_label' not in scores_df.columns:
            scores_df['object_label'] = scores_df['object']

        # Ensure all join columns have the same data type in both DataFrames.
        df[join_cols] = df[join_cols].astype(str)
        scores_df[join_cols] = scores_df[join_cols].astype(str)

        # Select only the necessary columns from scores_df for merging.
        scores_df = scores_df[join_cols + [score_column]]
        merged_df = pd.merge(df, scores_df, on=join_cols, how='inner')

        # Separate numerical features and the score column. The score column
        # is absent from the numeric frame when it holds non-numeric labels.
        X = merged_df.select_dtypes(include='number').drop(columns=[score_column], errors='ignore')
        y = merged_df[score_column]

        return X, y, merged_df

    X, y, merged_df = read_and_preprocess_data(settings)

    # The random-forest ranking is needed for the feature-importance plot and
    # for selecting the SHAP features; the fitted model is also needed for
    # permutation importance.
    rank_features = settings['feature_importance'] or settings['shap']
    if rank_features or settings['permutation_importance']:
        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
        model.fit(X, y)

    if rank_features:
        feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
        feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

    # Step 1: Feature Importance using Random Forest
    if settings['feature_importance']:
        print(f"Feature Importance ...")
        top_feature_importance_df = feature_importance_df.head(settings['top_features'])

        plt.figure(figsize=(10, 6))
        plt.barh(top_feature_importance_df['feature'], top_feature_importance_df['importance'])
        plt.xlabel('Importance')
        plt.title(f"Top {settings['top_features']} Features - Feature Importance")
        plt.gca().invert_yaxis()
        plt.show()

        if settings['save']:
            _results_to_csv(feature_importance_df, filename='feature_importance.csv')

    # Step 2: Permutation Importance
    if settings['permutation_importance']:
        print(f"Permutation Importance ...")
        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=settings['n_jobs'])
        perm_importance_df = pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
        perm_importance_df = perm_importance_df.sort_values(by='importance', ascending=False)
        top_perm_importance_df = perm_importance_df.head(settings['top_features'])

        plt.figure(figsize=(10, 6))
        plt.barh(top_perm_importance_df['feature'], top_perm_importance_df['importance'])
        plt.xlabel('Importance')
        plt.title(f"Top {settings['top_features']} Features - Permutation Importance")
        plt.gca().invert_yaxis()
        plt.show()

        if settings['save']:
            _results_to_csv(perm_importance_df, filename='permutation_importance.csv')

    # Step 3: SHAP Analysis
    if settings['shap']:
        print(f"SHAP Analysis ...")

        # Refit the model on the top N features ranked by the random forest.
        top_features = feature_importance_df.head(settings['top_features'])['feature']
        X_top = X[top_features]
        model = RandomForestClassifier(random_state=42, n_jobs=settings['n_jobs'])
        model.fit(X_top, y)

        # Sample roughly 1% of the rows to speed up SHAP, but never zero rows.
        if settings['shap_sample']:
            sample = max(1, len(X_top) // 100)
            X_sample = X_top.sample(min(sample, len(X_top)), random_state=42)
        else:
            X_sample = X_top

        explainer = shap.Explainer(model.predict, X_sample)
        shap_values = explainer(X_sample, max_evals=1500)

        shap.summary_plot(shap_values, X_sample, max_display=settings['top_features'])

        # SHAP values as a frame with (compartment, channel) column labels.
        shap_df = pd.DataFrame(shap_values.values, columns=X_sample.columns)
        shap_df.columns = pd.MultiIndex.from_tuples(
            [extract_compartment_channel(feat) for feat in shap_df.columns],
            names=['compartment', 'channel']
        )

        # Mean absolute SHAP value per compartment and per channel. Grouping
        # is done on the transposed frame because column-wise groupby
        # (axis=1) is deprecated in pandas.
        abs_shap_t = shap_df.abs().T
        compartment_mean = abs_shap_t.groupby(level='compartment').mean().mean(axis=1)
        channel_mean = abs_shap_t.groupby(level='channel').mean().mean(axis=1)

        # Combined importance of each pair is the sum of the two individual means.
        combined_compartment = {
            f"{comp1} + {comp2}": compartment_mean[comp1] + compartment_mean[comp2]
            for comp1, comp2 in combinations(compartment_mean.index, 2)
        }
        combined_channel = {
            f"{chan1} + {chan2}": channel_mean[chan1] + channel_mean[chan2]
            for chan1, chan2 in combinations(channel_mean.index, 2)
        }

        # Prepare values and labels for radar charts.
        all_compartment_importance = list(compartment_mean.values) + list(combined_compartment.values())
        all_compartment_labels = list(compartment_mean.index) + list(combined_compartment.keys())

        all_channel_importance = list(channel_mean.values) + list(combined_channel.values())
        all_channel_labels = list(channel_mean.index) + list(combined_channel.keys())

        create_extended_radar_plot(all_compartment_importance, all_compartment_labels, "SHAP Importance by Compartment (Individual and Combined)")
        create_extended_radar_plot(all_channel_importance, all_channel_labels, "SHAP Importance by Channel (Individual and Combined)")

    return merged_df
|
spacr/plot.py
CHANGED
@@ -3688,3 +3688,51 @@ def overlay_masks_on_images(img_folder, normalize=True, resize=True, save=False,
|
|
3688
3688
|
plt.axis('off')
|
3689
3689
|
plt.show()
|
3690
3690
|
|
3691
|
+
def graph_importance(settings):
    """Plot a summary graph from one or more importance CSV files.

    The CSV files listed in ``settings['csvs']`` are concatenated and passed
    to ``spacrGraph``; output goes to the directory of the first CSV.

    Args:
        settings: Dict with at least 'csvs' (a path or a list of paths).
            'grouping_column', 'data_column', 'graph_type' and 'save' are
            filled in by ``set_graph_importance_defaults`` when missing.

    Returns:
        None. The function prints a message and returns early when the
        grouping or data column is not present in the combined data.

    Raises:
        ValueError: If ``settings['csvs']`` is empty.
    """
    from .settings import set_graph_importance_defaults
    from .utils import save_settings

    # Accept a single path as well as a collection of paths. A bare string
    # must be wrapped, otherwise it would be indexed and iterated per character.
    csvs = settings['csvs']
    if isinstance(csvs, (str, os.PathLike)):
        csvs = [csvs]
    else:
        csvs = list(csvs)
    if not csvs:
        raise ValueError("settings['csvs'] must contain at least one CSV path.")
    settings['csvs'] = csvs

    settings['src'] = os.path.dirname(csvs[0])

    settings = set_graph_importance_defaults(settings)
    save_settings(settings, name='graph_importance')

    df = pd.concat([pd.read_csv(path) for path in settings['csvs']])

    if not all(col in df.columns for col in (settings['grouping_column'], settings['data_column'])):
        print(f"grouping {settings['grouping_column']} and data {settings['data_column']} columns must be in {df.columns.to_list()}")
        return

    output_dir = os.path.dirname(settings['csvs'][0])

    spacr_graph = spacrGraph(
        df=df,
        grouping_column=settings['grouping_column'],
        data_column=settings['data_column'],
        graph_type=settings['graph_type'],
        graph_name=settings['grouping_column'],
        summary_func='mean',
        colors=None,
        output_dir=output_dir,
        save=settings['save'],
        y_lim=None,
        error_bar_type='std',
        representation='object',
        theme='muted',
    )

    # Create the plot
    spacr_graph.create_plot()

    # NOTE(review): the returned figure is not used here; the call is kept in
    # case get_figure() has side effects - confirm against spacrGraph.
    spacr_graph.get_figure()
    plt.show()
|
spacr/settings.py
CHANGED
@@ -1370,4 +1370,68 @@ def get_analyze_plaque_settings(settings):
|
|
1370
1370
|
settings.setdefault('rescale', False)
|
1371
1371
|
settings.setdefault('resample', False)
|
1372
1372
|
settings.setdefault('fill_in', True)
|
1373
|
+
return settings
|
1374
|
+
|
1375
|
+
def set_graph_importance_defaults(settings):
    """Fill in missing ``graph_importance`` options and return the same dict.

    Keys already present in ``settings`` are left untouched.
    """
    defaults = (
        ('csvs', 'list of paths'),
        ('grouping_column', 'compartment'),
        ('data_column', 'compartment_importance_sum'),
        ('graph_type', 'jitter_bar'),
        ('save', False),
    )
    for key, value in defaults:
        settings.setdefault(key, value)
    return settings
|
1382
|
+
|
1383
|
+
def set_interperate_vision_model_defaults(settings):
    """Fill in missing ``interperate_vision_model`` options in place.

    Existing keys keep their values; the (mutated) input dict is returned.
    """
    # Built on every call so the list default is never shared between dicts.
    defaults = (
        ('src', 'path'),
        ('scores', 'path'),
        ('tables', ['cell', 'nucleus', 'pathogen', 'cytoplasm']),
        ('feature_importance', True),
        ('permutation_importance', False),
        ('shap', True),
        ('save', False),
        ('nuclei_limit', 1000),
        ('pathogen_limit', 1000),
        ('top_features', 30),
        ('shap_sample', True),
        ('n_jobs', -1),
        ('shap_approximate', True),
        ('score_column', 'cv_predictions'),
    )
    for key, value in defaults:
        settings.setdefault(key, value)
    return settings
|
1399
|
+
|
1400
|
+
def set_analyze_endodyogeny_defaults(settings):
    """Fill in missing ``analyze_endodyogeny`` options in place.

    Existing keys keep their values; the (mutated) input dict is returned.
    """
    # Built on every call so the list defaults are never shared between dicts.
    defaults = (
        ('src', 'path'),
        ('tables', ['cell', 'nucleus', 'pathogen', 'cytoplasm']),
        ('cell_types', ['Hela']),
        ('cell_plate_metadata', None),
        ('pathogen_types', ['nc', 'pc']),
        ('pathogen_plate_metadata', [['c1'], ['c2']]),
        ('treatments', None),
        ('treatment_plate_metadata', None),
        ('min_area_bin', 500),
        ('group_column', 'pathogen'),
        ('compartment', 'pathogen'),
        ('pathogen_limit', 1),
        ('nuclei_limit', 10),
        ('level', 'object'),
        ('um_per_px', 0.1),
        ('max_bins', None),
        ('save', False),
        ('verbose', False),
    )
    for key, value in defaults:
        settings.setdefault(key, value)
    return settings
|
1420
|
+
|
1421
|
+
def set_analyze_class_proportion_defaults(settings):
    """Fill in missing ``analyze_class_proportion`` options in place.

    Existing keys keep their values; the (mutated) input dict is returned.
    """
    # Built on every call so the list defaults are never shared between dicts.
    defaults = (
        ('src', 'path'),
        ('tables', ['cell', 'nucleus', 'pathogen', 'cytoplasm']),
        ('cell_types', ['Hela']),
        ('cell_plate_metadata', None),
        ('pathogen_types', ['nc', 'pc']),
        ('pathogen_plate_metadata', [['c1'], ['c2']]),
        ('treatments', None),
        ('treatment_plate_metadata', None),
        ('group_column', 'condition'),
        ('class_column', 'test'),
        ('pathogen_limit', 1000),
        ('nuclei_limit', 1000),
        ('level', 'well'),
        ('save', False),
        ('verbose', False),
    )
    for key, value in defaults:
        settings.setdefault(key, value)
    return settings
|
spacr/submodules.py
CHANGED
@@ -10,6 +10,7 @@ from IPython.display import display
|
|
10
10
|
from sklearn.ensemble import RandomForestClassifier
|
11
11
|
from sklearn.inspection import permutation_importance
|
12
12
|
from math import pi
|
13
|
+
from scipy.stats import chi2_contingency
|
13
14
|
|
14
15
|
import matplotlib.pyplot as plt
|
15
16
|
from natsort import natsorted
|
@@ -844,4 +845,300 @@ def interperate_vision_model(settings={}):
|
|
844
845
|
df.to_csv(save_path)
|
845
846
|
print(f"Saved {save_path}")
|
846
847
|
|
847
|
-
return output
|
848
|
+
return output
|
849
|
+
|
850
|
+
def analyze_endodyogeny(settings):
    """Bin object volumes and compare the bin distribution between groups.

    Measurements are read from ``<src>/measurements/measurements.db`` for
    every source folder, objects below ``min_area_bin`` are removed, a volume
    proxy (``area ** 1.5``) is assigned to doubling-width bins, and the
    proportion of each bin per group is plotted together with a chi-squared
    test on the raw counts.

    Args:
        settings: Dict of options; missing keys are filled in by
            ``set_analyze_endodyogeny_defaults``. ``settings['src']`` may be
            one folder or a list of folders and is normalised to a list.

    Returns:
        dict: ``{'data': DataFrame with the bin columns,
        'chi_squared': DataFrame with the test statistic, p-value, degrees of
        freedom and the expected count per group/bin}``.

    Raises:
        KeyError: If ``settings['group_column']`` is not a column of the
            annotated data.
        ValueError: If no object remains for binning or the minimum bin is
            not positive.
    """
    from .utils import annotate_conditions, save_settings
    from .io import _read_and_merge_data
    from .settings import set_analyze_endodyogeny_defaults

    def _calculate_volume_bins(df, compartment='pathogen', min_area_bin=500, max_bins=None, verbose=False):
        # Adds '<compartment>_volume', '<compartment>_volume_bin' and
        # 'bin_index' to a copy of df and returns that copy.
        area_column = f'{compartment}_area'
        volume_column = f'{compartment}_volume'
        bin_column = f'{compartment}_volume_bin'

        # Work on a copy: the caller passes a filtered slice of a larger frame.
        df = df.copy()
        df[volume_column] = df[area_column] ** 1.5
        min_volume_bin = min_area_bin ** 1.5
        max_volume = df[volume_column].max()

        if not min_volume_bin > 0:
            raise ValueError("min_area_bin must be positive to build volume bins.")
        if not np.isfinite(max_volume) or max_volume < min_volume_bin:
            raise ValueError(f"No {compartment} objects with a volume of at least {min_volume_bin} to bin.")

        # Doubling bin edges. The last edge must be strictly greater than the
        # largest volume because the intervals are closed on the left only.
        bins = [min_volume_bin]
        while bins[-1] <= max_volume:
            bins.append(bins[-1] * 2)

        # Bin labels are ranges with two decimals (e.g. "500.00-1000.00").
        bin_labels = [f"{lower:.2f}-{upper:.2f}" for lower, upper in zip(bins[:-1], bins[1:])]
        if verbose:
            print('Volume bins:', bins)
            print('Volume bin labels:', bin_labels)

        df[bin_column] = pd.cut(df[volume_column], bins=bins, labels=bin_labels, right=False)

        # Numeric (1-based) version of the bins.
        # NOTE(review): rows with a volume below the first edge would become
        # NaN here and fail the int cast; the caller filters them out first.
        df['bin_index'] = pd.cut(df[volume_column], bins=bins, labels=list(range(1, len(bins))), right=False).astype(int)

        # Collapse every bin above max_bins into the last allowed bin. Skipped
        # when max_bins exceeds the number of bins, as there is nothing to cap.
        if max_bins is not None and 1 <= max_bins <= len(bin_labels):
            df.loc[df['bin_index'] > max_bins, 'bin_index'] = max_bins

            bin_labels = bin_labels[:max_bins - 1] + [f">{bins[max_bins - 1]:.2f}"]
            label_by_index = {index: label for index, label in enumerate(bin_labels, start=1)}
            # Keep the column categorical so bins stay in volume order rather
            # than being sorted as strings later on.
            df[bin_column] = pd.Categorical(
                df['bin_index'].map(label_by_index), categories=bin_labels, ordered=True
            )

        if verbose:
            print(df[[volume_column, bin_column, 'bin_index']].head())

        return df

    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
        # Runs the chi-squared test on raw counts and draws the stacked bars.
        # observed=False keeps every bin category as a column, so the legend
        # labels line up with the plotted columns on all pandas versions.
        counts = df.groupby([group_column, bin_column], observed=False).size().unstack(fill_value=0)

        # chi2_contingency rejects tables whose expected frequencies contain
        # zeros, which happens when a bin (or group) is empty everywhere.
        raw_counts = counts.loc[counts.sum(axis=1) > 0, counts.sum(axis=0) > 0]
        chi2, p, dof, expected = chi2_contingency(raw_counts)
        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
        print(f"p-value (raw data): {p:.4e}")

        # Bin labels in plotting order for the legend.
        bin_values = df[bin_column]
        if isinstance(bin_values.dtype, pd.CategoricalDtype):
            bin_labels = list(bin_values.cat.categories)
        else:
            bin_labels = sorted(bin_values.dropna().unique())
        legend_labels = [f"{index}: {label}" for index, label in enumerate(bin_labels, start=1)]

        if level == 'well':
            # Per-well proportions, then mean and SD across wells per group.
            # transform('sum') keeps the original index; groupby.apply would
            # prepend the group keys a second time on pandas >= 2.
            well_counts = df.groupby([group_column, prc_column, bin_column], observed=False).size()
            well_totals = well_counts.groupby(level=[0, 1]).transform('sum')
            well_proportions = (well_counts / well_totals).unstack(fill_value=0)
            # Group/well pairs that do not occur in the data have no objects.
            well_proportions = well_proportions.dropna(how='all')

            mean_proportions = well_proportions.groupby(group_column).mean()
            std_proportions = well_proportions.groupby(group_column).std()

            ax = mean_proportions.plot(
                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
            )
            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
        else:
            # Object-level proportions without aggregation by well.
            proportion_df = counts.div(counts.sum(axis=1), axis=0)

            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
            plt.title('Proportion of Volume Bins by Group')

        plt.xlabel('Group')
        plt.ylabel('Proportion')

        volume_unit = "px³" if settings['um_per_px'] is None else "µm³"
        plt.legend(legend_labels, title=f'Volume Range ({volume_unit})', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.ylim(0, 1)
        fig = plt.gcf()
        return chi2, p, dof, expected, raw_counts, fig

    settings = set_analyze_endodyogeny_defaults(settings)
    save_settings(settings, name='analyze_endodyogeny', show=True)
    output = {}

    # Process data
    if not isinstance(settings['src'], list):
        settings['src'] = [settings['src']]

    locs = [os.path.join(s, 'measurements/measurements.db') for s in settings['src']]

    df, _ = _read_and_merge_data(
        locs,
        tables=settings['tables'],
        verbose=settings['verbose'],
        nuclei_limit=settings['nuclei_limit'],
        pathogen_limit=settings['pathogen_limit']
    )

    area_column = f"{settings['compartment']}_area"
    bin_column = f"{settings['compartment']}_volume_bin"

    # Convert areas from pixels to µm² when a pixel size is given. The
    # threshold is scaled in a local variable so that calling the function
    # again with the same settings dict does not scale it twice.
    min_area_bin = settings['min_area_bin']
    if settings['um_per_px'] is not None:
        px_area = settings['um_per_px'] ** 2
        df[area_column] = df[area_column] * px_area
        min_area_bin = min_area_bin * px_area

    df = df[df[area_column] >= min_area_bin]

    df = annotate_conditions(
        df=df,
        cells=settings['cell_types'],
        cell_loc=settings['cell_plate_metadata'],
        pathogens=settings['pathogen_types'],
        pathogen_loc=settings['pathogen_plate_metadata'],
        treatments=settings['treatments'],
        treatment_loc=settings['treatment_plate_metadata']
    )

    if settings['group_column'] not in df.columns:
        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
        for col in df.columns:
            print(col)
        raise KeyError(settings['group_column'])

    df = df.dropna(subset=[settings['group_column']])
    df = _calculate_volume_bins(df, settings['compartment'], min_area_bin, settings['max_bins'], settings['verbose'])
    output['data'] = df

    # Perform chi-squared test and plot
    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(
        settings, df, settings['group_column'], bin_column=bin_column, level=settings['level']
    )

    # Create a DataFrame with chi-squared test results
    results_df = pd.DataFrame({
        'chi_squared_stat': [chi2],
        'p_value': [p],
        'degrees_of_freedom': [dof]
    })

    # Flatten the expected counts and attach them to every result row.
    expected_df = pd.DataFrame(expected, index=raw_counts.index, columns=raw_counts.columns)
    expected_flat = expected_df.stack().reset_index()
    expected_flat.columns = [settings['group_column'], bin_column, 'expected_count']
    results_df = results_df.merge(expected_flat, how="cross")
    output['chi_squared'] = results_df

    if settings['save']:
        # Results are written next to the first source folder.
        output_dir = os.path.join(settings['src'][0], 'results')
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, 'chi_squared_results.csv')
        output_path_fig = os.path.join(output_dir, 'chi_squared_results.pdf')
        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
        results_df.to_csv(output_path, index=False)
        print(f"Chi-squared results saved to {output_path}")

    plt.show()

    return output
|
1018
|
+
|
1019
|
+
def analyze_class_proportion(settings):
    """
    Tests whether class membership differs between experimental groups and plots the result.

    The measurement database below every source folder is read and merged, the merged table is
    annotated with cell type, pathogen and treatment conditions, a chi-squared test is run on the
    group x class contingency table, and a stacked proportion bar chart plus a plate heatmap of
    the class column are drawn. With settings['save'] enabled, the figures, the test results and
    the annotated table are written to '<first source folder>/results'.

    Parameters:
    settings (dict): Analysis settings; missing entries are filled in by
        set_analyze_class_proportion_defaults. Keys read here: 'src', 'tables', 'verbose',
        'nuclei_limit', 'pathogen_limit', 'cell_types', 'cell_plate_metadata', 'pathogen_types',
        'pathogen_plate_metadata', 'treatments', 'treatment_plate_metadata', 'group_column',
        'class_column', 'level' and 'save'.

    Returns:
    output (dict): 'data' holds the annotated DataFrame and 'chi_squared' a one-row DataFrame with
        the test statistic, the p-value and the degrees of freedom.

    Raises:
    KeyError: If settings['group_column'] is not a column of the annotated data.
    """

    from .utils import annotate_conditions, save_settings
    from .io import _read_and_merge_data
    from .settings import set_analyze_class_proportion_defaults
    from .plot import plot_plates

    def _plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_column='prc', level='object'):
        """
        Runs a chi-squared test on the group x class counts and draws stacked proportion bars.

        Parameters:
        settings (dict): Unused here; kept so the call signature stays unchanged.
        df (pandas.DataFrame): One row per object.
        group_column (str): Column holding the group of each object.
        bin_column (str): Column holding the class of each object.
        prc_column (str): Column identifying the well of each object (only used when level is 'well').
        level (str): 'well' plots mean +/- SD of the per-well proportions, anything else plots
            pooled object-level proportions.

        Returns:
        chi2, p, dof, expected: The values returned by chi2_contingency for the raw counts.
        raw_counts (pandas.DataFrame): Object counts with groups as rows and classes as columns.
        fig (matplotlib.figure.Figure): The figure holding the bar chart.
        """
        # Always calculate chi-squared on raw data
        raw_counts = df.groupby([group_column, bin_column]).size().unstack(fill_value=0)
        chi2, p, dof, expected = chi2_contingency(raw_counts)
        print(f"Chi-squared test statistic (raw data): {chi2:.4f}")
        print(f"p-value (raw data): {p:.4e}")

        # Plot based on level setting
        if level == 'well':
            # Aggregate by well for mean ± SD visualization.
            # transform('sum') keeps the (group, well, class) index unchanged. The previous
            # groupby(...).apply(lambda x: x / x.sum()) prepends the group keys on pandas >= 2.0,
            # which duplicated the index levels and broke the per-group aggregation below.
            well_counts = df.groupby([group_column, prc_column, bin_column]).size()
            well_totals = well_counts.groupby(level=[0, 1]).transform('sum')
            well_proportions = (well_counts / well_totals).unstack(fill_value=0)

            mean_proportions = well_proportions.groupby(level=0).mean()
            std_proportions = well_proportions.groupby(level=0).std()

            ax = mean_proportions.plot(
                kind='bar', stacked=True, yerr=std_proportions, capsize=5, colormap='viridis', figsize=(12, 8)
            )
            plt.title('Proportion of Volume Bins by Group (Mean ± SD across wells)')
        else:
            # Object-level plotting without aggregation: row-normalise the contingency table
            # that was already built for the chi-squared test.
            proportion_df = raw_counts.div(raw_counts.sum(axis=1), axis=0)

            ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
            plt.title('Proportion of Volume Bins by Group')

        plt.xlabel('Group')
        plt.ylabel('Proportion')

        # Place the legend outside the axes so it does not cover the bars
        plt.legend(title='Classes', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.ylim(0, 1)
        fig = ax.get_figure()
        return chi2, p, dof, expected, raw_counts, fig

    settings = set_analyze_class_proportion_defaults(settings)
    save_settings(settings, name='analyze_class_proportion', show=True)
    output = {}

    # Process data
    if not isinstance(settings['src'], list):
        settings['src'] = [settings['src']]

    locs = [os.path.join(s, 'measurements/measurements.db') for s in settings['src']]

    # The png_list table is always requested in addition to the configured tables
    if 'png_list' not in settings['tables']:
        settings['tables'] = settings['tables'] + ['png_list']

    df, _ = _read_and_merge_data(
        locs,
        tables=settings['tables'],
        verbose=settings['verbose'],
        nuclei_limit=settings['nuclei_limit'],
        pathogen_limit=settings['pathogen_limit']
    )

    df = annotate_conditions(
        df=df,
        cells=settings['cell_types'],
        cell_loc=settings['cell_plate_metadata'],
        pathogens=settings['pathogen_types'],
        pathogen_loc=settings['pathogen_plate_metadata'],
        treatments=settings['treatments'],
        treatment_loc=settings['treatment_plate_metadata']
    )

    if settings['group_column'] not in df.columns:
        print(f"{settings['group_column']} not found in DataFrame, please choose from:")
        for col in df.columns:
            print(col)
        # Fail here with a clear message instead of deep inside groupby further down
        raise KeyError(f"group_column '{settings['group_column']}' not found in DataFrame")

    # Objects without a class are assigned class 0
    df[settings['class_column']] = df[settings['class_column']].fillna(0)
    output['data'] = df

    # Perform chi-squared test and plot
    chi2, p, dof, expected, raw_counts, fig = _plot_proportion_stacked_bars(
        settings, df, settings['group_column'], bin_column=settings['class_column'], level=settings['level']
    )

    # Create a DataFrame with chi-squared test results
    results_df = pd.DataFrame({
        'chi_squared_stat': [chi2],
        'p_value': [p],
        'degrees_of_freedom': [dof]
    })

    output['chi_squared'] = results_df

    # Everything is written below the first source folder
    output_dir = os.path.join(settings['src'][0], 'results')

    if settings['save']:
        os.makedirs(output_dir, exist_ok=True)
        output_path_chi = os.path.join(output_dir, 'class_chi_squared_results.csv')
        output_path_data = os.path.join(output_dir, 'class_chi_squared_data.csv')
        output_path_fig = os.path.join(output_dir, 'class_chi_squared.pdf')
        fig.savefig(output_path_fig, dpi=300, bbox_inches='tight')
        results_df.to_csv(output_path_chi, index=False)
        df.to_csv(output_path_data, index=False)
        print(f"Chi-squared results saved to {output_path_chi}")
        print(f"Annotated data saved to {output_path_data}")

    plt.show()

    fig2 = plot_plates(df, variable=settings['class_column'], grouping='mean', min_max='allq', cmap='viridis', min_count=0, verbose=True, dst=None)
    if settings['save']:
        output_path_fig2 = os.path.join(output_dir, 'class_heatmap.pdf')
        fig2.savefig(output_path_fig2, dpi=300, bbox_inches='tight')

    plt.show()
    return output
|
spacr/utils.py
CHANGED
@@ -1371,7 +1371,7 @@ def annotate_conditions(df, cells=None, cell_loc=None, pathogens=None, pathogen_
|
|
1371
1371
|
|
1372
1372
|
return df
|
1373
1373
|
|
1374
|
-
def
|
1374
|
+
def _split_data_v1(df, group_by, object_type):
    """
    Splits the input dataframe into numeric and non-numeric parts, groups them by the specified column,
    and returns the grouped dataframes.

    Numeric columns are averaged per group; for non-numeric columns the first entry per group is kept.
    The 'prcfo' column (and 'prcf' when it is missing) is added to the caller's dataframe in place.

    Parameters:
    df (pandas.DataFrame): The input dataframe.
    group_by (str): The column name to group the dataframes by.
    object_type (str): The column name to concatenate with 'prcf' to create a new column 'prcfo'.

    Returns:
    grouped_numeric (pandas.DataFrame): The grouped dataframe containing numeric columns.
    grouped_non_numeric (pandas.DataFrame): The grouped dataframe containing non-numeric columns.

    Raises:
    KeyError: If 'prcf' is missing and cannot be built because a location column is missing.
    """

    if 'prcf' not in df.columns:
        location_columns = ['plate', 'row_name', 'column_name', 'field']
        missing = [name for name in location_columns if name not in df.columns]
        if missing:
            # Previously the error was only printed and the function then failed with an
            # unhelpful KeyError: 'prcf'; name the columns that are actually missing instead.
            raise KeyError(f"Cannot build 'prcf': missing column(s) {missing}")
        df['prcf'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str) + '_' + df['field'].astype(str)

    df['prcfo'] = df['prcf'] + '_' + df[object_type]
    df = df.set_index(group_by, inplace=False)

    df_numeric = df.select_dtypes(include=np.number)
    df_non_numeric = df.select_dtypes(exclude=np.number)

    grouped_numeric = df_numeric.groupby(df_numeric.index).mean()
    grouped_non_numeric = df_non_numeric.groupby(df_non_numeric.index).first()

    return pd.DataFrame(grouped_numeric), pd.DataFrame(grouped_non_numeric)
|
1407
|
+
|
1408
|
+
def _split_data(df, group_by, object_type):
    """
    Splits the input dataframe into numeric and non-numeric parts, groups them by the specified column,
    and returns the grouped dataframes with conditional aggregation.

    Numeric columns whose name contains one of the size keywords (e.g. 'area', 'perimeter') are
    summed per group, all other numeric columns are averaged. For non-numeric columns the first
    entry per group is kept. The 'prcfo' column (and 'prcf' when it is missing) is added to the
    caller's dataframe in place.

    Parameters:
    df (pandas.DataFrame): The input dataframe.
    group_by (str): The column name to group the dataframes by.
    object_type (str): The column name to concatenate with 'prcf' to create a new column 'prcfo'.

    Returns:
    grouped_numeric (pandas.DataFrame): The grouped dataframe containing numeric columns with conditional aggregation.
    grouped_non_numeric (pandas.DataFrame): The grouped dataframe containing non-numeric columns.

    Raises:
    KeyError: If 'prcf' is missing and cannot be built because a location column is missing.
    """

    # Ensure 'prcf' column exists by concatenating specific columns
    if 'prcf' not in df.columns:
        location_columns = ['plate', 'row_name', 'column_name', 'field']
        missing = [name for name in location_columns if name not in df.columns]
        if missing:
            # Previously the error was only printed and the function then failed with an
            # unhelpful KeyError: 'prcf'; name the columns that are actually missing instead.
            raise KeyError(f"Cannot build 'prcf': missing column(s) {missing}")
        df['prcf'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str) + '_' + df['field'].astype(str)

    # Create the 'prcfo' column
    df['prcfo'] = df['prcf'] + '_' + df[object_type]
    df = df.set_index(group_by, inplace=False)

    # Split the DataFrame into numeric and non-numeric parts
    df_numeric = df.select_dtypes(include=np.number)
    df_non_numeric = df.select_dtypes(exclude=np.number)

    # Define keywords for columns to be summed instead of averaged
    sum_keywords = ['area', 'perimeter', 'convex_area', 'bbox_area', 'filled_area', 'major_axis_length', 'minor_axis_length', 'equivalent_diameter']

    # Create a dictionary for custom aggregation; str() keeps the substring
    # test valid for non-string column labels
    agg_dict = {
        column: 'sum' if any(keyword in str(column) for keyword in sum_keywords) else 'mean'
        for column in df_numeric.columns
    }

    # Apply custom aggregation
    grouped_non_numeric = df_non_numeric.groupby(df_non_numeric.index).first()
    if agg_dict:
        grouped_numeric = df_numeric.groupby(df_numeric.index).agg(agg_dict)
    else:
        # Nothing to aggregate: skip agg() and return an empty frame on the same groups
        grouped_numeric = pd.DataFrame(index=grouped_non_numeric.index)

    return pd.DataFrame(grouped_numeric), pd.DataFrame(grouped_non_numeric)
|
1398
1454
|
|
1399
1455
|
def _calculate_recruitment(df, channel):
|
1400
1456
|
"""
|
@@ -5184,7 +5240,7 @@ def group_feature_class(df, feature_groups=['cell', 'cytoplasm', 'nucleus', 'pat
|
|
5184
5240
|
else:
|
5185
5241
|
return None
|
5186
5242
|
|
5187
|
-
from
|
5243
|
+
from .plot import spacrGraph
|
5188
5244
|
|
5189
5245
|
df[name] = df['feature'].apply(lambda x: find_feature_class(x, feature_groups))
|
5190
5246
|
|
@@ -15,20 +15,20 @@ spacr/gui.py,sha256=ARyn9Q_g8HoP-cXh1nzMLVFCKqthY4v2u9yORyaQqQE,8230
|
|
15
15
|
spacr/gui_core.py,sha256=N7R7yvfK_dJhOReM_kW3Ci8Bokhi1OzsxeKqvSGdvV4,41460
|
16
16
|
spacr/gui_elements.py,sha256=EKlvEg_4_je7jciEdR3NTgPrcTraowa2e2RUt-xqd6M,138254
|
17
17
|
spacr/gui_utils.py,sha256=u9RoIOWpAXFEOnUlLpMQZrc1pWSg6omZsJMIhJdRv_g,41211
|
18
|
-
spacr/io.py,sha256=
|
18
|
+
spacr/io.py,sha256=YlJAT6H8l4ipunMyKzjqoPcf-1AXgUmSyR1YN9WxmDI,142857
|
19
19
|
spacr/logger.py,sha256=lJhTqt-_wfAunCPl93xE65Wr9Y1oIHJWaZMjunHUeIw,1538
|
20
20
|
spacr/measure.py,sha256=2lK-ZcTxLM-MpXV1oZnucRD9iz5aprwahRKw9IEqshg,55085
|
21
21
|
spacr/mediar.py,sha256=FwLvbLQW5LQzPgvJZG8Lw7GniA2vbZx6Jv6vIKu7I5c,14743
|
22
|
-
spacr/ml.py,sha256=
|
22
|
+
spacr/ml.py,sha256=GOQJH8jdTrJQwiLlDrcc9-yCxLFaMx4YD4OJs0-R5YI,77947
|
23
23
|
spacr/openai.py,sha256=5vBZ3Jl2llYcW3oaTEXgdyCB2aJujMUIO5K038z7w_A,1246
|
24
|
-
spacr/plot.py,sha256=
|
24
|
+
spacr/plot.py,sha256=0fne2Msy6niN80oiuwt9ZYw1QwXVnghaUmrwvEZN9-8,161992
|
25
25
|
spacr/sequencing.py,sha256=ClUfwPPK6rNUbUuiEkzcwakzVyDKKUMv9ricrxT8qQY,25227
|
26
|
-
spacr/settings.py,sha256=
|
26
|
+
spacr/settings.py,sha256=LSoDNuz1m7rySh7MWXEL1xlUU4rFiCRVlGvZCSCOqzU,80085
|
27
27
|
spacr/sim.py,sha256=1xKhXimNU3ukzIw-3l9cF3Znc_brW8h20yv8fSTzvss,71173
|
28
|
-
spacr/submodules.py,sha256=
|
28
|
+
spacr/submodules.py,sha256=X1OI0Dsc1qU4lqKFdF2EnloNkLkDzA1hDn7CYbkBmFc,55473
|
29
29
|
spacr/timelapse.py,sha256=KGfG4L4-QnFfgbF7L6C5wL_3gd_rqr05Foje6RsoTBg,39603
|
30
30
|
spacr/toxo.py,sha256=z2nT5aAze3NUIlwnBQcnkARihDwoPfqOgQIVoUluyK0,25087
|
31
|
-
spacr/utils.py,sha256=
|
31
|
+
spacr/utils.py,sha256=vvciLh1gH0nsrCWQw3taUcDjxP59wme3gqrejeNO05w,222943
|
32
32
|
spacr/version.py,sha256=axH5tnGwtgSnJHb5IDhiu4Zjk5GhLyAEDRe-rnaoFOA,409
|
33
33
|
spacr/resources/MEDIAR/.gitignore,sha256=Ff1q9Nme14JUd-4Q3jZ65aeQ5X4uttptssVDgBVHYo8,152
|
34
34
|
spacr/resources/MEDIAR/LICENSE,sha256=yEj_TRDLUfDpHDNM0StALXIt6mLqSgaV2hcCwa6_TcY,1065
|
@@ -151,9 +151,9 @@ spacr/resources/icons/umap.png,sha256=dOLF3DeLYy9k0nkUybiZMe1wzHQwLJFRmgccppw-8b
|
|
151
151
|
spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif,sha256=Tl0ZUfZ_AYAbu0up_nO0tPRtF1BxXhWQ3T3pURBCCRo,7958528
|
152
152
|
spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif,sha256=m8N-V71rA1TT4dFlENNg8s0Q0YEXXs8slIn7yObmZJQ,7958528
|
153
153
|
spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif,sha256=Pbhk7xn-KUP6RSIhJsxQcrHFImBm3GEpLkzx7WOc-5M,7958528
|
154
|
-
spacr-0.3.
|
155
|
-
spacr-0.3.
|
156
|
-
spacr-0.3.
|
157
|
-
spacr-0.3.
|
158
|
-
spacr-0.3.
|
159
|
-
spacr-0.3.
|
154
|
+
spacr-0.3.64.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
|
155
|
+
spacr-0.3.64.dist-info/METADATA,sha256=_07fLYI8eMAYJzOEcAVOemN4TFJAuzAvUrdX1T136T0,6032
|
156
|
+
spacr-0.3.64.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
157
|
+
spacr-0.3.64.dist-info/entry_points.txt,sha256=BMC0ql9aNNpv8lUZ8sgDLQMsqaVnX5L535gEhKUP5ho,296
|
158
|
+
spacr-0.3.64.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
|
159
|
+
spacr-0.3.64.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|