spacr 0.3.2__py3-none-any.whl → 0.3.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +105 -1
- spacr/deep_spacr.py +171 -25
- spacr/io.py +80 -121
- spacr/ml.py +153 -66
- spacr/plot.py +429 -7
- spacr/settings.py +6 -5
- spacr/submodules.py +7 -6
- spacr/toxo.py +9 -4
- spacr/utils.py +152 -13
- {spacr-0.3.2.dist-info → spacr-0.3.22.dist-info}/METADATA +28 -25
- {spacr-0.3.2.dist-info → spacr-0.3.22.dist-info}/RECORD +15 -15
- {spacr-0.3.2.dist-info → spacr-0.3.22.dist-info}/LICENSE +0 -0
- {spacr-0.3.2.dist-info → spacr-0.3.22.dist-info}/WHEEL +0 -0
- {spacr-0.3.2.dist-info → spacr-0.3.22.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.2.dist-info → spacr-0.3.22.dist-info}/top_level.txt +0 -0
spacr/io.py
CHANGED
@@ -22,6 +22,7 @@ from torchvision.transforms import ToTensor
 import seaborn as sns
 from nd2reader import ND2Reader
 from torchvision import transforms
+from sklearn.model_selection import train_test_split
 
 def process_non_tif_non_2D_images(folder):
     """Processes all images in the folder and splits them into grayscale channels, preserving bit depth."""
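The import added here is scikit-learn's standard splitter (it later replaces the `train_test_split` previously imported from `spacr.deep_spacr` in `generate_dataset_from_lists`; see the last hunk of this file). A minimal illustration with a hypothetical path list:

```python
from sklearn.model_selection import train_test_split

# Hypothetical file list; train_test_split shuffles and splits any sequence.
paths = [f"img_{i:03d}.png" for i in range(100)]
train_paths, test_paths = train_test_split(paths, test_size=0.1, random_state=42)
print(len(train_paths), len(test_paths))  # 90 10
```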
@@ -984,47 +985,6 @@ def _move_to_chan_folder(src, regex, timelapse=False, metadata_type=''):
         shutil.move(os.path.join(src, filename), move)
     return
 
-def _merge_channels_v2(src, plot=False):
-    from .plot import plot_arrays
-    """
-    Merge the channels in the given source directory and save the merged files in a 'stack' directory.
-
-    Args:
-        src (str): The path to the source directory containing the channel folders.
-        plot (bool, optional): Whether to plot the merged arrays. Defaults to False.
-
-    Returns:
-        None
-    """
-    src = Path(src)
-    stack_dir = src / 'stack'
-    chan_dirs = [d for d in src.iterdir() if d.is_dir() and d.name in ['01', '02', '03', '04', '00', '1', '2', '3', '4','0']]
-
-    chan_dirs.sort(key=lambda x: x.name)
-    print(f'List of folders in src: {[d.name for d in chan_dirs]}. Single channel folders.')
-    start_time = time.time()
-
-    # First directory and its files
-    dir_files = list(chan_dirs[0].iterdir())
-
-    # Create the 'stack' directory if it doesn't exist
-    stack_dir.mkdir(exist_ok=True)
-    print(f'generated folder with merged arrays: {stack_dir}')
-
-    if _is_dir_empty(stack_dir):
-        with Pool(max(cpu_count() // 2, 1)) as pool:
-            #with Pool(cpu_count()) as pool:
-            merge_func = partial(_merge_file, chan_dirs, stack_dir)
-            pool.map(merge_func, dir_files)
-
-    avg_time = (time.time() - start_time) / len(dir_files)
-    print(f'Average Time: {avg_time:.3f} sec')
-
-    if plot:
-        plot_arrays(src+'/stack')
-
-    return
-
 def _merge_channels(src, plot=False):
     """
     Merge the channels in the given source directory and save the merged files in a 'stack' directory without using multiprocessing.
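The removed `_merge_channels_v2` parallelized the per-file merge with `functools.partial` plus `Pool.map` over half the available cores; the surviving `_merge_channels` does the same work sequentially. A self-contained sketch of that pattern, with a toy `merge_one` standing in for spacr's `_merge_file`:

```python
from functools import partial
from multiprocessing import Pool, cpu_count

def merge_one(chan_dirs, stack_dir, filename):
    # Stand-in for spacr's _merge_file: merge one file's channels into stack_dir.
    return f"{stack_dir}/{filename}: merged {len(chan_dirs)} channels"

if __name__ == "__main__":
    chan_dirs, stack_dir = ["01", "02", "03"], "stack"
    files = [f"frame_{i}.tif" for i in range(8)]
    merge_func = partial(merge_one, chan_dirs, stack_dir)  # freeze shared args
    with Pool(max(cpu_count() // 2, 1)) as pool:           # half the cores, at least one
        print(pool.map(merge_func, files))                 # one task per file
```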
@@ -2384,12 +2344,8 @@ def _results_to_csv(src, df, df_well):
     wells.to_csv(wells_loc, index=True, header=True)
     cells.to_csv(cells_loc, index=True, header=True)
     return cells, wells
-
-###################################################
-# Classify
-###################################################
 
-def read_plot_model_stats(file_path ,save=False):
+def read_plot_model_stats(train_file_path, val_file_path ,save=False):
 
     def _plot_and_save(train_df, val_df, column='accuracy', save=False, path=None, dpi=600):
 
@@ -2418,37 +2374,19 @@ def read_plot_model_stats(file_path ,save=False):
             plt.savefig(pdf_path, format='pdf', dpi=dpi)
         else:
             plt.show()
-    # Read the CSV into a dataframe
-    df = pd.read_csv(file_path, index_col=0)
-
-    # Split the dataframe into train and validation based on the index
-    train_df = df.filter(like='_train', axis=0).copy()
-    val_df = df.filter(like='_val', axis=0).copy()
-
-    fldr_1 = os.path.dirname(file_path)
-
-    train_csv_path = os.path.join(fldr_1, 'train.csv')
-    val_csv_path = os.path.join(fldr_1, 'validation.csv')
 
-
-
-
-    bn_2 = os.path.basename(fldr_2)
-    bn_3 = os.path.basename(fldr_3)
-    model_name = str(f'{bn_1}_{bn_2}_{bn_3}')
+    # Read the CSVs into DataFrames
+    train_df = pd.read_csv(train_file_path, index_col=0)
+    val_df = pd.read_csv(val_file_path, index_col=0)
 
-    #
-
-    val_df['epoch'] = [int(idx.split('_')[0]) for idx in val_df.index]
-
-    # Save dataframes to a CSV file
-    train_df.to_csv(train_csv_path)
-    val_df.to_csv(val_csv_path)
+    # Get the folder path for saving plots
+    fldr_1 = os.path.dirname(train_file_path)
 
     if save:
         # Setting the style
         sns.set(style="whitegrid")
 
+        # Plot and save the results
         _plot_and_save(train_df, val_df, column='accuracy', save=save, path=fldr_1)
         _plot_and_save(train_df, val_df, column='neg_accuracy', save=save, path=fldr_1)
         _plot_and_save(train_df, val_df, column='pos_accuracy', save=save, path=fldr_1)
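After this change `read_plot_model_stats` no longer reads one combined CSV and splits it on `_train`/`_val` index suffixes; it takes the two per-split files directly and plots into the folder of the training file. A hedged usage sketch (paths illustrative; these are the files `_save_progress` now writes):

```python
train_file_path = "model/train.csv"      # illustrative path
val_file_path = "model/validation.csv"   # illustrative path
read_plot_model_stats(train_file_path, val_file_path, save=True)
```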
@@ -2496,50 +2434,53 @@ def _save_model(model, model_type, results_df, dst, epoch, epochs, intermedeate_
 
     return model_path
 
-def _save_progress(dst,
+def _save_progress(dst, train_df, validation_df):
     """
     Save the progress of the classification model.
 
     Parameters:
     dst (str): The destination directory to save the progress.
-
-
+    train_df (pandas.DataFrame): The DataFrame containing training stats.
+    validation_df (pandas.DataFrame): The DataFrame containing validation stats (if available).
 
     Returns:
     None
     """
+
+    def _save_df_to_csv(file_path, df):
+        """
+        Save the given DataFrame to the specified CSV file, either creating a new file or appending to an existing one.
+
+        Parameters:
+        file_path (str): The file path where the CSV will be saved.
+        df (pandas.DataFrame): The DataFrame to save.
+        """
+        if not os.path.exists(file_path):
+            with open(file_path, 'w') as f:
+                df.to_csv(f, index=True, header=True)
+                f.flush()  # Ensure data is written to the file system
+        else:
+            with open(file_path, 'a') as f:
+                df.to_csv(f, index=True, header=False)
+                f.flush()
+
     # Save accuracy, loss, PRAUC
     os.makedirs(dst, exist_ok=True)
-
-
-        results_df.to_csv(results_path, index=True, header=True, mode='w')
-    else:
-        results_df.to_csv(results_path, index=True, header=False, mode='a')
+    results_path_train = os.path.join(dst, 'train.csv')
+    results_path_validation = os.path.join(dst, 'validation.csv')
 
-
-
-    return
+    # Save training data
+    _save_df_to_csv(results_path_train, train_df)
 
-
-
-
+    # Save validation data if available
+    if validation_df is not None:
+        _save_df_to_csv(results_path_validation, validation_df)
 
-
-
-    src (str): The source directory where the settings file will be saved.
+    # Call read_plot_model_stats after ensuring the files are saved
+    read_plot_model_stats(results_path_train, results_path_validation, save=True)
 
-    Returns:
-    None
-    """
-    dst = os.path.join(src,'model')
-    settings_loc = os.path.join(dst,'settings.csv')
-    os.makedirs(dst, exist_ok=True)
-    settings_df = pd.DataFrame(list(settings.items()), columns=['setting_key', 'setting_value'])
-    display(settings_df)
-    settings_df.to_csv(settings_loc, index=False)
     return
 
-
 def _copy_missclassified(df):
     misclassified = df[df['true_label'] != df['predicted_label']]
     for _, row in misclassified.iterrows():
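The new `_save_df_to_csv` helper writes the header only when it creates the file and appends header-less rows afterwards, so per-epoch stats accumulate in one CSV. The same create-or-append idiom in plain pandas (standalone sketch, not the spacr API):

```python
import os
import pandas as pd

def save_df_to_csv(file_path, df):
    # Header only on first write; subsequent calls append rows without a header.
    if not os.path.exists(file_path):
        df.to_csv(file_path, index=True, header=True, mode="w")
    else:
        df.to_csv(file_path, index=True, header=False, mode="a")

stats = pd.DataFrame({"accuracy": [0.91]}, index=["1_train"])
save_df_to_csv("train_demo.csv", stats)  # creates the file with a header row
save_df_to_csv("train_demo.csv", stats)  # appends one more data row
```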
@@ -2869,7 +2810,8 @@ def generate_dataset(settings={}):
     all_paths = []
     for i, src in enumerate(settings['src']):
         db_path = os.path.join(src, 'measurements', 'measurements.db')
-
+        if i == 0:
+            dst = os.path.join(src, 'datasets')
         paths = generate_path_list_from_db(db_path, file_metadata=settings['file_metadata'])
         correct_paths(paths, src)
         all_paths.extend(paths)
@@ -2917,6 +2859,8 @@ def generate_dataset(settings={}):
 
     # Combine the temporary tar files into a final tar
     date_name = datetime.date.today().strftime('%y%m%d')
+    if len(settings['src']) > 1:
+        date_name = f"{date_name}_combined"
     if not settings['file_metadata'] is None:
         tar_name = f"{date_name}_{settings['experiment']}_{settings['file_metadata']}.tar"
     else:
@@ -2967,7 +2911,6 @@ def generate_loaders(src, mode='train', image_size=224, batch_size=32, classes=[
         - val_loaders (list): List of data loaders for validation datasets.
     """
 
-    from .io import spacrDataset
     from .utils import SelectChannels, augment_dataset
 
     chans = []
@@ -3066,10 +3009,6 @@ def generate_loaders(src, mode='train', image_size=224, batch_size=32, classes=[
 
 
 def generate_training_dataset(settings):
-    from .io import _read_and_merge_data, _read_db
-    from .utils import get_paths_from_db, annotate_conditions, save_settings
-    from .settings import set_generate_training_dataset_defaults
-
     # Function to filter png_list_df by prcfo present in df without merging
     def filter_png_list(db_path, settings):
         tables = ['cell', 'nucleus', 'pathogen', 'cytoplasm']
@@ -3173,34 +3112,55 @@ def generate_training_dataset(settings):
             class_paths_ls[i] = random.sample(class_paths, size)
 
         return class_paths_ls
+
+    from .io import _read_and_merge_data, _read_db
+    from .utils import get_paths_from_db, annotate_conditions, save_settings
+    from .settings import set_generate_training_dataset_defaults
 
     # Set default settings and save
    settings = set_generate_training_dataset_defaults(settings)
     save_settings(settings, 'cv_dataset', show=True)
 
-
-    dst = os.path.join(settings['src'], 'datasets', 'training')
+    class_path_list = None
 
-
-
-    for i in range(1, 100000):
-        dst = os.path.join(settings['src'], 'datasets', f'training_{i}')
-        if not os.path.exists(dst):
-            print(f'Creating new directory for training: {dst}')
-            break
+    if isinstance(settings['src'], str):
+        src = [settings['src']]
 
-
-
-
+    for i, src in enumerate(settings['src']):
+        db_path = os.path.join(src, 'measurements', 'measurements.db')
+
+        if len(settings['src']) > 1 and i == 0:
+            dst = os.path.join(src, 'datasets', 'training_all')
+        elif len(settings['src']) == 1:
+            dst = os.path.join(src, 'datasets', 'training')
+
+        # Create a new directory for training data if necessary
+        if os.path.exists(dst):
+            for i in range(1, 100000):
+                dst = dst + f'_{i}'
+                if not os.path.exists(dst):
+                    print(f'Creating new directory for training: {dst}')
+                    break
 
-
-
+        # Select dataset based on dataset mode
+        if settings['dataset_mode'] == 'annotation':
+            class_paths_ls = annotation_based_selection(db_path, dst, settings)
+
+        elif settings['dataset_mode'] == 'metadata':
+            class_paths_ls = metadata_based_selection(db_path, settings)
+
+        elif settings['dataset_mode'] == 'measurement':
+            class_paths_ls = measurement_based_selection(settings, db_path)
+
+        if class_path_list is None:
+            class_path_list = [[] for _ in range(len(class_paths_ls))]
 
-
-
+        # Extend each list in class_path_list with the corresponding list from class_paths_ls
+        for idx in range(len(class_paths_ls)):
+            class_path_list[idx].extend(class_paths_ls[idx])
 
     # Generate and return training and testing directories
-    train_class_dir, test_class_dir = generate_dataset_from_lists(dst, class_data=
+    train_class_dir, test_class_dir = generate_dataset_from_lists(dst, class_data=class_path_list, classes=settings['classes'], test_split=settings['test_split'])
 
     return train_class_dir, test_class_dir
 
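`generate_training_dataset` now accepts a list of sources, pools the per-class path lists across them, and writes a `training_all` dataset under the first source. A hedged sketch of a call (only keys visible in this hunk are shown; `set_generate_training_dataset_defaults` is assumed to fill the rest):

```python
# Illustrative values; remaining settings are assumed to come from defaults.
settings = {
    'src': ['/data/plate1', '/data/plate2'],  # list of screens to pool
    'dataset_mode': 'annotation',             # or 'metadata' / 'measurement'
    'classes': ['nc', 'pc'],
    'test_split': 0.1,
}
train_class_dir, test_class_dir = generate_training_dataset(settings)
```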
@@ -3234,7 +3194,6 @@ def training_dataset_from_annotation(db_path, dst, annotation_column='test', ann
 
 def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
     from .utils import print_progress
-    from .deep_spacr import train_test_split
     # Make sure that the length of class_data matches the length of classes
     if len(class_data) != len(classes):
         raise ValueError("class_data and classes must have the same length.")
spacr/ml.py
CHANGED
@@ -1,4 +1,4 @@
-import os, shap
+import os, shap, re
 import pandas as pd
 import numpy as np
 from scipy import stats
@@ -354,75 +354,128 @@ def perform_regression(settings):
     from .settings import get_perform_regression_default_settings
     from .toxo import go_term_enrichment_by_column, custom_volcano_plot
 
-
-
-    if
-    settings['
-
-
+    def _perform_regression_read_data(settings):
+
+        if isinstance(settings['score_data'], list) and isinstance(settings['count_data'], list):
+            settings['plate'] = None
+            if len(settings['score_data']) == 1:
+                settings['score_data'] = settings['score_data'][0]
+            if len(settings['count_data']) == 1:
+                settings['count_data'] = settings['count_data'][0]
+            else:
+                count_data_df = pd.DataFrame()
+                for i, count_data in enumerate(settings['count_data']):
+                    df = pd.read_csv(count_data)
+                    df['plate_name'] = f'plate{i+1}'
+                    if 'column' in df.columns:
+                        df['col'] = df['column']
+                    count_data_df = pd.concat([count_data_df, df])
+                print('Count data:', len(count_data_df))
+
+                score_data_df = pd.DataFrame()
+                for i, score_data in enumerate(settings['score_data']):
+                    df = pd.read_csv(score_data)
+                    df['plate_name'] = f'plate{i+1}'
+                    if 'column' in df.columns:
+                        df['col'] = df['column']
+                    score_data_df = pd.concat([score_data_df, df])
+                print('Score data:', len(score_data_df))
         else:
-            count_data_df = pd.
-
-                df = pd.read_csv(count_data)
-                df['plate_name'] = f'plate{i+1}'
-                count_data_df = pd.concat([count_data_df, df])
-            print('Count data:', len(count_data_df))
-
-            score_data_df = pd.DataFrame()
-            for i, score_data in enumerate(settings['score_data']):
-                df = pd.read_csv(score_data)
-                df['plate_name'] = f'plate{i+1}'
-                score_data_df = pd.concat([score_data_df, df])
-            print('Score data:', len(score_data_df))
-    else:
-        count_data_df = pd.read_csv(settings['count_data'])
-        score_data_df = pd.read_csv(settings['score_data'])
-
-    reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']
-    if settings['regression_type'] not in reg_types:
-        print(f'Possible regression types: {reg_types}')
-        raise ValueError(f"Unsupported regression type {settings['regression_type']}")
-
-    if settings['dependent_variable'] not in score_data_df.columns:
-        print(f'Columns in DataFrame:')
-        for col in score_data_df.columns:
-            print(col)
-        raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
-
-    if isinstance(settings['count_data'], list):
-        src = os.path.dirname(settings['count_data'][0])
-        csv_path = settings['count_data'][0]
-    else:
-        src = os.path.dirname(settings['count_data'])
-        csv_path = settings['count_data']
+            count_data_df = pd.read_csv(settings['count_data'])
+            score_data_df = pd.read_csv(settings['score_data'])
 
-
-
-
-
+        print(f"Dependent variable: {len(score_data_df)}")
+        print(f"Independent variable: {len(count_data_df)}")
+
+        if settings['dependent_variable'] not in score_data_df.columns:
+            print(f'Columns in DataFrame:')
+            for col in score_data_df.columns:
+                print(col)
+            raise ValueError(f"Dependent variable {settings['dependent_variable']} not found in the DataFrame")
+
+        if 'prediction_probability_class_1' in score_data_df.columns:
+            if not settings['class_1_threshold'] is None:
+                score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
+
+        reg_types = ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']
+        if settings['regression_type'] not in reg_types:
+            print(f'Possible regression types: {reg_types}')
+            raise ValueError(f"Unsupported regression type {settings['regression_type']}")
 
-
-
+        return count_data_df, score_data_df
+
+    def _perform_regression_set_paths(settings):
+
+        if isinstance(settings['score_data'], list):
+            score_data = settings['score_data'][0]
+        else:
+            score_data = settings['score_data']
+
+        score_source = os.path.splitext(os.path.basename(score_data))[0]
+
+        if isinstance(settings['count_data'], list):
+            src = os.path.dirname(settings['count_data'][0])
+            csv_path = settings['count_data'][0]
+        else:
+            src = os.path.dirname(settings['count_data'])
+            csv_path = settings['count_data']
 
-
-
-
-
-
-
+        settings['src'] = src
+        res_folder = os.path.join(src, 'results', score_source, settings['regression_type'])
+
+        if isinstance(settings['count_data'], list):
+            res_folder = os.path.join(res_folder, 'list')
+
+        os.makedirs(res_folder, exist_ok=True)
+        results_filename = 'results.csv'
+        results_filename_gene = 'results_gene.csv'
+        results_filename_grna = 'results_grna.csv'
+        hits_filename = 'results_significant.csv'
+        results_path=os.path.join(res_folder, results_filename)
+        results_path_gene=os.path.join(res_folder, results_filename_gene)
+        results_path_grna=os.path.join(res_folder, results_filename_grna)
+        hits_path=os.path.join(res_folder, hits_filename)
+
+        return results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path
+
+    def _count_variable_instances(df, column_1, column_2):
+        if column_1 is not None:
+            n_grna = df[column_1].value_counts().reset_index()
+            n_grna.columns = [column_1, f'n_{column_1}']
+        if column_2 is not None:
+            n_gene = df[column_2].value_counts().reset_index()
+            n_gene.columns = [column_2, f'n_{column_2}']
+        if column_1 is not None and column_2 is not None:
+            return df, n_grna, n_gene
+        elif column_1 is not None:
+            return df, n_grna
+        elif column_2 is not None:
+            return df, n_gene
+        else:
+            return df
 
     settings = get_perform_regression_default_settings(settings)
+    count_data_df, score_data_df = _perform_regression_read_data(settings)
+    results_path, results_path_gene, results_path_grna, hits_path, res_folder, csv_path = _perform_regression_set_paths(settings)
     save_settings(settings, name='regression', show=True)
 
     score_data_df = clean_controls(score_data_df, settings['pc'], settings['nc'], settings['other'])
-
-    if 'prediction_probability_class_1' in score_data_df.columns:
-        if not settings['class_1_threshold'] is None:
-            score_data_df['predictions'] = (score_data_df['prediction_probability_class_1'] >= settings['class_1_threshold']).astype(int)
+    print(f"Dependent variable after clean_controls: {len(score_data_df)}")
 
     dependent_df, dependent_variable = process_scores(score_data_df, settings['dependent_variable'], settings['plate'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
-
-
+    print(f"Dependent variable after process_scores: {len(dependent_df)}")
+
+    filter_value = [settings['nc'], settings['pc']]
+
+    if settings['other'] is not None:
+        if isinstance(settings['other'], str):
+            settings['other'] = [settings['other']]
+        filter_value.extend(settings['other'])
+
+    independent_df = process_reads(count_data_df, settings['fraction_threshold'], settings['plate'], filter_column=settings['location_column'], filter_value=filter_value)
+    independent_df, n_grna, n_gene = _count_variable_instances(independent_df, column_1='grna', column_2='gene')
+
+    print(f"Independent variable after process_reads: {len(independent_df)}")
 
     merged_df = pd.merge(independent_df, dependent_df, on='prc')
 
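`_count_variable_instances` is a thin wrapper around `value_counts`; the `n_grna`/`n_gene` tables it returns are merged into the coefficient table in the next hunk. The core idiom on a toy frame:

```python
import pandas as pd

df = pd.DataFrame({'grna': ['g1', 'g1', 'g2'], 'gene': ['A', 'A', 'B']})
n_grna = df['grna'].value_counts().reset_index()
n_grna.columns = ['grna', 'n_grna']  # normalize column names across pandas versions
print(n_grna)  # g1 appears twice, g2 once
```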
@@ -436,7 +489,20 @@ def perform_regression(settings):
 
     model, coef_df = regression(merged_df, csv_path, dependent_variable, settings['regression_type'], settings['alpha'], settings['random_row_column_effects'], highlight=settings['highlight'], dst=res_folder, cov_type=settings['cov_type'])
 
+    coef_df['grna'] = coef_df['feature'].apply(lambda x: re.search(r'grna\[(.*?)\]', x).group(1) if 'grna' in x else None)
+    coef_df['gene'] = coef_df['feature'].apply(lambda x: re.search(r'gene\[(.*?)\]', x).group(1) if 'gene' in x else None)
+    coef_df = coef_df.merge(n_grna, how='left', on='grna')
+    coef_df = coef_df.merge(n_gene, how='left', on='gene')
+    display(coef_df)
+
+    gene_coef_df = coef_df[coef_df['n_gene'] != None]
+    grna_coef_df = coef_df[coef_df['n_grna'] != None]
+    gene_coef_df = gene_coef_df.dropna(subset=['n_gene'])
+    grna_coef_df = grna_coef_df.dropna(subset=['n_grna'])
+
     coef_df.to_csv(results_path, index=False)
+    gene_coef_df.to_csv(results_path_gene, index=False)
+    grna_coef_df.to_csv(results_path_grna, index=False)
 
     if settings['regression_type'] == 'lasso':
         significant = coef_df[coef_df['coefficient'] > 0]
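The two new `re.search` lines pull the level name out of coefficient labels of the form `grna[...]` / `gene[...]` produced by the regression design matrix. In isolation (label value is illustrative):

```python
import re

feature = "grna[TGGT1_001_g1]"  # illustrative coefficient label
match = re.search(r"grna\[(.*?)\]", feature)
print(match.group(1))  # TGGT1_001_g1
```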
@@ -460,18 +526,24 @@ def perform_regression(settings):
         filename, _ = os.path.splitext(file)
         _ = merge_regression_res_with_metadata(hits_path, metadata_file, name=filename)
         merged_df = merge_regression_res_with_metadata(results_path, metadata_file, name=filename)
+        gene_merged_df = merge_regression_res_with_metadata(results_path_gene, metadata_file, name=filename)
+        grna_merged_df = merge_regression_res_with_metadata(results_path_grna, metadata_file, name=filename)
 
     if settings['toxo']:
 
         data_path = merged_df
+        data_path_gene = gene_merged_df
+        data_path_grna = grna_merged_df
         base_dir = os.path.dirname(os.path.abspath(__file__))
         metadata_path = os.path.join(base_dir, 'resources', 'data', 'lopit.csv')
 
         custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
-
-        metadata_path =
-
-
+        custom_volcano_plot(data_path_gene, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
+        custom_volcano_plot(data_path_grna, metadata_path, metadata_column='tagm_location', string_list=[settings['highlight']], point_size=50, figsize=20)
+
+        if len(significant) > 2:
+            metadata_path = os.path.join(base_dir, 'resources', 'data', 'toxoplasma_metadata.csv')
+            go_term_enrichment_by_column(significant, metadata_path)
 
     print('Significant Genes')
     display(significant)
@@ -481,29 +553,43 @@ def perform_regression(settings):
 
     return output
 
-def process_reads(csv_path, fraction_threshold, plate):
+def process_reads(csv_path, fraction_threshold, plate, filter_column=None, filter_value=None):
 
     if isinstance(csv_path, pd.DataFrame):
         csv_df = csv_path
     else:
         # Read the CSV file into a DataFrame
         csv_df = pd.read_csv(csv_path)
-
+
     if 'plate_name' in csv_df.columns:
         csv_df = csv_df.rename(columns={'plate_name': 'plate'})
     if 'column_name' in csv_df.columns:
         csv_df = csv_df.rename(columns={'column_name': 'column'})
+    if 'col' in csv_df.columns:
+        csv_df = csv_df.rename(columns={'col': 'column'})
    if 'row_name' in csv_df.columns:
         csv_df = csv_df.rename(columns={'row_name': 'row'})
     if 'grna_name' in csv_df.columns:
         csv_df = csv_df.rename(columns={'grna_name': 'grna'})
     if 'plate_row' in csv_df.columns:
         csv_df[['plate', 'row']] = csv_df['plate_row'].str.split('_', expand=True)
+
     if not 'plate' in csv_df.columns:
         if not plate is None:
             csv_df['plate'] = plate
         else:
             csv_df['plate'] = 'plate1'
+
+    if isinstance(filter_column, str):
+        filter_column = [filter_column]
+
+    if isinstance(filter_value, str):
+        filter_value = [filter_value]
+
+    if isinstance(filter_column, list):
+        for filter_col in filter_column:
+            for value in filter_value:
+                csv_df = csv_df[csv_df[filter_col] != value]
 
     # Ensure the necessary columns are present
     if not all(col in csv_df.columns for col in ['row','column','grna','count']):
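The new `filter_column`/`filter_value` parameters apply exclusion filters: any row whose value in a filter column equals one of the filter values is dropped, which is how `perform_regression` removes control wells from the read counts. The idiom on a toy frame:

```python
import pandas as pd

csv_df = pd.DataFrame({'column': ['c1', 'c2', 'c3'], 'count': [10, 20, 30]})
filter_column, filter_value = ['column'], ['c1', 'c3']  # e.g. control locations
for filter_col in filter_column:
    for value in filter_value:
        csv_df = csv_df[csv_df[filter_col] != value]
print(csv_df)  # only the 'c2' row remains
```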
@@ -587,7 +673,8 @@ def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='m
     if 'col' not in df.columns:
         df['col'] = df['column']
 
-    df['prc'] = df['plate'] + '_' + df['row'] + '_' + df['col']
+    df['prc'] = df['plate'].astype(str) + '_' + df['row'].astype(str) + '_' + df['col'].astype(str)
+
     df = df[['prc', dependent_variable]]
 
     # Group by prc and calculate the mean and count of the dependent_variable
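The `astype(str)` change matters because plate, row, or column identifiers loaded from CSV can be numeric, and concatenating a numeric Series with a string raises a TypeError. Minimal reproduction:

```python
import pandas as pd

df = pd.DataFrame({'plate': [1, 1], 'row': ['r1', 'r2'], 'col': [3, 4]})
# df['plate'] + '_' + df['row'] raises TypeError (int Series + str)
df['prc'] = df['plate'].astype(str) + '_' + df['row'].astype(str) + '_' + df['col'].astype(str)
print(df['prc'].tolist())  # ['1_r1_3', '1_r2_4']
```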