spacr 0.3.1__py3-none-any.whl → 0.3.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +19 -3
- spacr/cellpose.py +311 -0
- spacr/core.py +245 -2494
- spacr/deep_spacr.py +316 -48
- spacr/gui.py +1 -0
- spacr/gui_core.py +74 -63
- spacr/gui_elements.py +110 -5
- spacr/gui_utils.py +346 -6
- spacr/io.py +680 -141
- spacr/logger.py +28 -9
- spacr/measure.py +107 -95
- spacr/mediar.py +0 -3
- spacr/ml.py +1051 -0
- spacr/openai.py +37 -0
- spacr/plot.py +707 -20
- spacr/resources/data/lopit.csv +3833 -0
- spacr/resources/data/toxoplasma_metadata.csv +8843 -0
- spacr/resources/icons/convert.png +0 -0
- spacr/resources/{models/cp/toxo_plaque_cyto_e25000_X1120_Y1120.CP_model → icons/dna_matrix.mp4} +0 -0
- spacr/sequencing.py +241 -1311
- spacr/settings.py +134 -47
- spacr/sim.py +0 -2
- spacr/submodules.py +349 -0
- spacr/timelapse.py +0 -2
- spacr/toxo.py +238 -0
- spacr/utils.py +419 -180
- {spacr-0.3.1.dist-info → spacr-0.3.22.dist-info}/METADATA +31 -22
- {spacr-0.3.1.dist-info → spacr-0.3.22.dist-info}/RECORD +32 -33
- spacr/chris.py +0 -50
- spacr/graph_learning.py +0 -340
- spacr/resources/MEDIAR/.git +0 -1
- spacr/resources/MEDIAR_weights/.DS_Store +0 -0
- spacr/resources/icons/.DS_Store +0 -0
- spacr/resources/icons/spacr_logo_rotation.gif +0 -0
- spacr/resources/models/cp/toxo_plaque_cyto_e25000_X1120_Y1120.CP_model_settings.csv +0 -23
- spacr/resources/models/cp/toxo_pv_lumen.CP_model +0 -0
- spacr/sim_app.py +0 -0
- {spacr-0.3.1.dist-info → spacr-0.3.22.dist-info}/LICENSE +0 -0
- {spacr-0.3.1.dist-info → spacr-0.3.22.dist-info}/WHEEL +0 -0
- {spacr-0.3.1.dist-info → spacr-0.3.22.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.1.dist-info → spacr-0.3.22.dist-info}/top_level.txt +0 -0
spacr/submodules.py
ADDED
@@ -0,0 +1,349 @@
|
|
1
|
+
import seaborn as sns
|
2
|
+
import os, random, sqlite3
|
3
|
+
import pandas as pd
|
4
|
+
import numpy as np
|
5
|
+
import cellpose
|
6
|
+
from skimage.measure import regionprops, label
|
7
|
+
from cellpose import models as cp_models
|
8
|
+
from cellpose import train as train_cp
|
9
|
+
from IPython.display import display
|
10
|
+
|
11
|
+
def analyze_recruitment(settings=None):
    """
    Quantify recruitment of a channel of interest to pathogens, per object and per well.

    The function reads the ``cell``, ``nucleus``, ``pathogen`` and ``cytoplasm``
    tables from ``<src>/measurements/measurements.db``, annotates every row with
    its experimental condition, filters objects, computes recruitment as
    pathogen mean intensity divided by cytoplasm mean intensity for the channel
    of interest, aggregates by well, plots the results and writes them to CSV.

    Parameters:
    settings (dict | None): analysis settings. Missing keys are filled in by
        ``get_analyze_recruitment_default_settings``. Keys read here include
        ``src``, ``cell_types``, ``pathogen_types``, ``treatments``, the
        ``*_plate_metadata`` entries, ``*_chann_dim``, ``*_size_range``,
        ``*_intensity_range``, ``channel_of_interest``, ``target``,
        ``target_intensity_min``, ``channel_dims``, ``cells_per_well``,
        ``plot``, ``plot_nr``, ``plot_control`` and ``figuresize``.

    Returns:
    list: ``[cells, wells]`` - the two values returned by
        ``_results_to_csv(settings['src'], df, df_well)``.
    """

    from .io import _read_and_merge_data, _results_to_csv
    from .plot import plot_image_mask_overlay, _plot_controls, _plot_recruitment
    from .utils import _object_filter, annotate_conditions, _calculate_recruitment, _group_by_well, save_settings
    from .settings import get_analyze_recruitment_default_settings

    # Use a fresh dict per call instead of a shared mutable default argument.
    if settings is None:
        settings = {}

    settings = get_analyze_recruitment_default_settings(settings=settings)
    save_settings(settings, name='recruitment')

    print(f"Cell(s): {settings['cell_types']}, in {settings['cell_plate_metadata']}")
    print(f"Pathogen(s): {settings['pathogen_types']}, in {settings['pathogen_plate_metadata']}")
    print(f"Treatment(s): {settings['treatments']}, in {settings['treatment_plate_metadata']}")

    channel = settings['channel_of_interest']
    mask_chans = [settings['nucleus_chann_dim'], settings['pathogen_chann_dim'], settings['cell_chann_dim']]

    print(f"channel:{channel} = {settings['target']}")

    df, _ = _read_and_merge_data(locs=[settings['src']+'/measurements/measurements.db'],
                                 tables=['cell', 'nucleus', 'pathogen', 'cytoplasm'],
                                 verbose=True,
                                 nuclei_limit=settings['nuclei_limit'],
                                 pathogen_limit=settings['pathogen_limit'],
                                 uninfected=settings['uninfected'])

    df = annotate_conditions(df,
                             cells=settings['cell_types'],
                             cell_loc=settings['cell_plate_metadata'],
                             pathogens=settings['pathogen_types'],
                             pathogen_loc=settings['pathogen_plate_metadata'],
                             treatments=settings['treatments'],
                             treatment_loc=settings['treatment_plate_metadata'])

    df = df.dropna(subset=['condition'])
    print(f'After dropping non-annotated wells: {len(df)} rows')
    print(f"found: {len(df['file_name'])} files")

    # An unset size range means "accept every size".
    _max = 10**100
    for range_key in ('cell_size_range', 'nucleus_size_range', 'pathogen_size_range'):
        if settings[range_key] is None:
            settings[range_key] = [0, _max]

    if settings['plot']:
        merged_path = os.path.join(settings['src'], 'merged')
        if os.path.exists(merged_path):
            # Plotting is best effort: a failure here must not abort the analysis.
            try:
                for idx, file in enumerate(os.listdir(merged_path)):
                    if idx > settings['plot_nr']:
                        break
                    plot_image_mask_overlay(os.path.join(merged_path, file),
                                            settings['channel_dims'],
                                            settings['cell_chann_dim'],
                                            settings['nucleus_chann_dim'],
                                            settings['pathogen_chann_dim'],
                                            figuresize=10,
                                            normalize=True,
                                            thickness=3,
                                            save_pdf=True)
            except Exception as e:
                print(f'Failed to plot images with outlines, Error: {e}')

    if settings['cell_chann_dim'] is not None:
        df = _object_filter(df, 'cell', settings['cell_size_range'], settings['cell_intensity_range'], mask_chans, 0)
        # The previous test (`not x is None or not x is 0`) was always true, so a
        # threshold of None reached the comparison below and raised. Only None
        # disables the filter; a threshold of 0 still drops rows that are <= 0.
        target_intensity_min = settings['target_intensity_min']
        if target_intensity_min is not None:
            df = df[df[f"cell_channel_{channel}_percentile_95"] > target_intensity_min]
            print(f"After channel {channel} filtration", len(df))
    if settings['nucleus_chann_dim'] is not None:
        df = _object_filter(df, 'nucleus', settings['nucleus_size_range'], settings['nucleus_intensity_range'], mask_chans, 1)
    if settings['pathogen_chann_dim'] is not None:
        df = _object_filter(df, 'pathogen', settings['pathogen_size_range'], settings['pathogen_intensity_range'], mask_chans, 2)

    df['recruitment'] = df[f"pathogen_channel_{channel}_mean_intensity"]/df[f"cytoplasm_channel_{channel}_mean_intensity"]

    for chan in settings['channel_dims']:
        df = _calculate_recruitment(df, channel=chan)
    print(f'calculated recruitment for: {len(df)} rows')

    df_well = _group_by_well(df)
    print(f'found: {len(df_well)} wells')

    # Keep only wells with enough cells, and only the objects that belong to them.
    df_well = df_well[df_well['cells_per_well'] >= settings['cells_per_well']]
    prc_list = df_well['prc'].unique().tolist()
    df = df[df['prc'].isin(prc_list)]
    print(f"After cells per well filter: {len(df)} cells in {len(df_well)} wells left wth threshold {settings['cells_per_well']}")

    if settings['plot_control']:
        _plot_controls(df, mask_chans, channel, figuresize=5)

    print(f'PV level: {len(df)} rows')
    _plot_recruitment(df, 'by PV', channel, columns=[], figuresize=settings['figuresize'])
    print(f'well level: {len(df_well)} rows')
    _plot_recruitment(df_well, 'by well', channel, columns=[], figuresize=settings['figuresize'])
    cells, wells = _results_to_csv(settings['src'], df, df_well)

    return [cells, wells]
|
124
|
+
|
125
|
+
def analyze_plaques(folder):
    """
    Count and size the connected objects (plaques) in every image file of a folder.

    Each regular file in ``folder`` is read with ``cellpose.io.imread``, labelled
    with ``skimage.measure.label`` and measured with ``regionprops``. Three
    tables are then written to ``<folder>/plaques_analysis.db`` (replacing any
    existing tables of the same name):

    - ``summary``: file, object_count, average_size
    - ``details``: file, plaque_size (one row per object)
    - ``stats``:   file, plaque_count, average_size, std_dev_size

    Parameters:
    folder (str): directory containing the images to analyse.

    Returns:
    None
    """
    db_filename = 'plaques_analysis.db'

    summary_data = []
    details_data = []
    stats_data = []

    for filename in os.listdir(folder):
        filepath = os.path.join(folder, filename)
        # The results database lives in the same folder; without this guard a
        # second run would try to read it as an image.
        if filename == db_filename or not os.path.isfile(filepath):
            continue

        image = cellpose.io.imread(filepath)
        # NOTE(review): assumes imread can return None for files it cannot decode - confirm.
        if image is None:
            print(f"Skipping unreadable file: {filepath}")
            continue

        regions = regionprops(label(image))
        sizes = [region.area for region in regions]
        object_count = len(sizes)
        average_size = np.mean(sizes) if sizes else 0
        std_dev_size = np.std(sizes) if sizes else 0

        summary_data.append({'file': filename, 'object_count': object_count, 'average_size': average_size})
        stats_data.append({'file': filename, 'plaque_count': object_count, 'average_size': average_size, 'std_dev_size': std_dev_size})
        details_data.extend({'file': filename, 'plaque_size': size} for size in sizes)

    # Explicit columns keep the table schema intact even when a list is empty
    # (no images, or images without any plaque).
    summary_df = pd.DataFrame(summary_data, columns=['file', 'object_count', 'average_size'])
    details_df = pd.DataFrame(details_data, columns=['file', 'plaque_size'])
    stats_df = pd.DataFrame(stats_data, columns=['file', 'plaque_count', 'average_size', 'std_dev_size'])

    db_name = os.path.join(folder, db_filename)
    conn = sqlite3.connect(db_name)
    try:
        summary_df.to_sql('summary', conn, if_exists='replace', index=False)
        details_df.to_sql('details', conn, if_exists='replace', index=False)
        stats_df.to_sql('stats', conn, if_exists='replace', index=False)
    finally:
        # Release the file handle even if one of the writes fails.
        conn.close()

    print(f"Analysis completed and saved to database '{db_name}'.")
|
165
|
+
|
166
|
+
def train_cellpose(settings):
    """
    Train a Cellpose model on the images in ``settings['img_src']``.

    Training masks are read from ``<img_src>/masks``. When ``settings['test']``
    is set, test images are read from a ``test`` folder next to ``img_src`` and
    their masks from ``test/mask``. Images are loaded either through the
    normalizing loader (``settings['normalize']``) or the plain loader, and the
    network is trained with ``cellpose.train.train_seg``.

    Parameters:
    settings (dict): training settings; missing keys are filled in by
        ``get_train_cellpose_default_settings``. Keys read here include
        ``img_src``, ``test``, ``test_img_src``, ``test_mask_src``, ``resize``,
        ``width_height``, ``from_scratch``, ``model_name``, ``model_type``,
        ``n_epochs``, ``diameter``, ``normalize``, ``grayscale``,
        ``batch_size``, ``learning_rate``, ``weight_decay`` and ``rescale``.

    Returns:
    None

    Raises:
    ValueError: if ``model_type`` is not 'cyto', 'cyto2' or 'nucleus' and
        ``grayscale`` is not set, because no channel layout is defined for it.
    """

    from .io import _load_normalized_images_and_labels, _load_images_and_labels
    from .settings import get_train_cellpose_default_settings
    from .utils import save_settings

    settings = get_train_cellpose_default_settings(settings)

    img_src = settings['img_src']
    mask_src = os.path.join(img_src, 'masks')
    test_img_src = settings['test_img_src']
    test_mask_src = settings['test_mask_src']

    if settings['test']:
        test_img_src = os.path.join(os.path.dirname(settings['img_src']), 'test')
        # Masks sit inside the derived test folder; the previous code joined onto
        # settings['test_img_src'], which ignores the folder computed above.
        test_mask_src = os.path.join(test_img_src, 'mask')

    test_images, test_masks, test_image_names, test_mask_names = None, None, None, None
    print(settings)

    # The size suffix is only known when resizing; building the name this way
    # avoids a NameError for from_scratch=True with resize=False.
    size_tag = ''
    if settings['resize']:
        target_width = settings['width_height'][0]
        target_height = settings['width_height'][1]
        size_tag = f"_X{target_width}_Y{target_height}"
    prefix = 'scratch_' if settings['from_scratch'] else ''
    model_name = f"{prefix}{settings['model_name']}_{settings['model_type']}_e{settings['n_epochs']}{size_tag}.CP_model"

    # Resolve the channel layout up front so an unsupported model type fails
    # with a clear message instead of an undefined variable after loading data.
    channels_by_model = {'cyto': [0, 1], 'cyto2': [0, 2], 'nucleus': [0, 0]}
    if settings['grayscale']:
        cp_channels = [0, 0]
    elif settings['model_type'] in channels_by_model:
        cp_channels = channels_by_model[settings['model_type']]
    else:
        raise ValueError(f"No channel layout defined for model_type '{settings['model_type']}'; "
                         "use 'cyto', 'cyto2' or 'nucleus', or set grayscale=True")

    # Prefer an explicit settings['mask_src'], otherwise use the derived mask folder.
    model_save_path = os.path.join(settings.get('mask_src') or mask_src, 'models', 'cellpose_model')
    print(model_save_path)
    os.makedirs(model_save_path, exist_ok=True)

    save_settings(settings, name=model_name)

    if settings['from_scratch']:
        model = cp_models.CellposeModel(gpu=True, model_type=settings['model_type'], diam_mean=settings['diameter'], pretrained_model=None)
    else:
        model = cp_models.CellposeModel(gpu=True, model_type=settings['model_type'])

    def _load_normalized(image_dir, label_dir):
        # Collect the .tif files of both folders and run the normalizing loader on them.
        image_files = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith('.tif')]
        label_files = [os.path.join(label_dir, f) for f in os.listdir(label_dir) if f.endswith('.tif')]
        return _load_normalized_images_and_labels(image_files,
                                                  label_files,
                                                  settings['channels'],
                                                  settings['percentiles'],
                                                  settings['circular'],
                                                  settings['invert'],
                                                  settings['verbose'],
                                                  settings['remove_background'],
                                                  settings['background'],
                                                  settings['Signal_to_noise'],
                                                  settings['target_height'],
                                                  settings['target_width'])

    def _drop_trailing_singleton(arrays):
        # Remove a trailing channel axis of length one.
        return [np.squeeze(arr) if arr.shape[-1] == 1 else arr for arr in arrays]

    if settings['normalize']:
        images, masks, image_names, mask_names, _ = _load_normalized(img_src, mask_src)
        images = _drop_trailing_singleton(images)

        if settings['test']:
            # The loader returns five values (as for the training set); the
            # previous four-name unpacking raised ValueError.
            test_images, test_masks, test_image_names, test_mask_names, _ = _load_normalized(test_img_src, test_mask_src)
            test_images = _drop_trailing_singleton(test_images)
    else:
        images, masks, image_names, mask_names = _load_images_and_labels(img_src, mask_src, settings['circular'], settings['invert'])
        images = _drop_trailing_singleton(images)

        if settings['test']:
            test_images, test_masks, test_image_names, test_mask_names = _load_images_and_labels(test_img_src,
                                                                                                 test_mask_src,
                                                                                                 settings['circular'],
                                                                                                 settings['invert'])
            test_images = _drop_trailing_singleton(test_images)

    if settings['grayscale']:
        images = [np.squeeze(img) if img.ndim == 3 and 1 in img.shape else img for img in images]

    masks = [np.squeeze(mask) if mask.ndim == 3 and 1 in mask.shape else mask for mask in masks]

    print(f'image shape: {images[0].shape}, image type: {images[0].dtype} mask shape: {masks[0].shape}, mask type: {masks[0].dtype}')

    # Checkpoint every tenth of the run, or only at the end for short runs.
    save_every = int(settings['n_epochs']/10)
    if save_every < 10:
        save_every = settings['n_epochs']

    # NOTE(review): train_seg receives settings['model_name'] while the settings
    # file and the message below use the longer derived model_name - confirm
    # which name the saved weights should carry.
    train_cp.train_seg(model.net,
                       train_data=images,
                       train_labels=masks,
                       train_files=image_names,
                       train_labels_files=mask_names,
                       train_probs=None,
                       test_data=test_images,
                       test_labels=test_masks,
                       test_files=test_image_names,
                       test_labels_files=test_mask_names,
                       test_probs=None,
                       load_files=True,
                       batch_size=settings['batch_size'],
                       learning_rate=settings['learning_rate'],
                       n_epochs=settings['n_epochs'],
                       weight_decay=settings['weight_decay'],
                       momentum=0.9,
                       SGD=False,
                       channels=cp_channels,
                       channel_axis=None,
                       normalize=False,
                       compute_flows=False,
                       save_path=model_save_path,
                       save_every=save_every,
                       nimg_per_epoch=None,
                       nimg_test_per_epoch=None,
                       rescale=settings['rescale'],
                       min_train_masks=1,
                       model_name=settings['model_name'])

    print(f"Model saved at: {model_save_path}/{model_name}")
    return None
|
310
|
+
|
311
|
+
def count_phenotypes(settings):
    """
    Count annotated phenotypes per well and save the counts as a CSV file.

    Reads the ``png_list`` table of the measurements database, reports how many
    distinct annotation values exist overall and per (plate, row, column), then
    writes a table with one row per well (index ``plate_row_column``) and one
    ``value_<n>`` column per annotation value to ``phenotype_counts.csv`` in the
    folder that contains the database.

    Parameters:
    settings (dict): must provide
        ``src`` - path to the database, or to the folder that contains
        ``measurements/measurements.db``;
        ``annotation_column`` - name of the column holding the annotation.

    Returns:
    None
    """
    from .io import _read_db

    # Resolve the database path locally so the caller's settings stay untouched.
    db_path = settings['src']
    if not db_path.endswith('/measurements/measurements.db'):
        db_path = os.path.join(db_path, 'measurements/measurements.db')

    annotation_column = settings['annotation_column']
    well_keys = ['plate', 'row', 'column']

    df = _read_db(loc=db_path, tables=['png_list'])

    unique_values_count = df[annotation_column].nunique(dropna=True)
    print(f"Unique values in {annotation_column} (excluding NaN): {unique_values_count}")

    # Number of distinct annotation values seen in each well
    grouped_unique_count = df.groupby(well_keys)[annotation_column].nunique(dropna=True).reset_index(name='unique_count')
    display(grouped_unique_count)

    # Occurrences of every annotation value in each well. This uses the
    # configured annotation column (the hard-coded 'value' column used before
    # only worked when annotation_column == 'value').
    grouped_counts = df.groupby(well_keys + [annotation_column]).size().reset_index(name='count')

    # One row per well, one column per annotation value
    pivot_df = grouped_counts.pivot_table(index=well_keys, columns=annotation_column, values='count', fill_value=0)
    pivot_df.columns = [f"value_{int(col)}" for col in pivot_df.columns]

    # Collapse the (plate, row, column) index into a single label
    pivot_df.index = pivot_df.index.map(lambda x: f"{x[0]}_{x[1]}_{x[2]}")

    output_path = os.path.join(os.path.dirname(db_path), 'phenotype_counts.csv')
    pivot_df.to_csv(output_path)

    return
|
spacr/timelapse.py
CHANGED
@@ -13,8 +13,6 @@ from scipy.optimize import curve_fit
|
|
13
13
|
from scipy.integrate import trapz
|
14
14
|
import matplotlib.pyplot as plt
|
15
15
|
|
16
|
-
from .logger import log_function_call
|
17
|
-
|
18
16
|
def _npz_to_movie(arrays, filenames, save_path, fps=10):
|
19
17
|
"""
|
20
18
|
Convert a list of numpy arrays to a movie file.
|
spacr/toxo.py
ADDED
@@ -0,0 +1,238 @@
|
|
1
|
+
import matplotlib.pyplot as plt
|
2
|
+
import seaborn as sns
|
3
|
+
import numpy as np
|
4
|
+
from adjustText import adjust_text
|
5
|
+
import pandas as pd
|
6
|
+
from scipy.stats import fisher_exact
|
7
|
+
|
8
|
+
def custom_volcano_plot(data_path, metadata_path, metadata_column='tagm_location', string_list=None, point_size=50, figsize=20):
    """
    Draw a volcano plot (coefficient vs -log10 p-value) and save it as 'volcano_plot.pdf'.

    Point shape follows a categorical metadata column, point colour marks genes
    whose number contains one of the given strings, and points with
    p_value <= 0.05 and coefficient >= 0.25 are labelled with their gene number.

    Parameters:
    - data_path: Path to the data CSV file, or an already loaded DataFrame. Needs the
      columns 'feature', 'coefficient' and 'p_value'; '-log10(p_value)' is derived
      from 'p_value' when it is absent.
    - metadata_path: Path to the metadata CSV file, or an already loaded DataFrame
      with a 'gene_nr' column.
    - metadata_column: Column name in the metadata to control point shapes. A falsy
      value disables the metadata merge and the shape encoding.
    - string_list: Strings that turn a point red when found in its 'gene_nr'
      (all other points are blue). Defaults to no highlighting.
    - point_size: Fixed value to control the size of points.
    - figsize: Height of the figure; the width is half of it.

    DataFrames passed in are copied, so the caller's objects are not modified.
    """

    filename = 'volcano_plot.pdf'
    highlight_strings = list(string_list) if string_list else []

    # Load the data (copy when a DataFrame is handed in, since columns are added below)
    data = data_path.copy() if isinstance(data_path, pd.DataFrame) else pd.read_csv(data_path)

    # The variable is the text inside [...] of the feature name, or the whole
    # feature name when there are no brackets. Plain assignment is used because
    # an in-place fillna on a column selection does not reliably write back.
    bracket_content = data['feature'].str.extract(r'\[(.*?)\]', expand=False)
    data['variable'] = bracket_content.fillna(data['feature'])
    data['gene_nr'] = data['variable'].str.split('_', expand=True)[0].astype(str)

    # Load metadata
    metadata = metadata_path.copy() if isinstance(metadata_path, pd.DataFrame) else pd.read_csv(metadata_path)
    metadata['gene_nr'] = metadata['gene_nr'].astype(str)

    if metadata_column:
        # Merge on 'gene_nr', bringing in the column that was actually requested
        merged_data = pd.merge(data, metadata[['gene_nr', metadata_column]], on='gene_nr', how='left')

        # Controls handling: gene numbers 000000 .. 000011 are controls
        controls = [f"{i:06d}" for i in range(12)]
        merged_data.loc[merged_data['gene_nr'].isin(controls), metadata_column] = 'control'
        merged_data.loc[merged_data['gene_nr'].str.startswith('4'), metadata_column] = 'GT1_gene'
        merged_data.loc[merged_data['gene_nr'] == 'Intercept', metadata_column] = 'Intercept'
    else:
        merged_data = data

    # Create a 'highlight_color' column based on the string_list
    merged_data['highlight_color'] = merged_data['gene_nr'].apply(
        lambda gene: 'red' if any(s in gene for s in highlight_strings) else 'blue')

    if '-log10(p_value)' not in merged_data.columns:
        merged_data['-log10(p_value)'] = -np.log10(merged_data['p_value'])

    # Create the volcano plot
    plt.figure(figsize=(figsize / 2, figsize))
    sns.scatterplot(
        data=merged_data,
        x='coefficient',
        y='-log10(p_value)',
        hue='highlight_color',
        style=metadata_column if metadata_column else None,  # Control point shape with metadata_column
        s=point_size,  # Fixed size for all points
        palette={'red': 'red', 'blue': 'blue'}
    )

    # Set the plot title and labels
    plt.title('Custom Volcano Plot of Coefficients')
    plt.xlabel('Coefficient')
    plt.ylabel('-log10(p-value)')

    # Horizontal line at p-value threshold (0.05)
    plt.axhline(y=-np.log10(0.05), color='red', linestyle='--')

    # Annotate points where p_value <= 0.05 and coefficient >= 0.25
    to_label = merged_data[(merged_data['p_value'] <= 0.05) & (merged_data['coefficient'] >= 0.25)]
    texts = [plt.text(coefficient, -np.log10(p_value), gene_nr, fontsize=9)
             for coefficient, p_value, gene_nr in zip(to_label['coefficient'], to_label['p_value'], to_label['gene_nr'])]

    # Adjust text positions to avoid overlap (nothing to adjust without labels)
    if texts:
        adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'))

    # Move the legend outside the plot
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

    # Save the plot; bbox_inches ensures the legend doesn't get cut off
    plt.savefig(filename, format='pdf', bbox_inches='tight')
    print(f'Saved Volcano plot: {filename}')

    # Show the plot
    plt.show()
|
98
|
+
|
99
|
+
def _go_terms_per_gene(column):
    """Return, for every gene, the sorted list of distinct non-empty GO terms in a ';'-separated column."""
    return column.fillna('').str.split(';').apply(
        lambda terms: sorted({term.strip() for term in terms if term.strip()}))


def go_term_enrichment_by_column(significant_df, metadata_path, go_term_columns=('Computed GO Processes', 'Curated GO Components', 'Curated GO Functions', 'Curated GO Processes')):
    """
    Perform GO term enrichment analysis for each GO term column and generate plots.

    Parameters:
    - significant_df: DataFrame containing the significant genes from the screen;
      the gene identifiers are read from its 'n_gene' column (rows without one are dropped).
    - metadata_path: Path to the metadata CSV file containing GO terms. Its 'Gene ID'
      column is split on '_' and the second part is matched against 'n_gene'.
    - go_term_columns: Columns in the metadata corresponding to GO terms.

    For each GO term column, this function will:
    - Split the GO terms by semicolons, ignoring empty entries and counting each
      term at most once per gene.
    - Count the genes carrying each GO term in the hits and in the background.
    - Perform Fisher's exact test for every term found in at least one hit.
    - Plot the enrichment score vs -log10(p-value).

    The enrichment score is (hits with term / total hits) / (genes with term / total genes).
    A combined plot over all columns, with every term labelled, is drawn at the end.

    Returns:
    None
    """

    significant_df = significant_df.dropna(subset=['n_gene'])
    gene_list = significant_df['n_gene'].to_list()

    # Load metadata
    metadata = pd.read_csv(metadata_path)
    metadata['gene_nr'] = metadata['Gene ID'].str.split('_', expand=True)[1]

    # Boolean mask of the metadata rows that are hits
    is_hit = metadata['gene_nr'].isin(gene_list)
    total_genes = len(metadata)
    total_hits = int(is_hit.sum())

    # Results from all columns
    combined_results = []

    for go_term_column in go_term_columns:
        go_terms = []
        enrichment_scores = []
        p_values = []

        # Genes per GO term, in the background and among the hits
        terms_per_gene = _go_terms_per_gene(metadata[go_term_column])
        all_go_term_counts = terms_per_gene.explode().value_counts()
        hit_go_term_counts = terms_per_gene[is_hit].explode().value_counts()

        # Only terms present in at least one hit have a non-zero enrichment
        # score, so those are the only ones that need testing.
        for go_term, hits_with_go_term in hit_go_term_counts.items():
            total_with_go_term = all_go_term_counts[go_term]

            # Rows: hit / not hit; columns: with term / without term
            contingency_table = [[hits_with_go_term, total_hits - hits_with_go_term],
                                 [total_with_go_term - hits_with_go_term,
                                  total_genes - total_hits - (total_with_go_term - hits_with_go_term)]]
            _, p_value = fisher_exact(contingency_table)

            enrichment_score = (hits_with_go_term / total_hits) / (total_with_go_term / total_genes)

            go_terms.append(go_term)
            enrichment_scores.append(enrichment_score)
            p_values.append(p_value)

        # Results for this GO term column, strongest enrichment first
        results_df = pd.DataFrame({
            'GO Term': go_terms,
            'Enrichment Score': enrichment_scores,
            'P-value': p_values,
            'GO Column': go_term_column  # Track the GO term column for final combined plot
        })
        results_df = results_df.sort_values(by='Enrichment Score', ascending=False)
        combined_results.append(results_df)

        if results_df.empty:
            print(f'No enriched GO terms for {go_term_column}')
            continue

        # Scatter plot of Enrichment Score vs -log10(p-value) for this column
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=results_df, x='Enrichment Score', y=-np.log10(results_df['P-value']), hue='GO Term', size='Enrichment Score', sizes=(50, 200))

        plt.title(f'GO Term Enrichment Analysis for {go_term_column}')
        plt.xlabel('Enrichment Score')
        plt.ylabel('-log10(P-value)')

        # Move the legend to the right of the plot
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

        plt.tight_layout()  # Ensure everything fits in the figure area
        plt.show()

        print(f'Results for {go_term_column}')

    # Combine results from all columns into a single DataFrame
    non_empty_results = [result for result in combined_results if not result.empty]
    if not non_empty_results:
        print('No enriched GO terms found; skipping combined plot.')
        return
    combined_df = pd.concat(non_empty_results, ignore_index=True)

    # Plot the combined results with text labels
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=combined_df, x='Enrichment Score', y=-np.log10(combined_df['P-value']),
                    style='GO Column', size='Enrichment Score', sizes=(50, 200))

    plt.title('Combined GO Term Enrichment Analysis')
    plt.xlabel('Enrichment Score')
    plt.ylabel('-log10(P-value)')

    # Annotate the points with labels and connecting lines
    texts = [plt.text(score, -np.log10(p_value), go_term, fontsize=9)
             for score, p_value, go_term in zip(combined_df['Enrichment Score'], combined_df['P-value'], combined_df['GO Term'])]

    # Adjust text to avoid overlap
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'))

    plt.tight_layout()
    plt.show()
|