spacr 0.0.81__py3-none-any.whl → 0.0.82__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +6 -7
- spacr/io.py +108 -1
- spacr/sequencing.py +793 -122
- {spacr-0.0.81.dist-info → spacr-0.0.82.dist-info}/METADATA +1 -1
- {spacr-0.0.81.dist-info → spacr-0.0.82.dist-info}/RECORD +9 -9
- {spacr-0.0.81.dist-info → spacr-0.0.82.dist-info}/LICENSE +0 -0
- {spacr-0.0.81.dist-info → spacr-0.0.82.dist-info}/WHEEL +0 -0
- {spacr-0.0.81.dist-info → spacr-0.0.82.dist-info}/entry_points.txt +0 -0
- {spacr-0.0.81.dist-info → spacr-0.0.82.dist-info}/top_level.txt +0 -0
spacr/core.py
CHANGED
@@ -77,7 +77,7 @@ def analyze_plaques(folder):
 def train_cellpose(settings):

     from .io import _load_normalized_images_and_labels, _load_images_and_labels
-    from .utils import resize_images_and_labels
+    #from .utils import resize_images_and_labels

     img_src = settings['img_src']
     mask_src = os.path.join(img_src, 'masks')
@@ -99,7 +99,6 @@ def train_cellpose(settings):
     Signal_to_noise = settings.setdefault( 'Signal_to_noise', 10)
     verbose = settings.setdefault( 'verbose', False)

-
     channels = settings.setdefault( 'channels', [0,0])
     normalize = settings.setdefault( 'normalize', True)
     percentiles = settings.setdefault( 'percentiles', None)
@@ -119,7 +118,7 @@ def train_cellpose(settings):
     test_img_src = os.path.join(os.path.dirname(img_src), 'test')
     test_mask_src = os.path.join(test_img_src, 'mask')

-    test_images, test_masks, test_image_names, test_mask_names = None,None,None,None
+    test_images, test_masks, test_image_names, test_mask_names = None,None,None,None
     print(settings)

     if from_scratch:
@@ -147,13 +146,13 @@ def train_cellpose(settings):

     image_files = [os.path.join(img_src, f) for f in os.listdir(img_src) if f.endswith('.tif')]
     label_files = [os.path.join(mask_src, f) for f in os.listdir(mask_src) if f.endswith('.tif')]
-    images, masks, image_names, mask_names = _load_normalized_images_and_labels(image_files, label_files, channels, percentiles, circular, invert, verbose, remove_background, background, Signal_to_noise)
+    images, masks, image_names, mask_names = _load_normalized_images_and_labels(image_files, label_files, channels, percentiles, circular, invert, verbose, remove_background, background, Signal_to_noise, target_height, target_width)
     images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]

     if test:
         test_image_files = [os.path.join(test_img_src, f) for f in os.listdir(test_img_src) if f.endswith('.tif')]
         test_label_files = [os.path.join(test_mask_src, f) for f in os.listdir(test_mask_src) if f.endswith('.tif')]
-        test_images, test_masks, test_image_names, test_mask_names = _load_normalized_images_and_labels(test_image_files, test_label_files, channels, percentiles, circular, invert, verbose, remove_background, background, Signal_to_noise)
+        test_images, test_masks, test_image_names, test_mask_names = _load_normalized_images_and_labels(test_image_files, test_label_files, channels, percentiles, circular, invert, verbose, remove_background, background, Signal_to_noise, target_height, target_width)
         test_images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in test_images]

     else:
@@ -164,8 +163,8 @@ def train_cellpose(settings):
         test_images, test_masks, test_image_names, test_mask_names = _load_images_and_labels(img_src=test_img_src, mask_src=test_mask_src, circular=circular, invert=invert)
         test_images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in test_images]

-    if resize:
-
+    #if resize:
+    #    images, masks = resize_images_and_labels(images, masks, target_height, target_width, show_example=True)

     if model_type == 'cyto':
         cp_channels = [0,1]
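The change above moves resizing out of the now-commented `resize_images_and_labels` step and into `_load_normalized_images_and_labels`, which receives `target_height` and `target_width`. The following standalone sketch (not spacr code; array sizes and targets are made up) illustrates the two resize modes that approach relies on: anti-aliased interpolation for intensity images, nearest-neighbour interpolation for label masks so integer label IDs are not blended.

    import numpy as np
    from skimage.transform import resize

    rng = np.random.default_rng(0)
    image = rng.random((300, 400)).astype(np.float32)              # intensity image
    labels = rng.integers(0, 5, size=(300, 400)).astype(np.int32)  # segmentation mask

    target_height, target_width = 512, 512

    # intensity image: interpolate with anti-aliasing, keep original value range and dtype
    image_resized = resize(image, (target_height, target_width),
                           preserve_range=True, anti_aliasing=True).astype(image.dtype)

    # label mask: order=0 (nearest neighbour) and no anti-aliasing, so label IDs stay integral
    labels_resized = resize(labels, (target_height, target_width), order=0,
                            preserve_range=True, anti_aliasing=False).astype(labels.dtype)

    print(image_resized.shape, labels_resized.shape, np.unique(labels_resized))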
spacr/io.py
CHANGED
@@ -87,7 +87,7 @@ def _load_images_and_labels(image_files, label_files, circular=False, invert=Fal
     print(f'image shape: {images[0].shape}, image type: images[0].shape mask shape: {labels[0].shape}, image type: labels[0].shape')
     return images, labels, image_names, label_names

-def
+def _load_normalized_images_and_labels_v1(image_files, label_files, channels=None, percentiles=None, circular=False, invert=False, visualize=False, remove_background=False, background=0, Signal_to_noise=10):

     from .plot import normalize_and_visualize
     from .utils import invert_image, apply_mask
@@ -182,6 +182,113 @@ def _load_normalized_images_and_labels(image_files, label_files, channels=None,
     return normalized_images, labels, image_names, label_names

+def _load_normalized_images_and_labels(image_files, label_files, channels=None, percentiles=None, circular=False, invert=False, visualize=False, remove_background=False, background=0, Signal_to_noise=10, target_height=None, target_width=None):
+
+    from .plot import normalize_and_visualize, plot_resize
+    from .utils import invert_image, apply_mask
+    from skimage.transform import resize as resizescikit
+
+    signal_thresholds = background * Signal_to_noise
+    lower_percentile = 2
+
+    images = []
+    labels = []
+
+    num_channels = 4
+    percentiles_1 = [[] for _ in range(num_channels)]
+    percentiles_99 = [[] for _ in range(num_channels)]
+
+    image_names = [os.path.basename(f) for f in image_files]
+    image_dir = os.path.dirname(image_files[0])
+
+    if label_files is not None:
+        label_names = [os.path.basename(f) for f in label_files]
+        label_dir = os.path.dirname(label_files[0])
+
+    # Load, normalize, and resize images
+    for i, img_file in enumerate(image_files):
+        image = cellpose.io.imread(img_file)
+        if invert:
+            image = invert_image(image)
+        if circular:
+            image = apply_mask(image, output_value=0)
+
+        # If specific channels are specified, select them
+        if channels is not None and image.ndim == 3:
+            image = image[..., channels]
+
+        if remove_background:
+            image[image < background] = 0
+
+        if image.ndim < 3:
+            image = np.expand_dims(image, axis=-1)
+
+        if percentiles is None:
+            for c in range(image.shape[-1]):
+                p1 = np.percentile(image[..., c], lower_percentile)
+                percentiles_1[c].append(p1)
+                for percentile in [98, 99, 99.9, 99.99, 99.999]:
+                    p = np.percentile(image[..., c], percentile)
+                    if p > signal_thresholds:
+                        percentiles_99[c].append(p)
+                        break
+
+        # Resize image
+        if target_height is not None and target_width is not None:
+            if image.ndim == 2:
+                image_shape = (target_height, target_width)
+            elif image.ndim == 3:
+                image_shape = (target_height, target_width, image.shape[-1])
+
+            image = resizescikit(image, image_shape, preserve_range=True, anti_aliasing=True).astype(image.dtype)
+
+        images.append(image)
+
+    if percentiles is None:
+        # Calculate average percentiles for normalization
+        avg_p1 = [np.mean(p) for p in percentiles_1]
+        avg_p99 = [np.mean(p) if len(p) > 0 else np.mean(percentiles_1[i]) for i, p in enumerate(percentiles_99)]
+
+        print(f'Average 1st percentiles: {avg_p1}, Average 99th percentiles: {avg_p99}')
+
+        normalized_images = []
+        for image in images:
+            normalized_image = np.zeros_like(image, dtype=np.float32)
+            for c in range(image.shape[-1]):
+                normalized_image[..., c] = rescale_intensity(image[..., c], in_range=(avg_p1[c], avg_p99[c]), out_range=(0, 1))
+            normalized_images.append(normalized_image)
+            if visualize:
+                normalize_and_visualize(image, normalized_image, title=f"Channel {c+1} Normalized")
+    else:
+        normalized_images = []
+        for image in images:
+            normalized_image = np.zeros_like(image, dtype=np.float32)
+            for c in range(image.shape[-1]):
+                low_p = np.percentile(image[..., c], percentiles[0])
+                high_p = np.percentile(image[..., c], percentiles[1])
+                normalized_image[..., c] = rescale_intensity(image[..., c], in_range=(low_p, high_p), out_range=(0, 1))
+            normalized_images.append(normalized_image)
+            if visualize:
+                normalize_and_visualize(image, normalized_image, title=f"Channel {c+1} Normalized")
+
+    if label_files is not None:
+        for lbl_file in label_files:
+            label = cellpose.io.imread(lbl_file)
+            # Resize label
+            if target_height is not None and target_width is not None:
+                label = resizescikit(label, (target_height, target_width), order=0, preserve_range=True, anti_aliasing=False).astype(label.dtype)
+            labels.append(label)
+    else:
+        label_names = []
+        label_dir = None
+
+    print(f'Loaded and normalized {len(normalized_images)} images and {len(labels)} labels from {image_dir} and {label_dir}')
+
+    if visualize and images and labels:
+        plot_resize(images, normalized_images, labels, labels)
+
+    return normalized_images, labels, image_names, label_names
+
 class CombineLoaders:

     """
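The new loader normalizes each channel into [0, 1] using an automatically chosen percentile window: the lower bound is the 2nd percentile and the upper bound is the first of the 98/99/99.9/99.99/99.999th percentiles that exceeds `background * Signal_to_noise`. The sketch below is a self-contained illustration of that selection on a synthetic channel; the numbers and the fallback used when nothing clears the threshold are illustrative, not spacr's exact behaviour.

    import numpy as np
    from skimage.exposure import rescale_intensity

    rng = np.random.default_rng(1)
    channel = rng.gamma(shape=2.0, scale=50.0, size=(256, 256)).astype(np.float32)

    background = 10
    Signal_to_noise = 10
    signal_threshold = background * Signal_to_noise

    p_low = np.percentile(channel, 2)          # lower bound of the intensity window
    p_high = None
    for percentile in [98, 99, 99.9, 99.99, 99.999]:
        p = np.percentile(channel, percentile)
        if p > signal_threshold:               # first percentile that clears background * SNR
            p_high = p
            break
    if p_high is None:                         # simplified fallback for this sketch
        p_high = np.percentile(channel, 99.999)

    normalized = rescale_intensity(channel, in_range=(p_low, p_high), out_range=(0, 1))
    print(round(float(normalized.min()), 3), round(float(normalized.max()), 3))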
spacr/sequencing.py
CHANGED
@@ -7,10 +7,18 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from Bio import pairwise2
 import statsmodels.api as sm
-
+from statsmodels.regression.mixed_linear_model import MixedLM
+from statsmodels.stats.outliers_influence import variance_inflation_factor
 from scipy.stats import gmean
 from difflib import SequenceMatcher
 from collections import Counter
+from IPython.display import display
+
+from sklearn.linear_model import LinearRegression, Lasso, Ridge
+from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
+
+from scipy.stats import shapiro
+from patsy import dmatrices

 def analyze_reads(settings):
     """
@@ -28,7 +36,7 @@ def analyze_reads(settings):
     None
     """

-    def
+    def save_chunk_to_hdf5_v1(output_file_path, data_chunk, chunk_counter):
         """
         Save a data chunk to an HDF5 file.

@@ -44,6 +52,28 @@ def analyze_reads(settings):
         with pd.HDFStore(output_file_path, mode='a', complevel=5, complib='blosc') as store:
             store.put(f'reads/chunk_{chunk_counter}', df, format='table', append=True)

+    def save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter):
+        """
+        Save a data chunk to an HDF5 file.
+
+        Parameters:
+        - output_file_path (str): The path to the output HDF5 file.
+        - data_chunk (list): The data chunk to be saved.
+        - chunk_counter (int): The counter for the current chunk.
+
+        Returns:
+        None
+        """
+        df = pd.DataFrame(data_chunk, columns=['combined_read', 'grna', 'plate_row', 'column', 'sample'])
+        with pd.HDFStore(output_file_path, mode='a', complevel=5, complib='blosc') as store:
+            store.put(
+                f'reads/chunk_{chunk_counter}',
+                df,
+                format='table',
+                append=True,
+                min_itemsize={'combined_read': 300, 'grna': 50, 'plate_row': 20, 'column': 20, 'sample': 50}
+            )
+
     def reverse_complement(seq):
         """
         Returns the reverse complement of a DNA sequence.
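The rewritten save_chunk_to_hdf5 passes min_itemsize so the HDF5 table reserves fixed string-column widths up front; without it, appending a later chunk whose strings are longer than those in the first chunk raises a ValueError. A minimal self-contained sketch of the same call (requires PyTables; the file name and example row are made up):

    import pandas as pd

    chunk = [('ACGT' * 20, 'TGGT1_220950_1', 'p1_r1', 'c2', 'EO1')]
    df = pd.DataFrame(chunk, columns=['combined_read', 'grna', 'plate_row', 'column', 'sample'])

    with pd.HDFStore('reads_example.h5', mode='a', complevel=5, complib='blosc') as store:
        store.put(
            'reads/chunk_0', df, format='table', append=True,
            # reserve column widths so later, longer strings can still be appended
            min_itemsize={'combined_read': 300, 'grna': 50, 'plate_row': 20, 'column': 20, 'sample': 50},
        )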
@@ -139,7 +169,7 @@ def analyze_reads(settings):
             best_alignment = alignments[0]
             return best_alignment

-    def combine_reads(samples_dict, src, chunk_size,
+    def combine_reads(samples_dict, src, chunk_size, barecode_length_1, barecode_length_2, upstream, downstream):
         """
         Combine reads from paired-end sequencing files and save the combined reads to a new file.

@@ -186,7 +216,7 @@ def analyze_reads(settings):
             r1_size_est = os.path.getsize(r1_path) // (avg_read_length * 4) if r1_path else 0
             r2_size_est = os.path.getsize(r2_path) // (avg_read_length * 4) if r2_path else 0
             max_size = max(r1_size_est, r2_size_est) * 10
-
+            test10 =0
             with tqdm(total=max_size, desc=f"Processing {sample}") as pbar:
                 total_length_processed = 0
                 read_count = 0
@@ -229,12 +259,26 @@ def analyze_reads(settings):
                     combo_split_index_1 = read_combo.find(upstream)
                     combo_split_index_2 = read_combo.find(downstream)

-                    barcode_1 = read_combo[combo_split_index_1 -
+                    barcode_1 = read_combo[combo_split_index_1 - barecode_length_1:combo_split_index_1]
                     grna = read_combo[combo_split_index_1 + len(upstream):combo_split_index_2]
-                    barcode_2 = read_combo[combo_split_index_2 + len(downstream):combo_split_index_2 + len(downstream) +
+                    barcode_2 = read_combo[combo_split_index_2 + len(downstream):combo_split_index_2 + len(downstream) + barecode_length_2]
                     barcode_2 = reverse_complement(barcode_2)
                     data_chunk.append((read_combo, grna, barcode_1, barcode_2, sample))

+                    if settings['test']:
+                        if read_count % 1000 == 0:
+                            print(f"Read count: {read_count}")
+                            print(f"Read 1: {r1_read_rc}")
+                            print(f"Read 2: {r2_read}")
+                            print(f"Read combo: {read_combo}")
+                            print(f"Barcode 1: {barcode_1}")
+                            print(f"gRNA: {grna}")
+                            print(f"Barcode 2: {barcode_2}")
+                            print()
+                            test10 += 1
+                            if test10 == 10:
+                                break
+
                     read_count += 1
                     total_length_processed += len(r1_read) + len(r2_read)

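The completed slicing above anchors on the upstream and downstream sequences: barcode_1 is the barecode_length_1 bases just before upstream, the gRNA lies between the two anchors, and barcode_2 is the barecode_length_2 bases after downstream, reverse-complemented. A self-contained sketch with a made-up read (the anchor sequences and lengths are the defaults set further down in this diff):

    def reverse_complement(seq):
        complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
        return ''.join(complement[base] for base in reversed(seq))

    upstream = 'CTTCTGGTAAATGGGGATGTCAAGTT'
    downstream = 'GTTTAAGAGCTATGCTGGAAACAGCAG'
    barecode_length_1, barecode_length_2 = 8, 7

    # toy combined read: [barcode_1][upstream][gRNA][downstream][barcode_2]
    read_combo = 'AACCGGTT' + upstream + 'GCTAGCTAGCTAGCTAGCTA' + downstream + 'TTTCCCA'

    i1 = read_combo.find(upstream)
    i2 = read_combo.find(downstream)

    barcode_1 = read_combo[i1 - barecode_length_1:i1]
    grna = read_combo[i1 + len(upstream):i2]
    barcode_2 = reverse_complement(read_combo[i2 + len(downstream):i2 + len(downstream) + barecode_length_2])

    print(barcode_1, grna, barcode_2)   # AACCGGTT GCTAGCTAGCTAGCTAGCTA TGGGAAA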
@@ -261,13 +305,15 @@ def analyze_reads(settings):
         qc_df = pd.DataFrame([qc])
         qc_df.to_csv(qc_file_path, index=False)

-    settings.setdefault('upstream', 'CTTCTGGTAAATGGGGATGTCAAGTT')
-    settings.setdefault('downstream', '
-    settings.setdefault('
+    settings.setdefault('upstream', 'CTTCTGGTAAATGGGGATGTCAAGTT')
+    settings.setdefault('downstream', 'GTTTAAGAGCTATGCTGGAAACAGCAG') #This is the reverce compliment of the column primer starting from the end #TGCTGTTTAAGAGCTATGCTGGAAACAGCA
+    settings.setdefault('barecode_length_1', 8)
+    settings.setdefault('barecode_length_2', 7)
     settings.setdefault('chunk_size', 1000000)
+    settings.setdefault('test', False)

     samples_dict = parse_gz_files(settings['src'])
-    combine_reads(samples_dict, settings['src'], settings['chunk_size'], settings['
+    combine_reads(samples_dict, settings['src'], settings['chunk_size'], settings['barecode_length_1'], settings['barecode_length_2'], settings['upstream'], settings['downstream'])

 def map_barcodes(h5_file_path, settings={}):
     """
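Given these defaults, a caller only has to supply the input folder; the remaining keys are optional overrides of the values shown above. A hedged usage sketch (the path is a placeholder, not a real location):

    from spacr.sequencing import analyze_reads

    settings = {
        'src': '/path/to/folder_with_fastq_gz',   # placeholder input folder
        # optional overrides; defaults are set inside analyze_reads as shown above
        'upstream': 'CTTCTGGTAAATGGGGATGTCAAGTT',
        'downstream': 'GTTTAAGAGCTATGCTGGAAACAGCAG',
        'barecode_length_1': 8,
        'barecode_length_2': 7,
        'chunk_size': 1000000,
        'test': False,
    }
    analyze_reads(settings)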
@@ -280,27 +326,20 @@ def map_barcodes(h5_file_path, settings={}):
     Returns:
     None
     """
-    def get_read_qc(df,
+    def get_read_qc(df, settings):
         """
         Calculate quality control metrics for sequencing reads.

         Parameters:
         - df: DataFrame containing the sequencing reads.
-        - df_cleaned: DataFrame containing the cleaned sequencing reads.

         Returns:
-        -
-
-        - 'cleaned_reads': Total number of cleaned reads.
-        - 'NaN_grna': Number of reads with missing 'grna_metadata'.
-        - 'NaN_plate_row': Number of reads with missing 'plate_row_metadata'.
-        - 'NaN_column': Number of reads with missing 'column_metadata'.
-        - 'NaN_plate': Number of reads with missing 'plate_metadata'.
-        - 'unique_grna': Counter object containing the count of unique 'grna_metadata' values.
-        - 'unique_plate_row': Counter object containing the count of unique 'plate_row_metadata' values.
-        - 'unique_column': Counter object containing the count of unique 'column_metadata' values.
-        - 'unique_plate': Counter object containing the count of unique 'plate_metadata' values.
+        - df_cleaned: DataFrame containing the cleaned sequencing reads.
+        - qc_dict: Dictionary containing the quality control metrics.
         """
+
+        df_cleaned = df.dropna()
+
         qc_dict = {}
         qc_dict['reads'] = len(df)
         qc_dict['cleaned_reads'] = len(df_cleaned)
@@ -312,9 +351,56 @@ def map_barcodes(h5_file_path, settings={}):
         qc_dict['unique_plate_row'] = Counter(df['plate_row_metadata'].dropna().tolist())
         qc_dict['unique_column'] = Counter(df['column_metadata'].dropna().tolist())
         qc_dict['unique_plate'] = Counter(df['plate_metadata'].dropna().tolist())
+
+        # Calculate control error rates using cleaned DataFrame
+        total_pc_non_nan = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc'])].shape[0]
+        total_nc_non_nan = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc'])].shape[0]

-
-
+        pc_count_pc = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] == settings['pc'])].shape[0]
+        nc_count_nc = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] == settings['nc'])].shape[0]
+
+        pc_error_count = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] != settings['pc'])].shape[0]
+        nc_error_count = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] != settings['nc'])].shape[0]
+
+        pc_in_nc_loc_count = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] == settings['pc'])].shape[0]
+        nc_in_pc_loc_count = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] == settings['nc'])].shape[0]
+
+        # Collect QC metrics into a dictionary
+        # PC
+        qc_dict['pc_total_count'] = total_pc_non_nan
+        qc_dict['pc_count_pc'] = pc_count_pc
+        qc_dict['nc_count_pc'] = pc_in_nc_loc_count
+        qc_dict['pc_error_count'] = pc_error_count
+        # NC
+        qc_dict['nc_total_count'] = total_nc_non_nan
+        qc_dict['nc_count_nc'] = nc_count_nc
+        qc_dict['pc_count_nc'] = nc_in_pc_loc_count
+        qc_dict['nc_error_count'] = nc_error_count
+
+        return df_cleaned, qc_dict
+
+    def get_per_row_qc(df, settings):
+        """
+        Calculate quality control metrics for each unique row in the control columns.
+
+        Parameters:
+        - df: DataFrame containing the sequencing reads.
+        - settings: Dictionary containing the settings for control values.
+
+        Returns:
+        - dict: Dictionary containing the quality control metrics for each unique row.
+        """
+        qc_dict_per_row = {}
+        unique_rows = df['plate_row_metadata'].dropna().unique().tolist()
+        unique_rows = list(set(unique_rows)) # Remove duplicates
+
+        for row in unique_rows:
+            df_row = df[(df['plate_row_metadata'] == row)]
+            _, qc_dict_row = get_read_qc(df_row, settings)
+            qc_dict_per_row[row] = qc_dict_row
+
+        return qc_dict_per_row
+
     def mapping_dicts(df, settings):
         """
         Maps the values in the DataFrame columns to corresponding metadata using dictionaries.
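get_read_qc now also scores the control wells: within the positive-control column it counts cleaned reads that carry the expected positive-control gRNA versus any other gRNA (and the mirror-image counts for the negative control), which later feed the pc_fraction_pc / nc_fraction_nc ratios. A self-contained toy version of that counting (control names and locations are the defaults from this diff; the reads are invented):

    import pandas as pd

    df_cleaned = pd.DataFrame({
        'column_metadata': ['c2', 'c2', 'c2', 'c1', 'c1'],
        'grna_metadata':   ['TGGT1_220950_1', 'TGGT1_220950_1', 'TGGT1_233460_4',
                            'TGGT1_233460_4', 'TGGT1_220950_1'],
    })
    settings = {'pc': 'TGGT1_220950_1', 'pc_loc': 'c2', 'nc': 'TGGT1_233460_4', 'nc_loc': 'c1'}

    pc_total = (df_cleaned['column_metadata'] == settings['pc_loc']).sum()
    pc_count_pc = ((df_cleaned['column_metadata'] == settings['pc_loc']) &
                   (df_cleaned['grna_metadata'] == settings['pc'])).sum()

    print(pc_count_pc / pc_total if pc_total else 0)   # 2 of 3 pc-column reads carry the pc gRNA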
@@ -339,22 +425,94 @@ def map_barcodes(h5_file_path, settings={}):
         df['plate_row_metadata'] = df['plate_row'].map(plate_row_dict)
         df['column_metadata'] = df['column'].map(column_dict)
         df['plate_metadata'] = df['sample'].map(plate_dict)
-
+
         return df

+    def filter_combinations(df, settings):
+        """
+        Takes the combination counts Data Frame, filters the rows based on specific conditions,
+        and removes rows with a count lower than the highest value of max_count_c1 and max_count_c2.
+
+        Args:
+            combination_counts_file_path (str): The file path to the CSV file containing the combination counts.
+            pc (str, optional): The positive control sequence. Defaults to 'TGGT1_220950_1'.
+            nc (str, optional): The negative control sequence. Defaults to 'TGGT1_233460_4'.
+
+        Returns:
+            pd.DataFrame: The filtered DataFrame.
+        """
+
+        pc = settings['pc']
+        nc = settings['nc']
+        pc_loc = settings['pc_loc']
+        nc_loc = settings['nc_loc']
+
+        filtered_c1 = df[(df['column'] == nc_loc) & (df['grna'] != nc)]
+        max_count_c1 = filtered_c1['count'].max()
+
+        filtered_c2 = df[(df['column'] == pc_loc) & (df['grna'] != pc)]
+        max_count_c2 = filtered_c2['count'].max()
+
+        #filtered_c3 = df[(df['column'] != nc_loc) & (df['grna'] == nc)]
+        #max_count_c3 = filtered_c3['count'].max()
+
+        #filtered_c4 = df[(df['column'] != pc_loc) & (df['grna'] == pc)]
+        #max_count_c4 = filtered_c4['count'].max()
+
+        # Find the highest value between max_count_c1 and max_count_c2
+        highest_max_count = max(max_count_c1, max_count_c2)
+
+        # Filter the DataFrame to remove rows with a count lower than the highest_max_count
+        filtered_df = df[df['count'] >= highest_max_count]
+
+        # Calculate total read counts for each unique combination of plate_row and column
+        filtered_df['total_reads'] = filtered_df.groupby(['plate_row', 'column'])['count'].transform('sum')
+
+        # Calculate read fraction for each row
+        filtered_df['read_fraction'] = filtered_df['count'] / filtered_df['total_reads']
+
+        if settings['verbose']:
+            print(f"Max count for non {nc} in {nc_loc}: {max_count_c1}")
+            print(f"Max count for non {pc} in {pc_loc}: {max_count_c2}")
+            #print(f"Max count for {nc} in other columns: {max_count_c3}")
+
+        return filtered_df
+
     settings.setdefault('grna', '/home/carruthers/Documents/grna_barcodes.csv')
     settings.setdefault('barcodes', '/home/carruthers/Documents/SCREEN_BARCODES.csv')
     settings.setdefault('plate_dict', {'EO1': 'plate1', 'EO2': 'plate2', 'EO3': 'plate3', 'EO4': 'plate4', 'EO5': 'plate5', 'EO6': 'plate6', 'EO7': 'plate7', 'EO8': 'plate8'})
     settings.setdefault('test', False)
     settings.setdefault('verbose', True)
-
-
-
-
-
-
-
-
+
+    settings.setdefault('pc', 'TGGT1_220950_1')
+    settings.setdefault('pc_loc', 'c2')
+    settings.setdefault('nc', 'TGGT1_233460_4')
+    settings.setdefault('nc_loc', 'c1')
+
+    fldr = os.path.splitext(h5_file_path)[0]
+    file_name = os.path.basename(fldr)
+
+    if settings['test']:
+        fldr = os.path.join(fldr, 'test')
+        os.makedirs(fldr, exist_ok=True)
+
+    qc_file_path = os.path.join(fldr, f'{file_name}_qc_step_2.csv')
+    unique_grna_file_path = os.path.join(fldr, f'{file_name}_unique_grna.csv')
+    unique_plate_row_file_path = os.path.join(fldr, f'{file_name}_unique_plate_row.csv')
+    unique_column_file_path = os.path.join(fldr, f'{file_name}_unique_column.csv')
+    unique_plate_file_path = os.path.join(fldr, f'{file_name}_unique_plate.csv')
+    new_h5_file_path = os.path.join(fldr, f'{file_name}_cleaned.h5')
+    combination_counts_file_path = os.path.join(fldr, f'{file_name}_combination_counts.csv')
+    combination_counts_file_path_cleaned = os.path.join(fldr, f'{file_name}_combination_counts_cleaned.csv')
+
+    #qc_file_path = os.path.splitext(h5_file_path)[0] + '_qc_step_2.csv'
+    #unique_grna_file_path = os.path.splitext(h5_file_path)[0] + '_unique_grna.csv'
+    #unique_plate_row_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate_row.csv'
+    #unique_column_file_path = os.path.splitext(h5_file_path)[0] + '_unique_column.csv'
+    #unique_plate_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate.csv'
+    #new_h5_file_path = os.path.splitext(h5_file_path)[0] + '_cleaned.h5'
+    #combination_counts_file_path = os.path.splitext(h5_file_path)[0] + '_combination_counts.csv'
+    #combination_counts_file_path_cleaned = os.path.splitext(h5_file_path)[0] + '_combination_counts_cleaned.csv'

     # Initialize the HDF5 store for cleaned data
     store_cleaned = pd.HDFStore(new_h5_file_path, mode='a', complevel=5, complib='blosc')
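filter_combinations uses the largest count observed for a "wrong" gRNA in either control column as a noise floor, drops every plate_row/column/grna combination below it, and then adds per-well totals and read fractions. A self-contained toy version (the data are invented, and a small NaN guard is added here for the case where one control column has no mismatches):

    import pandas as pd

    df = pd.DataFrame({
        'plate_row': ['p1_r1', 'p1_r1', 'p1_r1', 'p1_r1', 'p1_r2'],
        'column':    ['c1',    'c1',    'c2',    'c3',    'c3'],
        'grna':      ['TGGT1_233460_4', 'geneA_1', 'TGGT1_220950_1', 'geneA_1', 'geneB_2'],
        'count':     [5000,    10,      4000,     300,     8],
    })
    pc, nc, pc_loc, nc_loc = 'TGGT1_220950_1', 'TGGT1_233460_4', 'c2', 'c1'

    max_c1 = df[(df['column'] == nc_loc) & (df['grna'] != nc)]['count'].max()   # wrong gRNAs in nc column
    max_c2 = df[(df['column'] == pc_loc) & (df['grna'] != pc)]['count'].max()   # wrong gRNAs in pc column
    noise_floor = max(0 if pd.isna(max_c1) else max_c1, 0 if pd.isna(max_c2) else max_c2)

    filtered = df[df['count'] >= noise_floor].copy()                    # drops the count-8 row
    filtered['total_reads'] = filtered.groupby(['plate_row', 'column'])['count'].transform('sum')
    filtered['read_fraction'] = filtered['count'] / filtered['total_reads']
    print(filtered)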
@@ -370,38 +528,89 @@ def map_barcodes(h5_file_path, settings={}):
         'unique_grna': Counter(),
         'unique_plate_row': Counter(),
         'unique_column': Counter(),
-        'unique_plate': Counter()
+        'unique_plate': Counter(),
+        'pc_total_count': 0,
+        'pc_count_pc': 0,
+        'nc_total_count': 0,
+        'nc_count_nc': 0,
+        'pc_count_nc': 0,
+        'nc_count_pc': 0,
+        'pc_error_count': 0,
+        'nc_error_count': 0,
+        'pc_fraction_pc': 0,
+        'nc_fraction_nc': 0,
+        'pc_fraction_nc': 0,
+        'nc_fraction_pc': 0
     }

+    per_row_qc = {}
+    combination_counts = Counter()
+
     with pd.HDFStore(h5_file_path, mode='r') as store:
         keys = [key for key in store.keys() if key.startswith('/reads/chunk_')]
-
+
+        if settings['test']:
+            keys = keys[:3] # Only read the first chunks if in test mode
+
         for key in keys:
             df = store.get(key)
             df = mapping_dicts(df, settings)
-            df_cleaned = df
-
-
-
-            overall_qc['reads'] += qc_dict['reads']
-            overall_qc['cleaned_reads'] += qc_dict['cleaned_reads']
-            overall_qc['NaN_grna'] += qc_dict['NaN_grna']
-            overall_qc['NaN_plate_row'] += qc_dict['NaN_plate_row']
-            overall_qc['NaN_column'] += qc_dict['NaN_column']
-            overall_qc['NaN_plate'] += qc_dict['NaN_plate']
-            overall_qc['unique_grna'].update(qc_dict['unique_grna'])
-            overall_qc['unique_plate_row'].update(qc_dict['unique_plate_row'])
-            overall_qc['unique_column'].update(qc_dict['unique_column'])
-            overall_qc['unique_plate'].update(qc_dict['unique_plate'])
+            df_cleaned, qc_dict = get_read_qc(df, settings)
+
+            # Accumulate counts for unique combinations
+            combinations = df_cleaned[['plate_row_metadata', 'column_metadata', 'grna_metadata']].apply(tuple, axis=1)

-
-
+            combination_counts.update(combinations)
+
+            if settings['test'] and settings['verbose']:
+                os.makedirs(os.path.join(os.path.splitext(h5_file_path)[0],'test'), exist_ok=True)
+                df.to_csv(os.path.join(os.path.splitext(h5_file_path)[0],'test','chunk_1_df.csv'), index=False)
+                df_cleaned.to_csv(os.path.join(os.path.splitext(h5_file_path)[0],'test','chunk_1_df_cleaned.csv'), index=False)
+
+            # Accumulate QC metrics for all rows
+            for metric in qc_dict:
+                if isinstance(overall_qc[metric], Counter):
+                    overall_qc[metric].update(qc_dict[metric])
+                else:
+                    overall_qc[metric] += qc_dict[metric]
+
+            # Update per_row_qc dictionary
+            chunk_per_row_qc = get_per_row_qc(df, settings)
+            for row in chunk_per_row_qc:
+                if row not in per_row_qc:
+                    per_row_qc[row] = chunk_per_row_qc[row]
+                else:
+                    for metric in chunk_per_row_qc[row]:
+                        if isinstance(per_row_qc[row][metric], Counter):
+                            per_row_qc[row][metric].update(chunk_per_row_qc[row][metric])
+                        else:
+                            per_row_qc[row][metric] += chunk_per_row_qc[row][metric]
+
+            # Ensure the DataFrame columns are in the desired order
+            df_cleaned = df_cleaned[['grna', 'plate_row', 'column', 'sample', 'grna_metadata', 'plate_row_metadata', 'column_metadata', 'plate_metadata']]
+
             # Save cleaned data to the new HDF5 store
             store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
-
+
             del df_cleaned, df
             gc.collect()

+    # Calculate overall fractions after accumulating all metrics
+    overall_qc['pc_fraction_pc'] = overall_qc['pc_count_pc'] / overall_qc['pc_total_count'] if overall_qc['pc_total_count'] else 0
+    overall_qc['nc_fraction_nc'] = overall_qc['nc_count_nc'] / overall_qc['nc_total_count'] if overall_qc['nc_total_count'] else 0
+    overall_qc['pc_fraction_nc'] = overall_qc['pc_count_nc'] / overall_qc['nc_total_count'] if overall_qc['nc_total_count'] else 0
+    overall_qc['nc_fraction_pc'] = overall_qc['nc_count_pc'] / overall_qc['pc_total_count'] if overall_qc['pc_total_count'] else 0
+
+    for row in per_row_qc:
+        if row != 'all_rows':
+            per_row_qc[row]['pc_fraction_pc'] = per_row_qc[row]['pc_count_pc'] / per_row_qc[row]['pc_total_count'] if per_row_qc[row]['pc_total_count'] else 0
+            per_row_qc[row]['nc_fraction_nc'] = per_row_qc[row]['nc_count_nc'] / per_row_qc[row]['nc_total_count'] if per_row_qc[row]['nc_total_count'] else 0
+            per_row_qc[row]['pc_fraction_nc'] = per_row_qc[row]['pc_count_nc'] / per_row_qc[row]['nc_total_count'] if per_row_qc[row]['nc_total_count'] else 0
+            per_row_qc[row]['nc_fraction_pc'] = per_row_qc[row]['nc_count_pc'] / per_row_qc[row]['pc_total_count'] if per_row_qc[row]['pc_total_count'] else 0
+
+    # Add overall_qc to per_row_qc with the key 'all_rows'
+    per_row_qc['all_rows'] = overall_qc
+
     # Convert the Counter objects to DataFrames and save them to CSV files
     unique_grna_df = pd.DataFrame(overall_qc['unique_grna'].items(), columns=['key', 'value'])
     unique_plate_row_df = pd.DataFrame(overall_qc['unique_plate_row'].items(), columns=['key', 'value'])
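Because each per-chunk QC dict mixes Counter objects with plain integers, the accumulation loop above branches on the value type; the same pattern in isolation, as a small self-contained sketch:

    from collections import Counter

    overall = {'reads': 0, 'unique_grna': Counter()}
    chunk_qc = {'reads': 1000, 'unique_grna': Counter({'TGGT1_220950_1': 40, 'geneA_1': 12})}

    for metric, value in chunk_qc.items():
        if isinstance(overall[metric], Counter):
            overall[metric].update(value)   # merge counts
        else:
            overall[metric] += value        # sum scalars

    print(overall)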
@@ -422,89 +631,128 @@ def map_barcodes(h5_file_path, settings={}):
     # Combine all remaining QC metrics into a single DataFrame and save it to CSV
     qc_df = pd.DataFrame([overall_qc])
     qc_df.to_csv(qc_file_path, index=False)
+
+    # Convert per_row_qc to a DataFrame and save it to CSV
+    per_row_qc_df = pd.DataFrame.from_dict(per_row_qc, orient='index')
+    per_row_qc_df = per_row_qc_df.sort_values(by='reads', ascending=False)
+    per_row_qc_df = per_row_qc_df.drop(['unique_grna', 'unique_plate_row', 'unique_column', 'unique_plate'], axis=1, errors='ignore')
+    per_row_qc_df = per_row_qc_df.dropna(subset=['reads'])
+    per_row_qc_df.to_csv(os.path.splitext(h5_file_path)[0] + '_per_row_qc.csv', index=True)
+
+    if settings['verbose']:
+        display(per_row_qc_df)
+
+    # Save the combination counts to a CSV file
+    try:
+        combination_counts_df = pd.DataFrame(combination_counts.items(), columns=['combination', 'count'])
+        combination_counts_df[['plate_row', 'column', 'grna']] = pd.DataFrame(combination_counts_df['combination'].tolist(), index=combination_counts_df.index)
+        combination_counts_df = combination_counts_df.drop('combination', axis=1)
+        combination_counts_df.to_csv(combination_counts_file_path, index=False)
+
+        grna_plate_heatmap(combination_counts_file_path, specific_grna=None)
+        grna_plate_heatmap(combination_counts_file_path, specific_grna=settings['pc'])
+        grna_plate_heatmap(combination_counts_file_path, specific_grna=settings['nc'])
+
+        combination_counts_df_cleaned = filter_combinations(combination_counts_df, settings)
+        combination_counts_df_cleaned.to_csv(combination_counts_file_path_cleaned, index=False)
+
+        grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=None)
+        grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=settings['pc'])
+        grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=settings['nc'])
+    except Exception as e:
+        print(e)

     # Close the HDF5 store
     store_cleaned.close()
-
     gc.collect()
     return

-def
+def grna_plate_heatmap(path, specific_grna=None, min_max='all', cmap='viridis', min_count=0, save=True):
+    """
+    Generate a heatmap of gRNA plate data.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Args:
+        path (str): The path to the CSV file containing the gRNA plate data.
+        specific_grna (str, optional): The specific gRNA to filter the data for. Defaults to None.
+        min_max (str or list or tuple, optional): The range of values to use for the color scale.
+            If 'all', the range will be determined by the minimum and maximum values in the data.
+            If 'allq', the range will be determined by the 2nd and 98th percentiles of the data.
+            If a list or tuple of two values, the range will be determined by those values.
+            Defaults to 'all'.
+        cmap (str, optional): The colormap to use for the heatmap. Defaults to 'viridis'.
+        min_count (int, optional): The minimum count threshold for including a gRNA in the heatmap.
+            Defaults to 0.
+        save (bool, optional): Whether to save the heatmap as a PDF file. Defaults to True.
+
+    Returns:
+        matplotlib.figure.Figure: The generated heatmap figure.
+    """
+    def generate_grna_plate_heatmap(df, plate_number, min_max, min_count, specific_grna=None):
+        df = df.copy() # Work on a copy to avoid SettingWithCopyWarning

-
-
-
-
-        barcode_df = pd.read_csv(settings['barcodes'])
+        # Filtering the dataframe based on the plate_number and specific gRNA if provided
+        df = df[df['plate_row'].str.startswith(plate_number)]
+        if specific_grna:
+            df = df[df['grna'] == specific_grna]

-
-
-        column_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('c')}
-        plate_dict = settings['plate_dict']
+        # Split plate_row into plate and row
+        df[['plate', 'row']] = df['plate_row'].str.split('_', expand=True)

-
-
-
-        df['column_metadata'] = df['column'].map(column_dict)
-        df['plate_metadata'] = df['sample'].map(plate_dict)
-
-        return df
-
-    settings.setdefault('grna', '/home/carruthers/Documents/grna_barcodes.csv')
-    settings.setdefault('barcodes', '/home/carruthers/Documents/SCREEN_BARCODES.csv')
-    settings.setdefault('plate_dict', {'EO1': 'plate1', 'EO2': 'plate2', 'EO3': 'plate3', 'EO4': 'plate4', 'EO5': 'plate5', 'EO6': 'plate6', 'EO7': 'plate7', 'EO8': 'plate8'})
-    settings.setdefault('test', False)
-    settings.setdefault('verbose', True)
-    settings.setdefault('min_itemsize', 1000)
+        # Ensure proper ordering
+        row_order = [f'r{i}' for i in range(1, 17)]
+        col_order = [f'c{i}' for i in range(1, 28)]

-
-
-
-
-
+        df['row'] = pd.Categorical(df['row'], categories=row_order, ordered=True)
+        df['column'] = pd.Categorical(df['column'], categories=col_order, ordered=True)
+
+        # Group by row and column, summing counts
+        grouped = df.groupby(['row', 'column'], observed=True)['count'].sum().reset_index()
+
+        plate_map = pd.pivot_table(grouped, values='count', index='row', columns='column').fillna(0)
+
+        if min_max == 'all':
+            min_max = [plate_map.min().min(), plate_map.max().max()]
+        elif min_max == 'allq':
+            min_max = np.quantile(plate_map.values, [0.02, 0.98])
+        elif isinstance(min_max, (list, tuple)) and len(min_max) == 2:
+            if isinstance(min_max[0], (float)) and isinstance(min_max[1], (float)):
+                min_max = np.quantile(plate_map.values, [min_max[0], min_max[1]])
+            if isinstance(min_max[0], (int)) and isinstance(min_max[1], (int)):
+                min_max = [min_max[0], min_max[1]]
+
+        return plate_map, min_max

-
-
+    if isinstance(path, pd.DataFrame):
+        df = path
+    else:
+        df = pd.read_csv(path)

-
-
-
-
-        df = store.get(key)
-        df = mapping_dicts(df, settings)
-        df_cleaned = df.dropna()
-        qc_dict = get_read_qc(df, df_cleaned)
-        qc_df_list.append(qc_dict)
-        df_cleaned = df_cleaned[df_cleaned['grna_length'] >= 30]
-
-        # Save cleaned data to the new HDF5 store
-        store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
+    plates = df['plate_row'].str.split('_', expand=True)[0].unique()
+    n_rows, n_cols = (len(plates) + 3) // 4, 4
+    fig, ax = plt.subplots(n_rows, n_cols, figsize=(40, 5 * n_rows))
+    ax = ax.flatten()

-
-
-
+    for index, plate in enumerate(plates):
+        plate_map, min_max_values = generate_grna_plate_heatmap(df, plate, min_max, min_count, specific_grna)
+        sns.heatmap(plate_map, cmap=cmap, vmin=min_max_values[0], vmax=min_max_values[1], ax=ax[index])
+        ax[index].set_title(plate)
+
+    for i in range(len(plates), n_rows * n_cols):
+        fig.delaxes(ax[i])

-
-
-
+    plt.subplots_adjust(wspace=0.1, hspace=0.4)
+
+    # Save the figure
+    if save:
+        filename = path.replace('.csv', '')
+        if specific_grna:
+            filename += f'_{specific_grna}'
+        filename += '.pdf'
+        plt.savefig(filename)
+        print(f'saved {filename}')
+    plt.show()
+
+    return fig

 def map_barcodes_folder(src, settings={}):
     for file in os.listdir(src):
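grna_plate_heatmap reshapes the per-combination counts into a row-by-column plate map (rows r1-r16, columns c1-c27) before drawing one heatmap per plate. The core reshaping step, as a self-contained sketch with made-up counts:

    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    df = pd.DataFrame({
        'plate_row': ['p1_r1', 'p1_r1', 'p1_r2', 'p1_r16'],
        'column':    ['c1',    'c2',    'c2',    'c27'],
        'count':     [120,     300,     45,      9],
    })

    df[['plate', 'row']] = df['plate_row'].str.split('_', expand=True)
    row_order = [f'r{i}' for i in range(1, 17)]
    col_order = [f'c{i}' for i in range(1, 28)]
    df['row'] = pd.Categorical(df['row'], categories=row_order, ordered=True)
    df['column'] = pd.Categorical(df['column'], categories=col_order, ordered=True)

    grouped = df.groupby(['row', 'column'], observed=True)['count'].sum().reset_index()
    plate_map = pd.pivot_table(grouped, values='count', index='row', columns='column').fillna(0)

    sns.heatmap(plate_map, cmap='viridis')
    plt.show()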
@@ -1144,4 +1392,427 @@ def generate_fraction_map(df, gene_column, min_=10, plates=['p1','p2','p3','p4']
|
|
1144
1392
|
independent_variables = independent_variables.drop('sum', axis=1)
|
1145
1393
|
independent_variables.index.name = 'prc'
|
1146
1394
|
independent_variables = independent_variables.loc[:, (independent_variables.sum() != 0)]
|
1147
|
-
return independent_variables
|
1395
|
+
return independent_variables
|
1396
|
+
|
1397
|
+
|
1398
|
+
def plot_histogram(df, dependent_variable):
|
1399
|
+
# Plot histogram of the dependent variable
|
1400
|
+
plt.figure(figsize=(10, 6))
|
1401
|
+
sns.histplot(df[dependent_variable], kde=True)
|
1402
|
+
plt.title(f'Histogram of {dependent_variable}')
|
1403
|
+
plt.xlabel(dependent_variable)
|
1404
|
+
plt.ylabel('Frequency')
|
1405
|
+
plt.show()
|
1406
|
+
|
1407
|
+
def precess_reads(csv_path, fraction_threshold, plate):
|
1408
|
+
# Read the CSV file into a DataFrame
|
1409
|
+
csv_df = pd.read_csv(csv_path)
|
1410
|
+
|
1411
|
+
# Ensure the necessary columns are present
|
1412
|
+
if not all(col in csv_df.columns for col in ['grna', 'count', 'column']):
|
1413
|
+
raise ValueError("The CSV file must contain 'grna', 'count', 'plate_row', and 'column' columns.")
|
1414
|
+
|
1415
|
+
if 'plate_row' in csv_df.columns:
|
1416
|
+
csv_df[['plate', 'row']] = csv_df['plate_row'].str.split('_', expand=True)
|
1417
|
+
if plate is not None:
|
1418
|
+
csv_df = csv_df.drop(columns=['plate'])
|
1419
|
+
csv_df['plate'] = plate
|
1420
|
+
|
1421
|
+
if plate is not None:
|
1422
|
+
csv_df['plate'] = plate
|
1423
|
+
|
1424
|
+
# Create the prc column
|
1425
|
+
csv_df['prc'] = csv_df['plate'] + '_' + csv_df['row'] + '_' + csv_df['column']
|
1426
|
+
|
1427
|
+
# Group by prc and calculate the sum of counts
|
1428
|
+
grouped_df = csv_df.groupby('prc')['count'].sum().reset_index()
|
1429
|
+
grouped_df = grouped_df.rename(columns={'count': 'total_counts'})
|
1430
|
+
merged_df = pd.merge(csv_df, grouped_df, on='prc')
|
1431
|
+
merged_df['fraction'] = merged_df['count'] / merged_df['total_counts']
|
1432
|
+
|
1433
|
+
# Filter rows with fraction under the threshold
|
1434
|
+
if fraction_threshold is not None:
|
1435
|
+
observations_before = len(merged_df)
|
1436
|
+
merged_df = merged_df[merged_df['fraction'] >= fraction_threshold]
|
1437
|
+
observations_after = len(merged_df)
|
1438
|
+
removed = observations_before - observations_after
|
1439
|
+
print(f'Removed {removed} observation below fraction threshold: {fraction_threshold}')
|
1440
|
+
|
1441
|
+
merged_df = merged_df[['prc', 'grna', 'fraction']]
|
1442
|
+
|
1443
|
+
if not all(col in merged_df.columns for col in ['grna', 'gene']):
|
1444
|
+
try:
|
1445
|
+
merged_df[['org', 'gene', 'grna']] = merged_df['grna'].str.split('_', expand=True)
|
1446
|
+
merged_df = merged_df.drop(columns=['org'])
|
1447
|
+
merged_df['grna'] = merged_df['gene'] + '_' + merged_df['grna']
|
1448
|
+
except:
|
1449
|
+
print('Error splitting grna into org, gene, grna.')
|
1450
|
+
|
1451
|
+
return merged_df
|
1452
|
+
|
1453
|
+
def apply_transformation(X, transform):
|
1454
|
+
if transform == 'log':
|
1455
|
+
transformer = FunctionTransformer(np.log1p, validate=True)
|
1456
|
+
elif transform == 'sqrt':
|
1457
|
+
transformer = FunctionTransformer(np.sqrt, validate=True)
|
1458
|
+
elif transform == 'square':
|
1459
|
+
transformer = FunctionTransformer(np.square, validate=True)
|
1460
|
+
else:
|
1461
|
+
transformer = None
|
1462
|
+
return transformer
|
1463
|
+
|
1464
|
+
def check_normality(data, variable_name, verbose=False):
|
1465
|
+
"""Check if the data is normally distributed using the Shapiro-Wilk test."""
|
1466
|
+
stat, p_value = shapiro(data)
|
1467
|
+
if verbose:
|
1468
|
+
print(f"Shapiro-Wilk Test for {variable_name}:\nStatistic: {stat}, P-value: {p_value}")
|
1469
|
+
if p_value > 0.05:
|
1470
|
+
if verbose:
|
1471
|
+
print(f"The data for {variable_name} is normally distributed.")
|
1472
|
+
return True
|
1473
|
+
else:
|
1474
|
+
if verbose:
|
1475
|
+
print(f"The data for {variable_name} is not normally distributed.")
|
1476
|
+
return False
|
1477
|
+
|
1478
|
+
def process_scores(df, dependent_variable, plate, min_cell_count=25, agg_type='mean', transform=None):
|
1479
|
+
|
1480
|
+
if plate is not None:
|
1481
|
+
df['plate'] = plate
|
1482
|
+
|
1483
|
+
df['prc'] = df['plate'] + '_' + df['row'] + '_' + df['col']
|
1484
|
+
df = df[['prc', dependent_variable]]
|
1485
|
+
|
1486
|
+
# Group by prc and calculate the mean and count of the dependent_variable
|
1487
|
+
grouped = df.groupby('prc')[dependent_variable]
|
1488
|
+
|
1489
|
+
print(f'Using agg_type: {agg_type}')
|
1490
|
+
if agg_type == 'median':
|
1491
|
+
dependent_df = grouped.median().reset_index()
|
1492
|
+
elif agg_type == 'mean':
|
1493
|
+
dependent_df = grouped.mean().reset_index()
|
1494
|
+
elif agg_type == 'quantile':
|
1495
|
+
dependent_df = grouped.quantile(0.75).reset_index()
|
1496
|
+
elif agg_type == None:
|
1497
|
+
dependent_df = df.reset_index()
|
1498
|
+
if 'prcfo' in dependent_df.columns:
|
1499
|
+
dependent_df = dependent_df.drop(columns=['prcfo'])
|
1500
|
+
|
1501
|
+
else:
|
1502
|
+
raise ValueError(f"Unsupported aggregation type {agg_type}")
|
1503
|
+
|
1504
|
+
# Calculate cell_count for all cases
|
1505
|
+
cell_count = grouped.size().reset_index(name='cell_count')
|
1506
|
+
|
1507
|
+
if agg_type is None:
|
1508
|
+
dependent_df = pd.merge(dependent_df, cell_count, on='prc')
|
1509
|
+
else:
|
1510
|
+
dependent_df['cell_count'] = cell_count['cell_count']
|
1511
|
+
|
1512
|
+
dependent_df = dependent_df[dependent_df['cell_count'] >= min_cell_count]
|
1513
|
+
|
1514
|
+
is_normal = check_normality(dependent_df[dependent_variable], dependent_variable)
|
1515
|
+
|
1516
|
+
if not transform is None:
|
1517
|
+
transformer = apply_transformation(dependent_df[dependent_variable], transform=transform)
|
1518
|
+
transformed_var = f'{transform}_{dependent_variable}'
|
1519
|
+
df[transformed_var] = transformer.fit_transform(dependent_df[[dependent_variable]])
|
1520
|
+
dependent_variable = transformed_var
|
1521
|
+
is_normal = check_normality(dependent_df[transformed_var], transformed_var)
|
1522
|
+
|
1523
|
+
if not is_normal:
|
1524
|
+
print(f'{dependent_variable} is not normally distributed')
|
1525
|
+
else:
|
1526
|
+
print(f'{dependent_variable} is normally distributed')
|
1527
|
+
|
1528
|
+
return dependent_df, dependent_variable
|
1529
|
+
|
1530
|
+
def perform_mixed_model(y, X, groups, alpha=1.0):
|
1531
|
+
# Ensure groups are defined correctly and check for multicollinearity
|
1532
|
+
if groups is None:
|
1533
|
+
raise ValueError("Groups must be defined for mixed model regression")
|
1534
|
+
|
1535
|
+
# Check for multicollinearity by calculating the VIF for each feature
|
1536
|
+
X_np = X.values
|
1537
|
+
vif = [variance_inflation_factor(X_np, i) for i in range(X_np.shape[1])]
|
1538
|
+
print(f"VIF: {vif}")
|
1539
|
+
if any(v > 10 for v in vif):
|
1540
|
+
print(f"Multicollinearity detected with VIF: {vif}. Applying Ridge regression to the fixed effects.")
|
1541
|
+
ridge = Ridge(alpha=alpha)
|
1542
|
+
ridge.fit(X, y)
|
1543
|
+
X_ridge = ridge.coef_ * X # Adjust X with Ridge coefficients
|
1544
|
+
model = MixedLM(y, X_ridge, groups=groups)
|
1545
|
+
else:
|
1546
|
+
model = MixedLM(y, X, groups=groups)
|
1547
|
+
|
1548
|
+
result = model.fit()
|
1549
|
+
return result
|
1550
|
+
|
1551
|
+
def regression_model(X, y, regression_type='ols', groups=None, alpha=1.0, remove_row_column_effect=True):
|
1552
|
+
|
1553
|
+
if regression_type == 'ols':
|
1554
|
+
model = sm.OLS(y, X).fit()
|
1555
|
+
|
1556
|
+
elif regression_type == 'gls':
|
1557
|
+
model = sm.GLS(y, X).fit()
|
1558
|
+
|
1559
|
+
elif regression_type == 'wls':
|
1560
|
+
model = sm.WLS(y, X, weights=weights).fit()
|
1561
|
+
|
1562
|
+
elif regression_type == 'rlm':
|
1563
|
+
model = sm.RLM(y, X, M=sm.robust.norms.HuberT()).fit()
|
1564
|
+
#model = sm.RLM(y, X, M=sm.robust.norms.TukeyBiweight()).fit()
|
1565
|
+
#model = sm.RLM(y, X, M=sm.robust.norms.Hampel()).fit()
|
1566
|
+
#model = sm.RLM(y, X, M=sm.robust.norms.LeastSquares()).fit()
|
1567
|
+
#model = sm.RLM(y, X, M=sm.robust.norms.RamsayE()).fit()
|
1568
|
+
#model = sm.RLM(y, X, M=sm.robust.norms.TrimmedMean()).fit()
|
1569
|
+
|
1570
|
+
elif regression_type == 'glm':
|
1571
|
+
model = sm.GLM(y, X, family=sm.families.Gaussian()).fit() # Gaussian: Used for continuous data, similar to OLS regression.
|
1572
|
+
#model = sm.GLM(y, X, family=sm.families.Binomial()).fit() # Binomial: Used for binary data, modeling the probability of success.
|
1573
|
+
#model = sm.GLM(y, X, family=sm.families.Poisson()).fit() # Poisson: Used for count data.
|
1574
|
+
#model = sm.GLM(y, X, family=sm.families.Gamma()).fit() # Gamma: Used for continuous, positive data, often for modeling waiting times or life data.
|
1575
|
+
#model = sm.GLM(y, X, family=sm.families.InverseGaussian()).fit() # Inverse Gaussian: Used for positive continuous data with a variance that increases with the
|
1576
|
+
#model = sm.GLM(y, X, family=sm.families.NegativeBinomial()).fit() # Negative Binomial: Used for count data with overdispersion (variance greater than the mean).
|
1577
|
+
#model = sm.GLM(y, X, family=sm.families.Tweedie()).fit() # Tweedie: Used for data that can take both positive continuous and count values, allowing for a mixture of distributions.
|
1578
|
+
|
1579
|
+
elif regression_type == 'mixed':
|
1580
|
+
model = perform_mixed_model(y, X, groups, alpha=alpha)
|
1581
|
+
|
1582
|
+
elif regression_type == 'quantile':
|
1583
|
+
model = sm.QuantReg(y, X).fit(q=alpha)
|
1584
|
+
|
1585
|
+
elif regression_type == 'logit':
|
1586
|
+
model = sm.Logit(y, X).fit()
|
1587
|
+
|
1588
|
+
elif regression_type == 'probit':
|
1589
|
+
model = sm.Probit(y, X).fit()
|
1590
|
+
|
1591
|
+
elif regression_type == 'poisson':
|
1592
|
+
model_poisson = sm.Poisson(y, X).fit()
|
1593
|
+
|
1594
|
+
elif regression_type == 'lasso':
|
1595
|
+
model = Lasso(alpha=alpha).fit(X, y)
|
1596
|
+
|
1597
|
+
elif regression_type == 'ridge':
|
1598
|
+
model = Ridge(alpha=alpha).fit(X, y)
|
1599
|
+
|
1600
|
+
else:
|
1601
|
+
raise ValueError(f"Unsupported regression type {regression_type}")
|
1602
|
+
|
1603
|
+
if regression_type in ['lasso', 'ridge']:
|
1604
|
+
y_pred = model.predict(X)
|
1605
|
+
plt.scatter(X.iloc[:, 1], y, color='blue', label='Data')
|
1606
|
+
plt.plot(X.iloc[:, 1], y_pred, color='red', label='Regression line')
|
1607
|
+
plt.xlabel('Features')
|
1608
|
+
plt.ylabel('Dependent Variable')
|
1609
|
+
plt.legend()
|
1610
|
+
plt.show()
|
1611
|
+
|
1612
|
+
return model
|
1613
|
+
|
1614
|
+
def volcano_plot(coef_df, filename='volcano_plot.pdf'):
|
1615
|
+
# Create the volcano plot
|
1616
|
+
plt.figure(figsize=(10, 6))
|
1617
|
+
sns.scatterplot(
|
1618
|
+
data=coef_df,
|
1619
|
+
x='coefficient',
|
1620
|
+
y='-log10(p_value)',
|
1621
|
+
hue='highlight',
|
1622
|
+
palette={True: 'red', False: 'blue'}
|
1623
|
+
)
|
1624
|
+
plt.title('Volcano Plot of Coefficients')
|
1625
|
+
plt.xlabel('Coefficient')
|
1626
|
+
plt.ylabel('-log10(p-value)')
|
1627
|
+
plt.axhline(y=-np.log10(0.05), color='red', linestyle='--')
|
1628
|
+
plt.legend().remove()
|
1629
|
+
plt.savefig(filename, format='pdf')
|
1630
|
+
print(f'Saved Volcano plot: {filename}')
|
1631
|
+
plt.show()
|
1632
|
+
|
1633
|
+
def clean_controls(df,pc,nc,other):
|
1634
|
+
if 'col' in df.columns:
|
1635
|
+
df['column'] = df['col']
|
1636
|
+
if nc != None:
|
1637
|
+
df = df[~df['column'].isin([nc])]
|
1638
|
+
if pc != None:
|
1639
|
+
df = df[~df['column'].isin([pc])]
|
1640
|
+
if other != None:
|
1641
|
+
df = df[~df['column'].isin([other])]
|
1642
|
+
print(f'Removed data from {nc, pc, other}')
|
1643
|
+
return df
|
1644
|
+
|
1645
|
+
def regression(df, csv_path, dependent_variable='predictions', regression_type=None, alpha=1.0, remove_row_column_effect=False):
|
1646
|
+
|
1647
|
+
volcano_filename = os.path.splitext(os.path.basename(csv_path))[0] + '_volcano_plot.pdf'
|
1648
|
+
volcano_filename = regression_type+'_'+volcano_filename
|
1649
|
+
if regression_type == 'quantile':
|
1650
|
+
volcano_filename = str(alpha)+'_'+volcano_filename
|
1651
|
+
volcano_path=os.path.join(os.path.dirname(csv_path), volcano_filename)
|
1652
|
+
|
1653
|
+
if regression_type is None:
|
1654
|
+
if is_normal:
|
1655
|
+
regression_type = 'ols'
|
1656
|
+
else:
|
1657
|
+
regression_type = 'glm'
|
1658
|
+
|
1659
|
+
if remove_row_column_effect:
|
1660
|
+
|
1661
|
+
## 1. Fit the initial model with row and column to estimate their effects
|
1662
|
+
## 2. Fit the initial model using the specified regression type
|
1663
|
+
## 3. Calculate the residuals
|
1664
|
+
### Residual calculation: Residuals are the differences between the observed and predicted values. This step checks if the initial_model has an attribute resid (residuals). If it does, it directly uses them. Otherwise, it calculates residuals manually by subtracting the predicted values from the observed values (y_with_row_col).
|
1665
|
+
## 4. Use the residuals as the new dependent variable in the final regression model without row and column
|
1666
|
+
### Formula creation: A new regression formula is created, excluding row and column effects, with residuals as the new dependent variable.
|
1667
|
+
### Matrix creation: dmatrices is used again to create new design matrices (X for independent variables and y for the new dependent variable, residuals) based on the new formula and the dataframe df.
|
1668
|
+
#### Remove Confounding Effects:Variables like row and column can introduce systematic biases or confounding effects that might obscure the relationships between the dependent variable and the variables of interest (fraction:gene and fraction:grna).
|
1669
|
+
#### By first estimating the effects of row and column and then using the residuals (the part of the dependent variable that is not explained by row and column), we can focus the final regression model on the relationships of interest without the interference from row and column.
|
1670
|
+
|
1671
|
+
#### Reduce Multicollinearity: Including variables like row and column along with other predictors can sometimes lead to multicollinearity, where predictors are highly correlated with each other. This can make it difficult to determine the individual effect of each predictor.
|
1672
|
+
#### By regressing out the effects of row and column first, we reduce potential multicollinearity issues in the final model.
|
1673
|
+
|
1674
|
+
# Fit the initial model with row and column to estimate their effects
|
1675
|
+
formula_with_row_col = f'{dependent_variable} ~ row + column'
|
1676
|
+
y_with_row_col, X_with_row_col = dmatrices(formula_with_row_col, data=df, return_type='dataframe')
|
1677
|
+
|
1678
|
+
# Fit the initial model using the specified regression type
|
1679
|
+
initial_model = regression_model(X_with_row_col, y_with_row_col, regression_type=regression_type, alpha=alpha)
|
1680
|
+
|
1681
|
+
# Calculate the residuals manually
|
1682
|
+
if hasattr(initial_model, 'resid'):
|
1683
|
+
df['residuals'] = initial_model.resid
|
1684
|
+
else:
|
1685
|
+
df['residuals'] = y_with_row_col.values.ravel() - initial_model.predict(X_with_row_col)
|
1686
|
+
|
1687
|
+
# Use the residuals as the new dependent variable in the final regression model without row and column
|
1688
|
+
formula_without_row_col = 'residuals ~ fraction:gene + fraction:grna'
|
1689
|
+
y, X = dmatrices(formula_without_row_col, data=df, return_type='dataframe')
|
1690
|
+
|
1691
|
+
# Plot histogram of the residuals
|
1692
|
+
plot_histogram(df, 'residuals')
|
1693
|
+
|
1694
|
+
# Scale the independent variables and residuals
|
1695
|
+
scaler_X = MinMaxScaler()
|
1696
|
+
scaler_y = MinMaxScaler()
|
1697
|
+
X = pd.DataFrame(scaler_X.fit_transform(X), columns=X.columns)
|
1698
|
+
y = scaler_y.fit_transform(y)
|
1699
|
+
|
1700
|
+
else:
|
1701
|
+
formula = f'{dependent_variable} ~ fraction:gene + fraction:grna + row + column'
|
1702
|
+
y, X = dmatrices(formula, data=df, return_type='dataframe')
|
1703
|
+
|
1704
|
+
plot_histogram(y, dependent_variable)
|
1705
|
+
|
1706
|
+
# Scale the independent variables and dependent variable
|
1707
|
+
scaler_X = MinMaxScaler()
|
1708
|
+
scaler_y = MinMaxScaler()
|
1709
|
+
X = pd.DataFrame(scaler_X.fit_transform(X), columns=X.columns)
|
1710
|
+
y = scaler_y.fit_transform(y)
|
1711
|
+
|
1712
|
+
groups = df['prc'] if regression_type == 'mixed' else None
|
1713
|
+
print(f'performing {regression_type} regression')
|
1714
|
+
model = regression_model(X, y, regression_type=regression_type, groups=groups, alpha=alpha, remove_row_column_effect=remove_row_column_effect)
|
1715
|
+
|
1716
|
+
# Get the model coefficients and p-values
|
1717
|
+
if regression_type in ['ols','gls','wls','rlm','glm','mixed','quantile','logit','probit','poisson','lasso','ridge']:
|
1718
|
+
coefs = model.params
|
1719
|
+
p_values = model.pvalues
|
1720
|
+
|
1721
|
+
coef_df = pd.DataFrame({
|
1722
|
+
'feature': coefs.index,
|
1723
|
+
'coefficient': coefs.values,
|
1724
|
+
'p_value': p_values.values
|
1725
|
+
})
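On a fitted statsmodels model, `params` and `pvalues` are index-aligned Series keyed by design-matrix column, which is why they can be zipped straight into a coefficient table like the one above. A small worked example:

```python
# params and pvalues share the same index, so they line up in one table.
import pandas as pd
import statsmodels.api as sm

X = sm.add_constant(pd.DataFrame({'x': [0.0, 1.0, 2.0, 3.0, 4.0]}))
y = pd.Series([0.1, 1.1, 1.9, 3.2, 3.9])
fit = sm.OLS(y, X).fit()

coef_table = pd.DataFrame({
    'feature': fit.params.index,
    'coefficient': fit.params.values,
    'p_value': fit.pvalues.values,
})
print(coef_table)
```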
|
1726
|
+
else:
|
1727
|
+
coefs = model.coef_
|
1728
|
+
intercept = model.intercept_
|
1729
|
+
feature_names = X.design_info.column_names
|
1730
|
+
|
1731
|
+
coef_df = pd.DataFrame({
|
1732
|
+
'feature': feature_names,
|
1733
|
+
'coefficient': coefs
|
1734
|
+
})
|
1735
|
+
coef_df.loc[0, 'coefficient'] += intercept
|
1736
|
+
coef_df['p_value'] = np.nan # Placeholder since sklearn doesn't provide p-values
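Penalised scikit-learn models expose only `coef_` and `intercept_`; they provide no p-values, hence the NaN placeholder above. A minimal illustration:

```python
# Lasso exposes coefficients and an intercept, but no significance values.
import numpy as np
from sklearn.linear_model import Lasso

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.0, 0.9, 2.1, 2.9])

lasso = Lasso(alpha=0.01).fit(X, y)
print(lasso.coef_, lasso.intercept_)  # slope array and scalar intercept; no p-values
```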
|
1737
|
+
|
1738
|
+
coef_df['-log10(p_value)'] = -np.log10(coef_df['p_value'])
|
1739
|
+
coef_df_v = coef_df[coef_df['feature'] != 'Intercept']
|
1740
|
+
|
1741
|
+
# Create the highlight column
|
1742
|
+
coef_df['highlight'] = coef_df['feature'].apply(lambda x: '220950' in x)
|
1743
|
+
coef_df = coef_df[~coef_df['feature'].str.contains('row|column')]
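Here the highlight column flags features whose name contains the hard-coded identifier '220950', and `str.contains('row|column')` is a regex alternation, so any feature mentioning either nuisance term is dropped before the volcano plot. A small illustration of the filter (toy feature names):

```python
# 'row|column' is a regular expression: keep features mentioning neither term.
import pandas as pd

features = pd.Series(['fraction:gene[geneA]', 'row[T.r2]', 'column[T.c3]', 'Intercept'])
print(features[~features.str.contains('row|column')].tolist())
# ['fraction:gene[geneA]', 'Intercept']
```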
|
1744
|
+
volcano_plot(coef_df, volcano_path)
|
1745
|
+
|
1746
|
+
return model, coef_df
|
1747
|
+
|
1748
|
+
def set_regression_defaults(settings):
|
1749
|
+
settings.setdefault('gene_weights_csv', '/nas_mnt/carruthers/Einar/mitoscreen/sequencing/combined_reads/EO1_combined/EO1_combined_combination_counts.csv')
|
1750
|
+
settings.setdefault('dependent_variable','predictions')
|
1751
|
+
settings.setdefault('transform',None)
|
1752
|
+
settings.setdefault('agg_type','mean')
|
1753
|
+
settings.setdefault('min_cell_count',25)
|
1754
|
+
settings.setdefault('regression_type','ols')
|
1755
|
+
settings.setdefault('remove_row_column_effect',False)
|
1756
|
+
settings.setdefault('alpha',1)
|
1757
|
+
settings.setdefault('fraction_threshold',0.1)
|
1758
|
+
settings.setdefault('nc','c1')
|
1759
|
+
settings.setdefault('pc','c2')
|
1760
|
+
settings.setdefault('other','c3')
|
1761
|
+
settings.setdefault('plate','plate1')
|
1762
|
+
|
1763
|
+
if settings['regression_type'] == 'quantile':
|
1764
|
+
print(f"Using alpha as quantile for quantile regression, alpha: {settings['alpha']}")
|
1765
|
+
settings['agg_type'] = None
|
1766
|
+
print(f'agg_type set to None for quantile regression')
|
1767
|
+
return settings
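Because these defaults are applied with `dict.setdefault`, a key is only filled when it is missing, so caller-supplied settings always win; the dict is also mutated in place. For example:

```python
# setdefault fills a key only when it is absent, so caller values take precedence.
settings = {'regression_type': 'lasso'}
settings.setdefault('regression_type', 'ols')
settings.setdefault('alpha', 1)
print(settings)  # {'regression_type': 'lasso', 'alpha': 1}
```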
|
1768
|
+
|
1769
|
+
def perform_regression(df, settings):
|
1770
|
+
|
1771
|
+
from spacr.plot import _plot_plates
|
1772
|
+
|
1773
|
+
results_filename = os.path.splitext(os.path.basename(settings['gene_weights_csv']))[0] + '_results.csv'
|
1774
|
+
hits_filename = os.path.splitext(os.path.basename(settings['gene_weights_csv']))[0] + '_results_significant.csv'
|
1775
|
+
|
1776
|
+
results_filename = settings['regression_type']+'_'+results_filename
|
1777
|
+
hits_filename = settings['regression_type']+'_'+hits_filename
|
1778
|
+
if settings['regression_type'] == 'quantile':
|
1779
|
+
results_filename = str(settings['alpha'])+'_'+results_filename
|
1780
|
+
hits_filename = str(settings['alpha'])+'_'+hits_filename
|
1781
|
+
results_path=os.path.join(os.path.dirname(settings['gene_weights_csv']), results_filename)
|
1782
|
+
hits_path=os.path.join(os.path.dirname(settings['gene_weights_csv']), hits_filename)
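The output CSVs therefore land next to the input counts CSV, with the regression type (and, for quantile regression, the alpha value) prefixed to the stem. A hypothetical illustration of the naming scheme (example path only, not the package default):

```python
# Output-file naming sketch: results sit beside the input counts CSV.
import os

gene_weights_csv = '/data/EO1_counts.csv'  # hypothetical example path
stem = os.path.splitext(os.path.basename(gene_weights_csv))[0]
results_path = os.path.join(os.path.dirname(gene_weights_csv), 'ols_' + stem + '_results.csv')
print(results_path)  # /data/ols_EO1_counts_results.csv
```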
|
1783
|
+
|
1784
|
+
settings = set_regression_defaults(settings)
|
1785
|
+
|
1786
|
+
df = clean_controls(df,settings['pc'],settings['nc'],settings['other'])
|
1787
|
+
dependent_df, dependent_variable = process_scores(df, settings['dependent_variable'], settings['plate'], settings['min_cell_count'], settings['agg_type'], settings['transform'])
|
1788
|
+
display(dependent_df)
|
1789
|
+
|
1790
|
+
independent_df = precess_reads(settings['gene_weights_csv'], settings['fraction_threshold'], settings['plate'])
|
1791
|
+
display(independent_df)
|
1792
|
+
|
1793
|
+
merged_df = pd.merge(independent_df, dependent_df, on='prc')
|
1794
|
+
|
1795
|
+
merged_df[['plate', 'row', 'column']] = merged_df['prc'].str.split('_', expand=True)
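The `prc` key is assumed to encode plate, row, and column joined by underscores (e.g. `plate1_r2_c3`), so a single `str.split` with `expand=True` recovers the three columns:

```python
# One split with expand=True yields three separate columns from 'prc'.
import pandas as pd

merged = pd.DataFrame({'prc': ['plate1_r1_c2', 'plate1_r3_c4']})
merged[['plate', 'row', 'column']] = merged['prc'].str.split('_', expand=True)
print(merged)
```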
|
1796
|
+
|
1797
|
+
plate_heatmap = _plot_plates(df, variable=dependent_variable, grouping='mean', min_max='allq', cmap='viridis', min_count=settings['min_cell_count'])
|
1798
|
+
|
1799
|
+
model, coef_df = regression(merged_df, settings['gene_weights_csv'], dependent_variable, settings['regression_type'], settings['alpha'], settings['remove_row_column_effect'])
|
1800
|
+
|
1801
|
+
coef_df.to_csv(results_path, index=False)
|
1802
|
+
|
1803
|
+
if settings['regression_type'] == 'lasso':
|
1804
|
+
significant = coef_df[coef_df['coefficient'] > 0]
|
1805
|
+
|
1806
|
+
else:
|
1807
|
+
significant = coef_df[coef_df['p_value']<= 0.05]
|
1808
|
+
#significant = significant[significant['coefficient'] > 0.1]
|
1809
|
+
significant.sort_values(by='coefficient', ascending=False, inplace=True)
|
1810
|
+
significant = significant[~significant['feature'].str.contains('row|column')]
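As a general pandas note (not a change to the package's code), sorting a boolean-filtered slice in place can trigger a SettingWithCopyWarning; a common idiom is to take an explicit copy of the slice first, which leaves the selection itself unchanged:

```python
# Sorting a filtered slice on an explicit copy avoids SettingWithCopyWarning.
import pandas as pd

coef_df = pd.DataFrame({'feature': ['a', 'b', 'c'],
                        'coefficient': [0.2, 0.5, 0.1],
                        'p_value': [0.01, 0.20, 0.03]})
significant = coef_df[coef_df['p_value'] <= 0.05].copy()
significant.sort_values(by='coefficient', ascending=False, inplace=True)
print(significant)
```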
|
1811
|
+
|
1812
|
+
if settings['regression_type'] == 'ols':
|
1813
|
+
print(model.summary())
|
1814
|
+
|
1815
|
+
significant.to_csv(hits_path, index=False)
|
1816
|
+
print('Significant Genes')
|
1817
|
+
display(significant)
|
1818
|
+
return coef_df
|
@@ -4,7 +4,7 @@ spacr/alpha.py,sha256=Y95sLEfpK2OSYKRn3M8eUOU33JJeXfV8zhrC4KnwSTY,35244
|
|
4
4
|
spacr/annotate_app.py,sha256=w7t7Zilu31FSIRDKtIPae8X4MZGez3cJugFM3rOmnlQ,20617
|
5
5
|
spacr/chris.py,sha256=YlBjSgeZaY8HPy6jkrT_ISAnCMAKVfvCxF0I9eAZLFM,2418
|
6
6
|
spacr/cli.py,sha256=507jfOOEV8BoL4eeUcblvH-iiDHdBrEVJLu1ghAAPSc,1800
|
7
|
-
spacr/core.py,sha256=
|
7
|
+
spacr/core.py,sha256=L2z9HmB0TjrwTQ-iDfoacQ9BClqfCeVEJQQbKkP3Yas,155517
|
8
8
|
spacr/deep_spacr.py,sha256=ljIakns6q74an5QwDU7j0xoj6jRCAz-ejY0QHj9X0d8,33193
|
9
9
|
spacr/foldseek.py,sha256=YIP1d4Ci6CeA9jSyiv-HTDbNmAmcSM9Y_DaOs7wYzLY,33546
|
10
10
|
spacr/get_alfafold_structures.py,sha256=ehx_MQgb12k3hFecP6cYVlm5TLO8iWjgevy8ESyS3cw,3544
|
@@ -16,21 +16,21 @@ spacr/gui_mask_app.py,sha256=WKkAH0jv-SnfaZdJ8MkC7mkUIVSSrNE8lUfH3QBvUak,9747
|
|
16
16
|
spacr/gui_measure_app.py,sha256=5vjjds5NFaOcE8XeuWDug9k-NI4jbTrwp54sJ7DNaNI,9625
|
17
17
|
spacr/gui_sim_app.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
spacr/gui_utils.py,sha256=JRWwmGEEVSPgs0UtZRukdNwIUJepbP675_Fvs5qocPk,49718
|
19
|
-
spacr/io.py,sha256=
|
19
|
+
spacr/io.py,sha256=WOKkFA-Npc22EeWJVxYPhCrbqjyEGz4cTih53eAxhMM,109690
|
20
20
|
spacr/logger.py,sha256=7Zqr3TuuOQLWT32gYr2q1qvv7x0a2JhLANmZcnBXAW8,670
|
21
21
|
spacr/mask_app.py,sha256=jlKmj_evveIkkyH3PYEcAshcLXN0DOPWB1oc4hAwq9E,44201
|
22
22
|
spacr/measure.py,sha256=-pR43dO1MPiwIa7zACcWyNTBpHYDyiYFV_6sTo3qqRk,54975
|
23
23
|
spacr/old_code.py,sha256=jw67DAGoLBd7mWofVzRJSEmCI1Qrff26zIo65SEkV00,13817
|
24
24
|
spacr/plot.py,sha256=fnswxUXHwSLmxRpqSAmoUl5ln-_ueYPeYQlDmiYSwzQ,63299
|
25
|
-
spacr/sequencing.py,sha256=
|
25
|
+
spacr/sequencing.py,sha256=xS-0n_Du_zK0jIt2HE5GUCXij9CpWquXdk8E19xiMWo,82310
|
26
26
|
spacr/sim.py,sha256=FveaVgBi3eypO2oVB5Dx-v0CC1Ny7UPfXkJiiRRodAk,71212
|
27
27
|
spacr/timelapse.py,sha256=5TNmkzR_urMxy0eVB4quGdjNj2QduyiwrLL2I-udlAg,39614
|
28
28
|
spacr/utils.py,sha256=3cA3qUNf7l_VEeuhype2kI7B5IoYK0hb6Y31Q6Si3ds,184107
|
29
29
|
spacr/version.py,sha256=axH5tnGwtgSnJHb5IDhiu4Zjk5GhLyAEDRe-rnaoFOA,409
|
30
30
|
spacr/models/cp/toxo_pv_lumen.CP_model,sha256=2y_CindYhmTvVwBH39SNILF3rI3x9SsRn6qrMxHy3l0,26562451
|
31
|
-
spacr-0.0.
|
32
|
-
spacr-0.0.
|
33
|
-
spacr-0.0.
|
34
|
-
spacr-0.0.
|
35
|
-
spacr-0.0.
|
36
|
-
spacr-0.0.
|
31
|
+
spacr-0.0.82.dist-info/LICENSE,sha256=SR-2MeGc6SCM1UORJYyarSWY_A-JaOMFDj7ReSs9tRM,1083
|
32
|
+
spacr-0.0.82.dist-info/METADATA,sha256=iiIvFLIDU5M7F8VIuGMWah3zpzjbQv_a8kIlIikp174,5158
|
33
|
+
spacr-0.0.82.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
34
|
+
spacr-0.0.82.dist-info/entry_points.txt,sha256=xncHsqD9MI5wj0_p4mgZlrB8dHm_g_qF0Ggo1c78LqY,315
|
35
|
+
spacr-0.0.82.dist-info/top_level.txt,sha256=GJPU8FgwRXGzKeut6JopsSRY2R8T3i9lDgya42tLInY,6
|
36
|
+
spacr-0.0.82.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|