spacr 0.0.70__py3-none-any.whl → 0.0.80__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +4 -1
- spacr/__main__.py +0 -7
- spacr/annotate_app.py +75 -61
- spacr/core.py +39 -246
- spacr/foldseek.py +6 -6
- spacr/get_alfafold_structures.py +3 -3
- spacr/io.py +53 -116
- spacr/measure.py +46 -59
- spacr/plot.py +117 -81
- spacr/sequencing.py +508 -491
- spacr/sim.py +24 -29
- spacr/utils.py +487 -260
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/METADATA +10 -8
- spacr-0.0.80.dist-info/RECORD +36 -0
- spacr/graph_learning_lap.py +0 -84
- spacr/train.py +0 -667
- spacr/umap.py +0 -0
- spacr-0.0.70.dist-info/RECORD +0 -39
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/LICENSE +0 -0
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/WHEEL +0 -0
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/entry_points.txt +0 -0
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/top_level.txt +0 -0
spacr/sequencing.py
CHANGED
@@ -1,6 +1,8 @@
-import os, re, time, math, subprocess
-import numpy as np
+import os, gc, gzip, re, time, math, subprocess
 import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from Bio.Align import PairwiseAligner
 import matplotlib.pyplot as plt
 import seaborn as sns
 from Bio import pairwise2
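Note on the import changes: Bio.Align.PairwiseAligner is Biopython's object-based successor to the legacy Bio.pairwise2 module (which stays imported for the older helpers), and it is what the new find_overlap helper below calls. A minimal sketch of the call pattern, using default scoring since the diffed code configures none:

from Bio.Align import PairwiseAligner

aligner = PairwiseAligner()  # defaults: global mode, match=1.0, mismatch=0.0, gap scores 0.0
alignments = aligner.align('ACGTACGT', 'CGTAC')
best = alignments[0]         # all returned alignments share the optimal score
print(best.score)
print(best)                  # pretty-printed alignment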
@@ -8,6 +10,509 @@ import statsmodels.api as sm
 import statsmodels.formula.api as smf
 from scipy.stats import gmean
 from difflib import SequenceMatcher
+from collections import Counter
+
+def analyze_reads(settings):
+    """
+    Analyzes reads from gzipped fastq files and combines them based on specified settings.
+
+    Args:
+        settings (dict): A dictionary containing the following keys:
+            - 'src' (str): The path to the folder containing the input fastq files.
+            - 'upstream' (str, optional): The upstream sequence used for read combination. Defaults to 'CTTCTGGTAAATGGGGATGTCAAGTT'.
+            - 'downstream' (str, optional): The downstream sequence used for read combination. Defaults to 'GTTTAAGAGCTATGCTGGAAACAGCA'.
+            - 'barecode_length' (int, optional): The length of the barcode sequence. Defaults to 8.
+            - 'chunk_size' (int, optional): The number of reads to process and save at a time. Defaults to 1000000.
+
+    Returns:
+        None
+    """
+
+    def save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter):
+        """
+        Save a data chunk to an HDF5 file.
+
+        Parameters:
+        - output_file_path (str): The path to the output HDF5 file.
+        - data_chunk (list): The data chunk to be saved.
+        - chunk_counter (int): The counter for the current chunk.
+
+        Returns:
+        None
+        """
+        df = pd.DataFrame(data_chunk, columns=['combined_read', 'grna', 'plate_row', 'column', 'sample'])
+        with pd.HDFStore(output_file_path, mode='a', complevel=5, complib='blosc') as store:
+            store.put(f'reads/chunk_{chunk_counter}', df, format='table', append=True)
+
+    def reverse_complement(seq):
+        """
+        Returns the reverse complement of a DNA sequence.
+
+        Args:
+            seq (str): The DNA sequence to be reversed and complemented.
+
+        Returns:
+            str: The reverse complement of the input DNA sequence.
+
+        Example:
+            >>> reverse_complement('ATCG')
+            'CGAT'
+        """
+        complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
+        return ''.join(complement[base] for base in reversed(seq))
+
+    def get_avg_read_length(file_path, num_reads=100):
+        """
+        Calculate the average read length from a given file.
+
+        Args:
+            file_path (str): The path to the input file.
+            num_reads (int, optional): The number of reads to process. Defaults to 100.
+
+        Returns:
+            float: The average read length.
+
+        Raises:
+            FileNotFoundError: If the input file does not exist.
+        """
+        if not file_path:
+            return 0
+        total_length = 0
+        count = 0
+        with gzip.open(file_path, 'rt') as f:
+            for _ in range(num_reads):
+                try:
+                    f.readline() # Skip index line
+                    read = f.readline().strip()
+                    total_length += len(read)
+                    f.readline() # Skip plus line
+                    f.readline() # Skip quality line
+                    count += 1
+                except StopIteration:
+                    break
+        return total_length / count if count > 0 else 0
+
+    def parse_gz_files(folder_path):
+        """
+        Parses the .fastq.gz files in the specified folder path and returns a dictionary
+        containing the sample names and their corresponding file paths.
+
+        Args:
+            folder_path (str): The path to the folder containing the .fastq.gz files.
+
+        Returns:
+            dict: A dictionary where the keys are the sample names and the values are
+            dictionaries containing the file paths for the 'R1' and 'R2' read directions.
+        """
+        files = os.listdir(folder_path)
+        gz_files = [f for f in files if f.endswith('.fastq.gz')]
+
+        samples_dict = {}
+        for gz_file in gz_files:
+            parts = gz_file.split('_')
+            sample_name = parts[0]
+            read_direction = parts[1]
+
+            if sample_name not in samples_dict:
+                samples_dict[sample_name] = {}
+
+            if read_direction == "R1":
+                samples_dict[sample_name]['R1'] = os.path.join(folder_path, gz_file)
+            elif read_direction == "R2":
+                samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
+
+        return samples_dict
+
+    def find_overlap(r1_read_rc, r2_read):
+        """
+        Find the best alignment between two DNA reads.
+
+        Parameters:
+        - r1_read_rc (str): The reverse complement of the first DNA read.
+        - r2_read (str): The second DNA read.
+
+        Returns:
+        - best_alignment (Alignment): The best alignment between the two DNA reads.
+        """
+        aligner = PairwiseAligner()
+        alignments = aligner.align(r1_read_rc, r2_read)
+        best_alignment = alignments[0]
+        return best_alignment
+
+    def combine_reads(samples_dict, src, chunk_size, barecode_length, upstream, downstream):
+        """
+        Combine reads from paired-end sequencing files and save the combined reads to a new file.
+
+        Args:
+            samples_dict (dict): A dictionary mapping sample names to file paths of paired-end sequencing files.
+            src (str): The source directory where the combined reads will be saved.
+            chunk_size (int): The number of reads to be processed and saved as a chunk.
+            barecode_length (int): The length of the barcode sequence.
+            upstream (str): The upstream sequence used for read splitting.
+            downstream (str): The downstream sequence used for read splitting.
+
+        Returns:
+            None
+        """
+        dst = os.path.join(src, 'combined_reads')
+        if not os.path.exists(dst):
+            os.makedirs(dst)
+
+        for sample, paths in samples_dict.items():
+            print(f'Processing: {sample} with the files: {paths}')
+            r1_path = paths.get('R1')
+            r2_path = paths.get('R2')
+
+            output_file_path = os.path.join(dst, f"{sample}_combined.h5")
+            qc_file_path = os.path.join(dst, f"{sample}_qc.csv")
+
+            r1_file = gzip.open(r1_path, 'rt') if r1_path else None
+            r2_file = gzip.open(r2_path, 'rt') if r2_path else None
+
+            chunk_counter = 0
+            data_chunk = []
+
+            success = 0
+            fail = 0
+
+            # Calculate initial average read length
+            avg_read_length_r1 = get_avg_read_length(r1_path, 100)
+            avg_read_length_r2 = get_avg_read_length(r2_path, 100)
+            avg_read_length = (avg_read_length_r1 + avg_read_length_r2) / 2 if avg_read_length_r1 and avg_read_length_r2 else 0
+
+            print(f'Initial avg_read_length: {avg_read_length}')
+
+            # Estimate the initial number of reads based on the file size
+            r1_size_est = os.path.getsize(r1_path) // (avg_read_length * 4) if r1_path else 0
+            r2_size_est = os.path.getsize(r2_path) // (avg_read_length * 4) if r2_path else 0
+            max_size = max(r1_size_est, r2_size_est) * 10
+
+            with tqdm(total=max_size, desc=f"Processing {sample}") as pbar:
+                total_length_processed = 0
+                read_count = 0
+
+                while True:
+                    try:
+                        r1_index = next(r1_file).strip() if r1_file else None
+                        r1_read = next(r1_file).strip() if r1_file else None
+                        r1_plus = next(r1_file).strip() if r1_file else None
+                        r1_quality = next(r1_file).strip() if r1_file else None
+
+                        r2_index = next(r2_file).strip() if r2_file else None
+                        r2_read = next(r2_file).strip() if r2_file else None
+                        r2_plus = next(r2_file).strip() if r2_file else None
+                        r2_quality = next(r2_file).strip() if r2_file else None
+
+                        pbar.update(1)
+
+                        if r1_index and r2_index and r1_index.split(' ')[0] != r2_index.split(' ')[0]:
+                            fail += 1
+                            print(f"Index mismatch: {r1_index} != {r2_index}")
+                            continue
+
+                        r1_read_rc = reverse_complement(r1_read) if r1_read else ''
+                        r1_quality_rc = r1_quality[::-1] if r1_quality else ''
+
+                        r1_rc_split_index = r1_read_rc.find(upstream)
+                        r2_split_index = r2_read.find(upstream)
+
+                        if r1_rc_split_index == -1 or r2_split_index == -1:
+                            fail += 1
+                            continue
+                        else:
+                            success += 1
+
+                        read1_fragment = r1_read_rc[:r1_rc_split_index]
+                        read2_fragment = r2_read[r2_split_index:]
+                        read_combo = read1_fragment + read2_fragment
+
+                        combo_split_index_1 = read_combo.find(upstream)
+                        combo_split_index_2 = read_combo.find(downstream)
+
+                        barcode_1 = read_combo[combo_split_index_1 - barecode_length:combo_split_index_1]
+                        grna = read_combo[combo_split_index_1 + len(upstream):combo_split_index_2]
+                        barcode_2 = read_combo[combo_split_index_2 + len(downstream):combo_split_index_2 + len(downstream) + barecode_length]
+                        barcode_2 = reverse_complement(barcode_2)
+                        data_chunk.append((read_combo, grna, barcode_1, barcode_2, sample))
+
+                        read_count += 1
+                        total_length_processed += len(r1_read) + len(r2_read)
+
+                        # Periodically update the average read length and total
+                        if read_count % 10000 == 0:
+                            avg_read_length = total_length_processed / (read_count * 2)
+                            max_size = (os.path.getsize(r1_path) + os.path.getsize(r2_path)) // (avg_read_length * 4)
+                            pbar.total = max_size
+
+                        if len(data_chunk) >= chunk_size:
+                            save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
+                            chunk_counter += 1
+                            data_chunk = []
+
+                    except StopIteration:
+                        break
+
+            # Save any remaining data_chunk
+            if data_chunk:
+                save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
+
+            # Save QC metrics
+            qc = {'success': success, 'failed': fail}
+            qc_df = pd.DataFrame([qc])
+            qc_df.to_csv(qc_file_path, index=False)
+
+    settings.setdefault('upstream', 'CTTCTGGTAAATGGGGATGTCAAGTT')
+    settings.setdefault('downstream', 'GTTTAAGAGCTATGCTGGAAACAGCA')
+    settings.setdefault('barecode_length', 8)
+    settings.setdefault('chunk_size', 1000000)
+
+    samples_dict = parse_gz_files(settings['src'])
+    combine_reads(samples_dict, settings['src'], settings['chunk_size'], settings['barecode_length'], settings['upstream'], settings['downstream'])
+
+def map_barcodes(h5_file_path, settings={}):
+    """
+    Maps barcodes and performs quality control on sequencing data.
+
+    Args:
+        h5_file_path (str): The file path to the HDF5 file containing the sequencing data.
+        settings (dict, optional): Additional settings for the mapping and quality control process. Defaults to {}.
+
+    Returns:
+        None
+    """
+    def get_read_qc(df, df_cleaned):
+        """
+        Calculate quality control metrics for sequencing reads.
+
+        Parameters:
+        - df: DataFrame containing the sequencing reads.
+        - df_cleaned: DataFrame containing the cleaned sequencing reads.
+
+        Returns:
+        - qc_dict: Dictionary containing the following quality control metrics:
+            - 'reads': Total number of reads.
+            - 'cleaned_reads': Total number of cleaned reads.
+            - 'NaN_grna': Number of reads with missing 'grna_metadata'.
+            - 'NaN_plate_row': Number of reads with missing 'plate_row_metadata'.
+            - 'NaN_column': Number of reads with missing 'column_metadata'.
+            - 'NaN_plate': Number of reads with missing 'plate_metadata'.
+            - 'unique_grna': Counter object containing the count of unique 'grna_metadata' values.
+            - 'unique_plate_row': Counter object containing the count of unique 'plate_row_metadata' values.
+            - 'unique_column': Counter object containing the count of unique 'column_metadata' values.
+            - 'unique_plate': Counter object containing the count of unique 'plate_metadata' values.
+        """
+        qc_dict = {}
+        qc_dict['reads'] = len(df)
+        qc_dict['cleaned_reads'] = len(df_cleaned)
+        qc_dict['NaN_grna'] = df['grna_metadata'].isna().sum()
+        qc_dict['NaN_plate_row'] = df['plate_row_metadata'].isna().sum()
+        qc_dict['NaN_column'] = df['column_metadata'].isna().sum()
+        qc_dict['NaN_plate'] = df['plate_metadata'].isna().sum()
+        qc_dict['unique_grna'] = Counter(df['grna_metadata'].dropna().tolist())
+        qc_dict['unique_plate_row'] = Counter(df['plate_row_metadata'].dropna().tolist())
+        qc_dict['unique_column'] = Counter(df['column_metadata'].dropna().tolist())
+        qc_dict['unique_plate'] = Counter(df['plate_metadata'].dropna().tolist())
+
+        return qc_dict
+
+    def mapping_dicts(df, settings):
+        """
+        Maps the values in the DataFrame columns to corresponding metadata using dictionaries.
+
+        Args:
+            df (pandas.DataFrame): The DataFrame containing the data to be mapped.
+            settings (dict): A dictionary containing the settings for mapping.
+
+        Returns:
+            pandas.DataFrame: The DataFrame with the mapped metadata columns added.
+        """
+        grna_df = pd.read_csv(settings['grna'])
+        barcode_df = pd.read_csv(settings['barcodes'])
+
+        grna_dict = {row['sequence']: row['name'] for _, row in grna_df.iterrows()}
+        plate_row_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('p')}
+        column_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('c')}
+        plate_dict = settings['plate_dict']
+
+        df['grna_metadata'] = df['grna'].map(grna_dict)
+        df['grna_length'] = df['grna'].apply(len)
+        df['plate_row_metadata'] = df['plate_row'].map(plate_row_dict)
+        df['column_metadata'] = df['column'].map(column_dict)
+        df['plate_metadata'] = df['sample'].map(plate_dict)
+
+        return df
+
+    settings.setdefault('grna', '/home/carruthers/Documents/grna_barecodes.csv')
+    settings.setdefault('barcodes', '/home/carruthers/Documents/SCREEN_BARECODES.csv')
+    settings.setdefault('plate_dict', {'EO1': 'plate1', 'EO2': 'plate2', 'EO3': 'plate3', 'EO4': 'plate4', 'EO5': 'plate5', 'EO6': 'plate6', 'EO7': 'plate7', 'EO8': 'plate8'})
+    settings.setdefault('test', False)
+    settings.setdefault('verbose', True)
+    settings.setdefault('min_itemsize', 1000)
+
+    qc_file_path = os.path.splitext(h5_file_path)[0] + '_qc_step_2.csv'
+    unique_grna_file_path = os.path.splitext(h5_file_path)[0] + '_unique_grna.csv'
+    unique_plate_row_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate_row.csv'
+    unique_column_file_path = os.path.splitext(h5_file_path)[0] + '_unique_column.csv'
+    unique_plate_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate.csv'
+    new_h5_file_path = os.path.splitext(h5_file_path)[0] + '_cleaned.h5'
+
+    # Initialize the HDF5 store for cleaned data
+    store_cleaned = pd.HDFStore(new_h5_file_path, mode='a', complevel=5, complib='blosc')
+
+    # Initialize the overall QC metrics
+    overall_qc = {
+        'reads': 0,
+        'cleaned_reads': 0,
+        'NaN_grna': 0,
+        'NaN_plate_row': 0,
+        'NaN_column': 0,
+        'NaN_plate': 0,
+        'unique_grna': Counter(),
+        'unique_plate_row': Counter(),
+        'unique_column': Counter(),
+        'unique_plate': Counter()
+    }
+
+    with pd.HDFStore(h5_file_path, mode='r') as store:
+        keys = [key for key in store.keys() if key.startswith('/reads/chunk_')]
+
+        for key in keys:
+            df = store.get(key)
+            df = mapping_dicts(df, settings)
+            df_cleaned = df.dropna()
+            qc_dict = get_read_qc(df, df_cleaned)
+
+            # Accumulate QC metrics
+            overall_qc['reads'] += qc_dict['reads']
+            overall_qc['cleaned_reads'] += qc_dict['cleaned_reads']
+            overall_qc['NaN_grna'] += qc_dict['NaN_grna']
+            overall_qc['NaN_plate_row'] += qc_dict['NaN_plate_row']
+            overall_qc['NaN_column'] += qc_dict['NaN_column']
+            overall_qc['NaN_plate'] += qc_dict['NaN_plate']
+            overall_qc['unique_grna'].update(qc_dict['unique_grna'])
+            overall_qc['unique_plate_row'].update(qc_dict['unique_plate_row'])
+            overall_qc['unique_column'].update(qc_dict['unique_column'])
+            overall_qc['unique_plate'].update(qc_dict['unique_plate'])
+
+            df_cleaned = df_cleaned[df_cleaned['grna_length'] >= 30]
+
+            # Save cleaned data to the new HDF5 store
+            store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
+
+            del df_cleaned, df
+            gc.collect()
+
+    # Convert the Counter objects to DataFrames and save them to CSV files
+    unique_grna_df = pd.DataFrame(overall_qc['unique_grna'].items(), columns=['key', 'value'])
+    unique_plate_row_df = pd.DataFrame(overall_qc['unique_plate_row'].items(), columns=['key', 'value'])
+    unique_column_df = pd.DataFrame(overall_qc['unique_column'].items(), columns=['key', 'value'])
+    unique_plate_df = pd.DataFrame(overall_qc['unique_plate'].items(), columns=['key', 'value'])
+
+    unique_grna_df.to_csv(unique_grna_file_path, index=False)
+    unique_plate_row_df.to_csv(unique_plate_row_file_path, index=False)
+    unique_column_df.to_csv(unique_column_file_path, index=False)
+    unique_plate_df.to_csv(unique_plate_file_path, index=False)
+
+    # Remove the unique counts from overall_qc for the main QC CSV file
+    del overall_qc['unique_grna']
+    del overall_qc['unique_plate_row']
+    del overall_qc['unique_column']
+    del overall_qc['unique_plate']
+
+    # Combine all remaining QC metrics into a single DataFrame and save it to CSV
+    qc_df = pd.DataFrame([overall_qc])
+    qc_df.to_csv(qc_file_path, index=False)
+
+    # Close the HDF5 store
+    store_cleaned.close()
+
+    gc.collect()
+    return
+
+def map_barcodes_v1(h5_file_path, settings={}):
+
+    def get_read_qc(df, df_cleaned):
+        qc_dict = {}
+        qc_dict['reads'] = len(df)
+        qc_dict['cleaned_reads'] = len(df_cleaned)
+        qc_dict['NaN_grna'] = df['grna_metadata'].isna().sum()
+        qc_dict['NaN_plate_row'] = df['plate_row_metadata'].isna().sum()
+        qc_dict['NaN_column'] = df['column_metadata'].isna().sum()
+        qc_dict['NaN_plate'] = df['plate_metadata'].isna().sum()
+
+
+        qc_dict['unique_grna'] = len(df['grna_metadata'].dropna().unique().tolist())
+        qc_dict['unique_plate_row'] = len(df['plate_row_metadata'].dropna().unique().tolist())
+        qc_dict['unique_column'] = len(df['column_metadata'].dropna().unique().tolist())
+        qc_dict['unique_plate'] = len(df['plate_metadata'].dropna().unique().tolist())
+        qc_dict['value_counts_grna'] = df['grna_metadata'].value_counts(dropna=True)
+        qc_dict['value_counts_plate_row'] = df['plate_row_metadata'].value_counts(dropna=True)
+        qc_dict['value_counts_column'] = df['column_metadata'].value_counts(dropna=True)
+
+        return qc_dict
+
+    def mapping_dicts(df, settings):
+        grna_df = pd.read_csv(settings['grna'])
+        barcode_df = pd.read_csv(settings['barcodes'])
+
+        grna_dict = {row['sequence']: row['name'] for _, row in grna_df.iterrows()}
+        plate_row_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('p')}
+        column_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('c')}
+        plate_dict = settings['plate_dict']
+
+        df['grna_metadata'] = df['grna'].map(grna_dict)
+        df['grna_length'] = df['grna'].apply(len)
+        df['plate_row_metadata'] = df['plate_row'].map(plate_row_dict)
+        df['column_metadata'] = df['column'].map(column_dict)
+        df['plate_metadata'] = df['sample'].map(plate_dict)
+
+        return df
+
+    settings.setdefault('grna', '/home/carruthers/Documents/grna_barcodes.csv')
+    settings.setdefault('barcodes', '/home/carruthers/Documents/SCREEN_BARCODES.csv')
+    settings.setdefault('plate_dict', {'EO1': 'plate1', 'EO2': 'plate2', 'EO3': 'plate3', 'EO4': 'plate4', 'EO5': 'plate5', 'EO6': 'plate6', 'EO7': 'plate7', 'EO8': 'plate8'})
+    settings.setdefault('test', False)
+    settings.setdefault('verbose', True)
+    settings.setdefault('min_itemsize', 1000)
+
+    qc_file_path = os.path.splitext(h5_file_path)[0] + '_qc_step_2.csv'
+    new_h5_file_path = os.path.splitext(h5_file_path)[0] + '_cleaned.h5'
+
+    # Initialize the HDF5 store for cleaned data
+    store_cleaned = pd.HDFStore(new_h5_file_path, mode='a', complevel=5, complib='blosc')
+
+    # Initialize the DataFrame for QC metrics
+    qc_df_list = []
+
+    with pd.HDFStore(h5_file_path, mode='r') as store:
+        keys = [key for key in store.keys() if key.startswith('/reads/chunk_')]
+
+        for key in keys:
+            df = store.get(key)
+            df = mapping_dicts(df, settings)
+            df_cleaned = df.dropna()
+            qc_dict = get_read_qc(df, df_cleaned)
+            qc_df_list.append(qc_dict)
+            df_cleaned = df_cleaned[df_cleaned['grna_length'] >= 30]
+
+            # Save cleaned data to the new HDF5 store
+            store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
+
+    # Combine all QC metrics into a single DataFrame and save it to CSV
+    qc_df = pd.DataFrame(qc_df_list)
+    qc_df.to_csv(qc_file_path, index=False)
+
+    # Close the HDF5 store
+    store_cleaned.close()
+    return
+
+def map_barcodes_folder(src, settings={}):
+    for file in os.listdir(src):
+        if file.endswith('.h5'):
+            print(file)
+            path = os.path.join(src, file)
+            map_barcodes(path, settings)
+            gc.collect()
 
 def reverse_complement(dna_sequence):
     complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N':'N'}
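Taken together, the added functions form a two-step amplicon pipeline: analyze_reads pairs and splits gzipped FASTQ reads into chunked HDF5 stores under <src>/combined_reads/, and map_barcodes / map_barcodes_folder then map the extracted gRNA and well barcodes to metadata and write QC CSVs next to each store. A minimal usage sketch with hypothetical paths; the barcode CSVs are assumed to have 'sequence' and 'name' columns, with names starting with 'p' for plate rows and 'c' for columns:

from spacr.sequencing import analyze_reads, map_barcodes_folder

settings = {
    'src': '/data/screen_run1',   # hypothetical folder of <sample>_R1_*.fastq.gz / <sample>_R2_*.fastq.gz
    'barecode_length': 8,         # note the package's spelling of this key
    'chunk_size': 1_000_000,      # reads per HDF5 chunk
}
analyze_reads(settings)           # writes /data/screen_run1/combined_reads/<sample>_combined.h5

map_barcodes_folder(
    src='/data/screen_run1/combined_reads',
    settings={
        'grna': '/data/meta/grna_barcodes.csv',        # hypothetical path
        'barcodes': '/data/meta/screen_barcodes.csv',  # hypothetical path
        'plate_dict': {'EO1': 'plate1'},               # sample name -> plate mapping
    },
)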
@@ -146,7 +651,6 @@ def count_mismatches(seq1, seq2, align_length=10):
     mismatches = sum(c1 != c2 for c1, c2 in zip(seq1_aligned, seq2_aligned))
     return mismatches
 
-
 def get_sequence_data(r1,r2):
     forward_regex = re.compile(r'^(...GGTGCCACTT)TTTCAAGTTG.*?TTCTAGCTCT(AAAAC[A-Z]{18,22}AACTT)GACATCCCCA.*?AAGGCAAACA(CCCCCTTCGG....).*')
     r1fd = forward_regex.search(r1)
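The forward_regex shown in context above captures three groups in one pass: a leading barcode ending in the GGTGCCACTT anchor, the gRNA cassette between AAAAC and AACTT (an 18-22 nt spacer), and a trailing barcode following CCCCCTTCGG. A small sketch with a synthetic read (the sequence is invented; the spacer is padded with 20 A's to satisfy the {18,22} quantifier):

import re

forward_regex = re.compile(r'^(...GGTGCCACTT)TTTCAAGTTG.*?TTCTAGCTCT(AAAAC[A-Z]{18,22}AACTT)GACATCCCCA.*?AAGGCAAACA(CCCCCTTCGG....).*')

# Synthetic read built to hit each literal anchor in order
read = ('ACT' + 'GGTGCCACTT' + 'TTTCAAGTTG' + 'TTCTAGCTCT'
        + 'AAAAC' + 'A' * 20 + 'AACTT' + 'GACATCCCCA'
        + 'AAGGCAAACA' + 'CCCCCTTCGG' + 'ACGT')
m = forward_regex.search(read)
print(m.group(1))  # 'ACTGGTGCCACTT': 3 nt barcode plus anchor
print(m.group(2))  # 'AAAAC' + 20 nt spacer + 'AACTT': the gRNA cassette
print(m.group(3))  # 'CCCCCTTCGGACGT': anchor plus 4 nt barcode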
@@ -640,491 +1144,4 @@ def generate_fraction_map(df, gene_column, min_=10, plates=['p1','p2','p3','p4']
     independent_variables = independent_variables.drop('sum', axis=1)
     independent_variables.index.name = 'prc'
     independent_variables = independent_variables.loc[:, (independent_variables.sum() != 0)]
-    return independent_variables
-
-# Check if filename or path
-def split_filenames(df, filename_column):
-    plate_ls = []
-    well_ls = []
-    col_ls = []
-    row_ls = []
-    field_ls = []
-    obj_ls = []
-    ls = df[filename_column].tolist()
-    if '/' in ls[0]:
-        file_list = [os.path.basename(path) for path in ls]
-    else:
-        file_list = ls
-    print('first file',file_list[0])
-    for filename in file_list:
-        plate = filename.split('_')[0]
-        plate = plate.split('plate')[1]
-        well = filename.split('_')[1]
-        field = filename.split('_')[2]
-        object_nr = filename.split('_')[3]
-        object_nr = object_nr.split('.')[0]
-        object_nr = 'o'+str(object_nr)
-        if re.match('A..', well):
-            row = 'r1'
-        if re.match('B..', well):
-            row = 'r2'
-        if re.match('C..', well):
-            row = 'r3'
-        if re.match('D..', well):
-            row = 'r4'
-        if re.match('E..', well):
-            row = 'r5'
-        if re.match('F..', well):
-            row = 'r6'
-        if re.match('G..', well):
-            row = 'r7'
-        if re.match('H..', well):
-            row = 'r8'
-        if re.match('I..', well):
-            row = 'r9'
-        if re.match('J..', well):
-            row = 'r10'
-        if re.match('K..', well):
-            row = 'r11'
-        if re.match('L..', well):
-            row = 'r12'
-        if re.match('M..', well):
-            row = 'r13'
-        if re.match('N..', well):
-            row = 'r14'
-        if re.match('O..', well):
-            row = 'r15'
-        if re.match('P..', well):
-            row = 'r16'
-        if re.match('.01', well):
-            col = 'c1'
-        if re.match('.02', well):
-            col = 'c2'
-        if re.match('.03', well):
-            col = 'c3'
-        if re.match('.04', well):
-            col = 'c4'
-        if re.match('.05', well):
-            col = 'c5'
-        if re.match('.06', well):
-            col = 'c6'
-        if re.match('.07', well):
-            col = 'c7'
-        if re.match('.08', well):
-            col = 'c8'
-        if re.match('.09', well):
-            col = 'c9'
-        if re.match('.10', well):
-            col = 'c10'
-        if re.match('.11', well):
-            col = 'c11'
-        if re.match('.12', well):
-            col = 'c12'
-        if re.match('.13', well):
-            col = 'c13'
-        if re.match('.14', well):
-            col = 'c14'
-        if re.match('.15', well):
-            col = 'c15'
-        if re.match('.16', well):
-            col = 'c16'
-        if re.match('.17', well):
-            col = 'c17'
-        if re.match('.18', well):
-            col = 'c18'
-        if re.match('.19', well):
-            col = 'c19'
-        if re.match('.20', well):
-            col = 'c20'
-        if re.match('.21', well):
-            col = 'c21'
-        if re.match('.22', well):
-            col = 'c22'
-        if re.match('.23', well):
-            col = 'c23'
-        if re.match('.24', well):
-            col = 'c24'
-        plate_ls.append(plate)
-        well_ls.append(well)
-        field_ls.append(field)
-        obj_ls.append(object_nr)
-        row_ls.append(row)
-        col_ls.append(col)
-    df['file'] = ls
-    df['plate'] = plate_ls
-    df['well'] = well_ls
-    df['row'] = row_ls
-    df['col'] = col_ls
-    df['field'] = field_ls
-    df['obj'] = obj_ls
-    df['plate_well'] = df['plate']+'_'+df['well']
-    df = df.set_index(filename_column)
-    return df
-
-def rename_plate_metadata(df):
-    try:
-        df = df.drop(['plateID'], axis=1)
-        df = df.drop(['rowID'], axis=1)
-        df = df.drop(['columnID'], axis=1)
-        df = df.drop(['plate_row_col'], axis=1)
-        df = df.drop(['Unnamed: 0'], axis=1)
-        df = df.drop(['Unnamed: 0.1'], axis=1)
-    except:
-        next
-
-    df['plate'] = df['plate'].astype('string')
-    df.plate.replace('1', 'A', inplace=True)
-    df.plate.replace('2', 'B', inplace=True)
-    df.plate.replace('3', 'C', inplace=True)
-    df.plate.replace('4', 'D', inplace=True)
-    df.plate.replace('5', 'E', inplace=True)
-    df.plate.replace('6', 'F', inplace=True)
-    df.plate.replace('7', 'G', inplace=True)
-    df.plate.replace('8', 'H', inplace=True)
-    df.plate.replace('9', 'I', inplace=True)
-    df.plate.replace('10', 'J', inplace=True)
-
-    df.plate.replace('A', 'p1', inplace=True)# 1 - 1
-    df.plate.replace('B', 'p2', inplace=True)# 2 - 2
-    df.plate.replace('C', 'p3', inplace=True)# 3 - 3
-    df.plate.replace('E', 'p4', inplace=True)# 5 - 4
-
-    df.plate.replace('F', 'p5', inplace=True)# 6 - 5
-    df.plate.replace('G', 'p6', inplace=True)# 7 - 6
-    df.plate.replace('H', 'p7', inplace=True)# 8 - 7
-    df.plate.replace('I', 'p8', inplace=True)# 9 - 8
-
-    df['plateID'] = df['plate']
-
-    df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c1', 'c2', 'c3'])), 'plate'] = 'p1'
-    df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c4', 'c5', 'c6'])), 'plate'] = 'p2'
-    df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c7', 'c8', 'c9'])), 'plate'] = 'p3'
-    df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c10', 'c11', 'c12'])), 'plate'] = 'p4'
-
-    df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c1', 'c2', 'c3'])), 'plate'] = 'p5'
-    df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c4', 'c5', 'c6'])), 'plate'] = 'p6'
-    df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c7', 'c8', 'c9'])), 'plate'] = 'p7'
-    df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c10', 'c11', 'c12'])), 'plate'] = 'p8'
-
-    df.loc[(df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c1', 'c4', 'c7', 'c10'])), 'col'] = 'c1'
-    df.loc[(df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c2', 'c5', 'c8', 'c11'])), 'col'] = 'c2'
-    df.loc[(df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c3', 'c6', 'c9', 'c12'])), 'col'] = 'c3'
-
-    df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c1'])), 'col'] = 'c25'
-    df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c2'])), 'col'] = 'c26'
-    df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c3'])), 'col'] = 'c27'
-
-    df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c1'])), 'col'] = 'c25'
-
-    df = df.drop(['plateID'], axis=1)
-
-    df = df.loc[~df['plate'].isin(['D', 'J'])]
-
-    screen_cols = ['c1','c2','c3','c4','c5','c6','c7','c8','c9','c10','c11','c12','c13','c14','c15','c16','c17','c18','c19','c20','c21','c22','c23','c24']
-    screen_plates = ['p1','p2','p3','p4']
-    positive_control_plates = ['p5','p6','p7','p8']
-    positive_control_cols = screen_cols
-    negative_control_cols = ['c25','c26','c27']
-    #extra_plates = ['p9','p10']
-    cond_ls = []
-
-    cols = df.col.tolist()
-    for index, plate in enumerate(df.plate.tolist()):
-        co = cols[index]
-        if plate in screen_plates:
-            if co in screen_cols:
-                cond = 'SCREEN'
-            if co in negative_control_cols:
-                cond = 'NC'
-        if plate in positive_control_plates:
-            if co in positive_control_cols:
-                cond = 'PC'
-            if co in negative_control_cols:
-                cond = 'NC'
-        cond_ls.append(cond)
-
-    df['cond'] = cond_ls
-    df['plate'] = df['plate'].astype('string')
-    df['row'] = df['row'].astype('string')
-    df['col'] = df['col'].astype('string')
-    df['obj'] = df['obj'].astype('string')
-    df['prco'] = df['plate']+'_'+df['row']+'_'+df['col']+'_'+df['field']+'_'+df['obj']
-    df['prc'] = df['plate']+'_'+df['row']+'_'+df['col']
-    df = df.set_index(['prco'], drop=True)
-    df = df.sort_values(by = ['plate'], ascending = [True], na_position = 'first')
-    values, counts = np.unique(df['plate'], return_counts=True)
-    print('plates:', values)
-    print('well count:', counts)
-    return df
-
-def plot_reg_res(df, coef_col='coef', col_p='P>|t|'):
-    df['gene'] = df.index
-    df[coef_col] = pd.to_numeric(df[coef_col], errors='coerce')
-    df[col_p] = pd.to_numeric(df[col_p], errors='coerce')
-    df = df.sort_values(by = [coef_col], ascending = [False], na_position = 'first')
-    df['color'] = 'None'
-    df.loc[df['gene'].str.contains('239740'), 'color'] = '239740'
-    df.loc[df['gene'].str.contains('205250'), 'color'] = '205250'
-
-    df.loc[df['gene'].str.contains('000000'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000001'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000002'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000003'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000004'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000005'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000006'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000007'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000008'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000009'), 'color'] = '000000'
-    df.loc[df['gene'].str.contains('000010'), 'color'] = '000000'
-    fig, ax = plt.subplots(figsize=(10,10))
-    df.loc[df[col_p] == 0.000, col_p] = 0.001
-    df['logp'] = -np.log10(df[col_p])
-    sns.scatterplot(data = df, x = coef_col, y = 'logp', legend = False, ax = ax,
-                    hue= 'color', hue_order = ['239740','205250','None', '000000'],
-                    palette = ['purple', 'teal', 'lightgrey', 'black'],
-                    size = 'color', sizes = (100, 10))
-    g14 = df[df['gene'].str.contains('239740')]
-    r18 = df[df['gene'].str.contains('205250')]
-    res = pd.concat([g14, r18], axis=0)
-    res = res[[coef_col, col_p]]
-    print(res)
-    return df, res
-
-def reg_model(iv_loc,dv_loc):
-    independent_variables = pd.read_csv(iv_loc)
-    dependent_variable = pd.read_csv(dv_loc)
-    independent_variables = independent_variables.set_index('prc')
-    columns = independent_variables.columns
-    new_columns = [col.replace('TGGT1_', '') for col in columns]
-    independent_variables.columns = new_columns
-
-    dependent_variable = dependent_variable.set_index('prc')
-
-    reg_input = pd.DataFrame(pd.merge(independent_variables, dependent_variable, left_index=True, right_index=True))
-    reg_input = reg_input.dropna(axis=0, how='any')
-    reg_input = reg_input.dropna(axis=1, how='any')
-    print('Number of wells',len(reg_input))
-    x = reg_input.drop(['score'], axis=1)
-    x = sm.add_constant(x)
-    y = np.log10(reg_input['score']+1)
-    model = sm.OLS(y, x).fit()
-    predictions = model.predict(x)
-    results_summary = model.summary()
-    print(results_summary)
-    results_as_html = results_summary.tables[1].as_html()
-    results_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
-    df, res = plot_reg_res(df=results_df)
-    return df, res
-
-def mixed_model(iv_loc,dv_loc):
-    independent_variables = pd.read_csv(iv_loc)
-    dependent_variable = pd.read_csv(dv_loc)
-    independent_variables = independent_variables.set_index('prc')
-    columns = independent_variables.columns
-    new_columns = [col.replace('TGGT1_', '') for col in columns]
-    independent_variables.columns = new_columns
-    dependent_variable = dependent_variable.set_index('prc')
-    reg_input = pd.DataFrame(pd.merge(independent_variables, dependent_variable, left_index=True, right_index=True))
-    reg_input = reg_input.dropna(axis=0, how='any')
-
-    y = np.log10(reg_input['score']+1)
-    X = reg_input.drop('score', axis=1)
-    X.columns = pd.MultiIndex.from_tuples([tuple(col.split('_')) for col in X.columns],
-                                          names=['main_variable', 'sub_variable'])
-    # Melt the DataFrame
-    X_long = X.melt(ignore_index=False, var_name=['main_variable', 'sub_variable'], value_name='value')
-    X_long = X_long[X_long['value']>0]
-
-    # Create a new column to represent the nested structure of gRNA within gene
-    X_long['gene_gRNA'] = X_long['main_variable'].astype(str) + "_" + X_long['sub_variable'].astype(str)
-
-    # Add 'score' to the DataFrame
-    X_long['score'] = y
-
-    # Create and convert the plate, row, and column variables to type category
-    X_long.reset_index(inplace=True)
-    split_values = X_long['prc'].str.split('_', expand=True)
-    X_long[['plate', 'row', 'col']] = split_values
-    X_long['plate'] = X_long['plate'].str[1:]
-    X_long['plate'] = X_long['plate'].astype(int)
-    X_long['row'] = X_long['row'].str[1:]
-    X_long['row'] = X_long['row'].astype(int)
-    X_long['col'] = X_long['col'].str[1:]
-    X_long['col'] = X_long['col'].astype(int)
-    X_long = X_long.set_index('prc')
-    # Create a new column to represent the nested structure of plate, row, and column
-    X_long['plate_row_col'] = X_long['plate'].astype(str) + "_" + X_long['row'].astype(str) + "_" + X_long['col'].astype(str)
-    n_group = pd.DataFrame(X_long.groupby(['gene_gRNA']).count()['main_variable'])
-    n_group = n_group.rename({'main_variable': 'n_group'}, axis=1)
-    n_group = n_group.reset_index(drop=False)
-    X_long = pd.merge(X_long, n_group, on='gene_gRNA')
-    X_long = X_long[X_long['n_group']>1]
-    #print(X_long.isna().sum())
-
-    X_long['main_variable'] = X_long['main_variable'].astype('category')
-    X_long['sub_variable'] = X_long['sub_variable'].astype('category')
-    X_long['plate'] = X_long['plate'].astype('category')
-    X_long['row'] = X_long['row'].astype('category')
-    X_long['col'] = X_long['col'].astype('category')
-    X_long = pd.DataFrame(X_long)
-    print(X_long)
-
-    md = smf.mixedlm("score ~ C(main_variable)", X_long,
-                     groups=X_long["sub_variable"])
-
-    # Define your nonlinear function here
-    def nonlinear_function(x, *params):
-        pass # Implement non linear function here
-
-    mdf = md.fit(method='bfgs', maxiter=1000)
-    print(mdf.summary())
-    summary = mdf.summary()
-    df = pd.DataFrame(summary.tables[1])
-    df, res = plot_reg_res(df, coef_col='Coef.', col_p='P>|z|')
-    return df, res
-
-def calculate_accuracy(df):
-    df.loc[df['pc_score'] <= 0.5, 'pred'] = 0
-    df.loc[df['pc_score'] >= 0.5, 'pred'] = 1
-    df.loc[df['cond'] == 'NC', 'lab'] = 0
-    df.loc[df['cond'] == 'PC', 'lab'] = 1
-    df = df[df['cond'] != 'SCREEN']
-    df_nc = df[df['cond'] != 'NC']
-    df_pc = df[df['cond'] != 'PC']
-    correct = []
-    all_ls = []
-    pred_list = df['pred'].tolist()
-    lab_list = df['lab'].tolist()
-    for i,v in enumerate(pred_list):
-        all_ls.append(1)
-        if v == lab_list[i]:
-            correct.append(1)
-    print('total accuracy',len(correct)/len(all_ls))
-    correct = []
-    all_ls = []
-    pred_list = df_pc['pred'].tolist()
-    lab_list = df_pc['lab'].tolist()
-    for i,v in enumerate(pred_list):
-        all_ls.append(1)
-        if v == lab_list[i]:
-            correct.append(1)
-    print('positives accuracy', len(correct)/len(all_ls))
-    correct = []
-    all_ls = []
-    pred_list = df_nc['pred'].tolist()
-    lab_list = df_nc['lab'].tolist()
-    for i,v in enumerate(pred_list):
-        all_ls.append(1)
-        if v == lab_list[i]:
-            correct.append(1)
-    print('negatives accuracy',len(correct)/len(all_ls))
-
-def preprocess_image_data(df, resnet_loc, min_count=25, metric='mean', plot=True, score='pc_score'):
-    print('number of cells', len(df))
-    resnet_preds = pd.read_csv(resnet_loc)
-    res_df = split_filenames(df=resnet_preds, filename_column='path')
-    pred_df = rename_plate_metadata(df=res_df)
-    pred_df['prcfo'] = pred_df['plate']+'_'+pred_df['row']+'_'+pred_df['col']+'_'+pred_df['field']+'_'+pred_df['obj']
-    print('number of resnet scores', len(df))
-    merged_df = pd.merge(df, pred_df, on='prcfo', how='inner', suffixes=('', '_right'))
-    merged_df = merged_df.rename(columns={'pred': 'pc_score'})
-
-    merged_df = merged_df[(merged_df['pc_score'] <= 0.25) | (merged_df['pc_score'] >= 0.75)]
-
-    merged_df['recruitment'] = merged_df['Toxo_channel_1_quartiles75']/merged_df['Cytosol_channel_1_quartiles75']
-    merged_df = pd.DataFrame(merged_df[merged_df['duplicates'] == 1.0])
-    columns_to_drop = [col for col in merged_df.columns if col.endswith('_right')]
-    merged_df = merged_df.drop(columns_to_drop, axis=1)
-    well_group = pd.DataFrame(merged_df.groupby(['prc']).count()['cond'])
-    well_group = well_group.rename({'cond': 'cell_count'}, axis=1)
-    merged_df = pd.merge(merged_df, well_group, on='prc', how='inner', suffixes=('', '_right'))
-    columns_to_drop = [col for col in merged_df.columns if col.endswith('_right')]
-    merged_df = merged_df.drop(columns_to_drop, axis=1)
-    #merged_df = merged_df.drop(['duplicates', 'outlier', 'prcfo.1'], axis=1)
-    merged_df = merged_df.drop(['duplicates', 'prcfo.1'], axis=1)
-    merged_df = pd.DataFrame(merged_df[merged_df['cell_count'] >= min_count])
-
-    if metric == 'mean':
-        well_scores_score = pd.DataFrame(merged_df.groupby(['prc']).mean()['pc_score'])
-        well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-        well_scores_rec = pd.DataFrame(merged_df.groupby(['prc']).mean()['recruitment'])
-        well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-    if metric == 'geomean':
-        well_scores_score = pd.DataFrame(merged_df.groupby(['prc'])['pc_score'].apply(gmean))
-        well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-        well_scores_rec = pd.DataFrame(merged_df.groupby(['prc'])['recruitment'].apply(gmean))
-        well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-    if metric == 'median':
-        well_scores_score = pd.DataFrame(merged_df.groupby(['prc']).median()['pc_score'])
-        well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-        well_scores_rec = pd.DataFrame(merged_df.groupby(['prc']).median()['recruitment'])
-        well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-    if metric == 'quntile':
-        well_scores_score = pd.DataFrame(merged_df.groupby(['prc']).quantile(0.75)['pc_score'])
-        well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-        well_scores_rec = pd.DataFrame(merged_df.groupby(['prc']).quantile(0.75)['recruitment'])
-        well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-    well = pd.DataFrame(pd.DataFrame(merged_df.select_dtypes(include=['object'])).groupby(['prc']).first())
-    well['mean_pc_score'] = well_scores_score['mean_pc_score']
-    well['mean_recruitment'] = well_scores_rec['mean_recruitment']
-    nc = well[well['cond'] == 'NC']
-    max_nc = nc['mean_recruitment'].max()
-    pc = well[well['cond'] == 'PC']
-    screen = well[well['cond'] == 'SCREEN']
-    screen = screen[screen['mean_recruitment'] <= max_nc]
-    if plot:
-        x_axis = 'mean_pc_score'
-        fig, ax = plt.subplots(1,3,figsize=(30,10))
-        sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='lightgray', log_scale=False)
-        sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='teal', log_scale=False)
-        sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[1], color='purple', log_scale=False)
-        sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='lightgray', log_scale=False)
-        sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='teal', log_scale=False)
-        sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='purple', log_scale=False)
-        ax[0].set_title('NC vs PC wells')
-        ax[1].set_title('Screen wells')
-        ax[2].set_title('NC vs PC vs Screen wells')
-        ax[0].spines['top'].set_visible(False)
-        ax[0].spines['right'].set_visible(False)
-        ax[1].spines['top'].set_visible(False)
-        ax[1].spines['right'].set_visible(False)
-        ax[2].spines['top'].set_visible(False)
-        ax[2].spines['right'].set_visible(False)
-        ax[0].set_xlim([0, 1])
-        ax[1].set_xlim([0, 1])
-        ax[2].set_xlim([0, 1])
-        loc = '/media/olafsson/umich/matt_graphs/resnet_score_well_av.pdf'
-        fig.savefig(loc, dpi = 600, format='pdf', bbox_inches='tight')
-        x_axis = 'mean_recruitment'
-        fig, ax = plt.subplots(1,3,figsize=(30,10))
-        sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='lightgray', log_scale=False)
-        sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='teal', log_scale=False)
-        sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[1], color='purple', log_scale=False)
-        sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='lightgray', log_scale=False)
-        sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='teal', log_scale=False)
-        sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='purple', log_scale=False)
-        ax[0].set_title('NC vs PC wells')
-        ax[1].set_title('Screen wells')
-        ax[2].set_title('NC vs PC vs Screen wells')
-        ax[0].spines['top'].set_visible(False)
-        ax[0].spines['right'].set_visible(False)
-        ax[1].spines['top'].set_visible(False)
-        ax[1].spines['right'].set_visible(False)
-        ax[2].spines['top'].set_visible(False)
-        ax[2].spines['right'].set_visible(False)
-        loc = '/media/olafsson/umich/matt_graphs/mean_recruitment_well_av.pdf'
-        fig.savefig(loc, dpi = 600, format='pdf', bbox_inches='tight')
-    plates = ['p1','p2','p3','p4']
-    screen = screen[screen['plate'].isin(plates)]
-    if score == 'pc_score':
-        dv = pd.DataFrame(screen['mean_pc_score'])
-        dv = dv.rename({'mean_pc_score': 'score'}, axis=1)
-    if score == 'recruitment':
-        dv = pd.DataFrame(screen['mean_recruitment'])
-        dv = dv.rename({'mean_recruitment': 'score'}, axis=1)
-    print('dependant variable well count:', len(well))
-    dv_loc = '/media/olafsson/Data2/methods_paper/data/dv.csv'
-    dv.to_csv(dv_loc)
-    calculate_accuracy(df=merged_df)
-    return merged_df, well
+    return independent_variables
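The chunked HDF5 layout that the new functions rely on (each chunk written under reads/chunk_<n>, then re-read by iterating the store's keys) can be exercised on its own. A throwaway sketch, assuming PyTables is installed and using illustrative data and a scratch file name:

import pandas as pd

path = 'demo_reads.h5'  # illustrative scratch file
chunks = [[('ACGT', 'g1', 'p1', 'c1', 's1')], [('TTAA', 'g2', 'p2', 'c2', 's1')]]

# Write each chunk the way save_chunk_to_hdf5 does
with pd.HDFStore(path, mode='w', complevel=5, complib='blosc') as store:
    for i, chunk in enumerate(chunks):
        df = pd.DataFrame(chunk, columns=['combined_read', 'grna', 'plate_row', 'column', 'sample'])
        store.put(f'reads/chunk_{i}', df, format='table', append=True)

# Read the chunks back the way map_barcodes does
with pd.HDFStore(path, mode='r') as store:
    for key in (k for k in store.keys() if k.startswith('/reads/chunk_')):
        print(key, len(store.get(key)))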