spacr 0.2.53__py3-none-any.whl → 0.2.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +282 -10
- spacr/deep_spacr.py +101 -41
- spacr/gui.py +1 -1
- spacr/gui_core.py +8 -10
- spacr/gui_elements.py +70 -0
- spacr/gui_utils.py +30 -10
- spacr/io.py +12 -4
- spacr/sequencing.py +443 -643
- spacr/settings.py +176 -44
- spacr/utils.py +13 -5
- {spacr-0.2.53.dist-info → spacr-0.2.56.dist-info}/METADATA +2 -1
- {spacr-0.2.53.dist-info → spacr-0.2.56.dist-info}/RECORD +16 -16
- {spacr-0.2.53.dist-info → spacr-0.2.56.dist-info}/LICENSE +0 -0
- {spacr-0.2.53.dist-info → spacr-0.2.56.dist-info}/WHEEL +0 -0
- {spacr-0.2.53.dist-info → spacr-0.2.56.dist-info}/entry_points.txt +0 -0
- {spacr-0.2.53.dist-info → spacr-0.2.56.dist-info}/top_level.txt +0 -0
spacr/sequencing.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
import os,
|
1
|
+
import os, gzip, re, time, math, subprocess, gzip
|
2
2
|
import pandas as pd
|
3
3
|
import numpy as np
|
4
|
-
from tqdm import tqdm
|
5
|
-
from Bio.Align import PairwiseAligner
|
6
4
|
import matplotlib.pyplot as plt
|
7
5
|
import seaborn as sns
|
8
6
|
from Bio import pairwise2
|
@@ -14,6 +12,8 @@ from scipy import stats
|
|
14
12
|
from difflib import SequenceMatcher
|
15
13
|
from collections import Counter
|
16
14
|
from IPython.display import display
|
15
|
+
from multiprocessing import Pool, cpu_count, Queue, Process
|
16
|
+
from rapidfuzz import process, fuzz
|
17
17
|
|
18
18
|
from sklearn.linear_model import LinearRegression, Lasso, Ridge
|
19
19
|
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
|
@@ -21,22 +21,169 @@ from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
|
|
21
21
|
from scipy.stats import shapiro
|
22
22
|
from patsy import dmatrices
|
23
23
|
|
24
|
-
import os
|
25
|
-
import gzip
|
26
24
|
from Bio import SeqIO
|
27
25
|
from Bio.Seq import Seq
|
28
26
|
from Bio.SeqRecord import SeqRecord
|
29
27
|
|
30
|
-
def
|
28
|
+
def parse_gz_files(folder_path):
    """
    Collect the paired-end .fastq.gz files in a folder, grouped by sample.

    File names are split on '_': the first field is used as the sample name and
    the second field as the read direction, so only names shaped like
    '<sample>_R1_<...>.fastq.gz' / '<sample>_R2_<...>.fastq.gz' are assigned.

    Args:
        folder_path (str): The path to the folder containing the .fastq.gz files.

    Returns:
        dict: A dictionary where the keys are the sample names and the values are
        dictionaries containing the file paths for the 'R1' and 'R2' read directions.
        A sample whose second field is neither 'R1' nor 'R2' is still listed,
        with an empty dictionary. Files without any '_' in their name are skipped.
    """
    samples_dict = {}

    # Sorted so that, when several files map to the same sample/direction, the
    # surviving entry does not depend on the arbitrary order of os.listdir().
    for gz_file in sorted(os.listdir(folder_path)):
        if not gz_file.endswith('.fastq.gz'):
            continue

        parts = gz_file.split('_')
        if len(parts) < 2:
            # No '_' in the name, so there is no read-direction field to inspect.
            # Indexing parts[1] here used to raise IndexError and abort the scan.
            continue

        sample_name, read_direction = parts[0], parts[1]
        sample_entry = samples_dict.setdefault(sample_name, {})

        if read_direction in ('R1', 'R2'):
            sample_entry[read_direction] = os.path.join(folder_path, gz_file)

    return samples_dict
|
57
|
+
|
58
|
+
def process_chunk_for_consensus(r1_chunk, r2_chunk):
    """
    Build one consensus read per R1/R2 pair.

    At every position the base call with the higher Phred quality is kept;
    on a tie the R1 base is used. Pairs are formed positionally with zip(),
    so both the record lists and the per-read positions stop at the shorter
    of the two inputs.

    Args:
        r1_chunk (list): List of SeqRecord objects representing the first read in each pair.
        r2_chunk (list): List of SeqRecord objects representing the second read in each pair.

    Returns:
        list: List of SeqRecord objects representing the consensus sequences.
        Each record carries the id of its R1 read and an empty description.
    """
    def _merge_pair(first, second):
        # Walk both reads in lockstep together with their quality scores.
        per_position = zip(
            first.seq,
            second.seq,
            first.letter_annotations["phred_quality"],
            second.letter_annotations["phred_quality"],
        )
        picked = [
            (base_a, qual_a) if qual_a >= qual_b else (base_b, qual_b)
            for base_a, base_b, qual_a, qual_b in per_position
        ]
        merged_bases = "".join(base for base, _ in picked)
        merged_quals = [qual for _, qual in picked]
        return SeqRecord(
            Seq(merged_bases),
            id=first.id,
            description="",
            letter_annotations={"phred_quality": merged_quals},
        )

    return [_merge_pair(first, second) for first, second in zip(r1_chunk, r2_chunk)]
|
92
|
+
|
93
|
+
def consensus_sequence(fastq_r1, fastq_r2, output_file, chunk_size=1000000, n_jobs=None):
    """
    Calculate the consensus sequence from two FASTQ files (R1 and R2) and write the result to an output file.

    All three files are opened with gzip. Reads are pulled in batches of
    n_jobs * chunk_size records, each batch is split into chunk_size slices that
    are handed to a multiprocessing pool, and the results are written in order.

    Parameters:
    - fastq_r1 (str): Path to the gzipped R1 FASTQ file.
    - fastq_r2 (str): Path to the gzipped R2 FASTQ file.
    - output_file (str): Path to the gzipped output file where the consensus sequence will be written.
    - chunk_size (int): Number of reads to process in each chunk. Default is 1000000.
    - n_jobs (int): Number of parallel processes to use. If None, it will use the number of
      available CPUs minus 2. Values below 1 are treated as 1.

    Returns:
    None
    """
    from itertools import islice
    from .utils import print_progress, count_reads_in_fastq

    # Resolve the worker count before anything depends on it, so the progress
    # estimate and the batching below agree; never drop below one worker
    # (cpu_count() - 2 is <= 0 on one- and two-core machines).
    if n_jobs is None:
        n_jobs = cpu_count() - 2
    n_jobs = max(1, n_jobs)
    reads_per_batch = n_jobs * chunk_size

    print(f'Calculating read count for {fastq_r1} ...')
    total_reads = count_reads_in_fastq(fastq_r1)
    # One progress step corresponds to one batch of reads_per_batch reads
    # (ceiling division, at least one step).
    chunks_nr = max(1, (total_reads + reads_per_batch - 1) // reads_per_batch)

    chunk_count = 0
    time_ls = []

    with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
        r1_iter = SeqIO.parse(r1_handle, "fastq")
        r2_iter = SeqIO.parse(r2_handle, "fastq")
        pool = Pool(processes=n_jobs)

        try:
            while True:
                start_time = time.time()

                r1_chunk = list(islice(r1_iter, reads_per_batch))
                r2_chunk = list(islice(r2_iter, reads_per_batch))

                # If either chunk is empty, we have reached the end of one or both files
                if not r1_chunk or not r2_chunk:
                    break

                if len(r1_chunk) != len(r2_chunk):
                    # Pairing is positional; surplus reads on the longer side are dropped by zip().
                    print(f'Warning: R1 and R2 batches differ in size ({len(r1_chunk)} vs {len(r2_chunk)}); unpaired reads are dropped.')

                chunk_count += 1

                # Split the records into chunks to be processed by each core
                r1_chunked = [r1_chunk[i:i + chunk_size] for i in range(0, len(r1_chunk), chunk_size)]
                r2_chunked = [r2_chunk[i:i + chunk_size] for i in range(0, len(r2_chunk), chunk_size)]

                # Process each chunk in parallel
                results = pool.starmap(process_chunk_for_consensus, zip(r1_chunked, r2_chunked))

                # Write the results to the output file
                for consensus_records in results:
                    SeqIO.write(consensus_records, output_handle, "fastq")

                time_ls.append(time.time() - start_time)
                print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
        finally:
            # Shut the workers down even when reading, processing or writing fails.
            pool.close()
            pool.join()
|
156
|
+
|
157
|
+
def consensus_sequence_v1(fastq_r1, fastq_r2, output_file, chunk_size=1000000):
|
158
|
+
"""
|
159
|
+
Generate a consensus sequence from paired-end FASTQ files.
|
160
|
+
|
161
|
+
Args:
|
162
|
+
fastq_r1 (str): Path to the first input FASTQ file.
|
163
|
+
fastq_r2 (str): Path to the second input FASTQ file.
|
164
|
+
output_file (str): Path to the output FASTQ file.
|
165
|
+
chunk_size (int, optional): Number of reads to process in each iteration. Defaults to 1000000.
|
166
|
+
|
167
|
+
Returns:
|
168
|
+
None
|
169
|
+
"""
|
170
|
+
from .utils import print_progress, count_reads_in_fastq
|
171
|
+
|
172
|
+
print(f'Calculating read count for {fastq_r1} ...')
|
173
|
+
total_reads = count_reads_in_fastq(fastq_r1)
|
174
|
+
chunks_nr = int(total_reads/chunk_size) + 1
|
31
175
|
|
32
176
|
total_reads = 0
|
33
177
|
chunk_count = 0
|
34
|
-
|
178
|
+
time_ls = []
|
179
|
+
|
35
180
|
with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
|
36
181
|
r1_iter = SeqIO.parse(r1_handle, "fastq")
|
37
182
|
r2_iter = SeqIO.parse(r2_handle, "fastq")
|
38
183
|
|
39
184
|
while True:
|
185
|
+
start_time = time.time()
|
186
|
+
|
40
187
|
r1_chunk = [rec for rec in (next(r1_iter, None) for _ in range(chunk_size)) if rec is not None]
|
41
188
|
r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(chunk_size)) if rec is not None]
|
42
189
|
|
@@ -66,672 +213,338 @@ def consensus_sequence(fastq_r1, fastq_r2, output_file, chunk_size=1000000):
|
|
66
213
|
# Write the consensus sequence to the output file
|
67
214
|
SeqIO.write(consensus_record, output_handle, "fastq")
|
68
215
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
Parses the .fastq.gz files in the specified folder path and returns a dictionary
|
74
|
-
containing the sample names and their corresponding file paths.
|
216
|
+
end_time = time.time()
|
217
|
+
chunk_time = end_time - start_time
|
218
|
+
time_ls.append(chunk_time)
|
219
|
+
print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=1, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
|
75
220
|
|
76
|
-
|
77
|
-
|
221
|
+
def save_to_hdf(queue, output_file, complevel=9, compression='zlib'):
    """
    Drain a queue of DataFrames into an HDF file until a stop item arrives.

    Each queue item is unpacked as (chunk_count, df). A DataFrame is appended
    under the key 'chunk_<chunk_count>'; an item whose df is None ends the loop.

    Parameters:
    - queue: Queue object
        The queue containing the data to be saved.
    - output_file: str
        The path to the output HDF file (opened in append mode).
    - complevel: int, optional
        The compression level to use (default is 9).
    - compression: str, optional
        The compression algorithm to use (default is 'zlib').

    Returns:
        None
    """
    with pd.HDFStore(output_file, mode='a', complevel=complevel, complib=compression) as hdf_store:
        chunk_index, frame = queue.get()
        while frame is not None:
            print(f'Writing chunks to H5PY ...')
            hdf_store.append(f'chunk_{chunk_index}', frame, format='table', data_columns=True)
            # Block until the producer hands over the next item.
            chunk_index, frame = queue.get()
|
91
245
|
|
92
|
-
|
93
|
-
|
246
|
+
def get_top_two_matches(seq, barcode_dict):
    """
    Finds the top two closest matches for a given sequence in a barcode dictionary.

    Args:
        seq (str): The sequence to find the closest matches for.
        barcode_dict (dict): A dictionary containing barcodes as keys and their corresponding values.

    Returns:
        list of tuples: Always exactly two (value, score) tuples, best match first.
        The value is looked up in barcode_dict and the score is the fuzz.ratio
        result divided by 100. Missing matches are filled with (None, 0.0).
    """
    no_match = (None, 0.0)

    if not barcode_dict:
        # Nothing to compare against; skip the fuzzy search entirely.
        return [no_match, no_match]

    results = process.extract(seq, barcode_dict.keys(), scorer=fuzz.ratio, limit=2)
    # Each result starts with (choice, score); any further fields are ignored.
    matches = [(barcode_dict[choice], score / 100.0) for choice, score, *_ in results]

    # Pad to two entries. A single append was not enough when no result came back,
    # which left callers indexing a missing second element.
    while len(matches) < 2:
        matches.append(no_match)
    return matches
|
263
|
+
|
264
|
+
def process_chunk_for_mapping(records, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements):
    """
    Process a chunk of records for barcode mapping, including highest and second-highest scores.

    For every record and every entry of barcode_coordinates, the slice
    record.seq[start:end] is extracted (and reverse-complemented when requested).
    If the key has a barcode dictionary, an exact hit is reported with scores
    1.0 / 0.0; otherwise the two best fuzzy matches supply the name and scores.
    Keys without a barcode dictionary report the extracted sequence itself
    with scores 0.0 / 0.0.

    Args:
        records (list): A list of records to process; each must expose a sliceable `seq`.
        barcode_mapping (dict): A dictionary mapping barcodes to their corresponding keys.
            Only its keys are used, to name the output columns.
        barcode_dicts (dict): A dictionary of barcode dictionaries (sequence -> name).
        barcode_coordinates (dict): A dictionary mapping barcode keys to their start and end coordinates.
        reverse_complements (dict): A dictionary indicating whether to reverse complement the extracted sequences for each barcode key.

    Returns:
        pandas.DataFrame: One row per record with the columns '<key>', 'sequence',
        '<key>_seq', '<key>_score_1' and '<key>_score_2'.
    """
    data = {key: [] for key in barcode_mapping}
    seq_data = {f"{key}_seq": [] for key in barcode_mapping}
    score_data_1 = {f"{key}_score_1": [] for key in barcode_mapping}
    score_data_2 = {f"{key}_score_2": [] for key in barcode_mapping}
    sequences = []

    for record in records:
        sequences.append(str(record.seq))
        for key, (start, end) in barcode_coordinates.items():
            extracted_seq = str(record.seq[start:end])

            if reverse_complements[key]:
                extracted_seq = str(Seq(extracted_seq).reverse_complement())

            seq_data[f"{key}_seq"].append(extracted_seq)

            if key not in barcode_dicts:
                # No reference barcodes for this key: report the raw slice.
                name, best_score, second_score = extracted_seq, 0.0, 0.0
            elif extracted_seq in barcode_dicts[key]:
                # Membership test rather than truthiness of the mapped name, so an
                # exact hit whose name is falsy (e.g. 0) is not sent to fuzzy matching.
                name, best_score, second_score = barcode_dicts[key][extracted_seq], 1.0, 0.0
            else:
                matches = get_top_two_matches(extracted_seq, barcode_dicts[key])
                name, best_score, second_score = matches[0][0], matches[0][1], matches[1][1]

            data[key].append(name)
            score_data_1[f"{key}_score_1"].append(best_score)
            score_data_2[f"{key}_score_2"].append(second_score)

    df = pd.DataFrame(data)
    df['sequence'] = sequences
    return pd.concat(
        [df, pd.DataFrame(seq_data), pd.DataFrame(score_data_1), pd.DataFrame(score_data_2)],
        axis=1,
    )
|
113
318
|
|
114
|
-
def
|
319
|
+
def extract_barcodes_from_fastq(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
|
115
320
|
"""
|
116
|
-
|
321
|
+
Extracts barcodes from a FASTQ file and maps them based on a barcode mapping.
|
117
322
|
|
118
323
|
Args:
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
324
|
+
fastq (str): Path to the input FASTQ file.
|
325
|
+
output_file (str): Path to the output file where the mapped barcodes will be saved.
|
326
|
+
chunk_size (int): Number of records to process in each chunk.
|
327
|
+
barcode_mapping (dict): Dictionary containing barcode mapping information.
|
328
|
+
The keys are the names of the barcode sets, and the values are tuples
|
329
|
+
containing the path to the CSV file, barcode coordinates, and reverse complement flag.
|
330
|
+
n_jobs (int, optional): Number of parallel processes to use for mapping. Defaults to None.
|
331
|
+
compression (str, optional): Compression algorithm to use for saving the output file. Defaults to 'zlib'.
|
332
|
+
complevel (int, optional): Compression level to use for saving the output file. Defaults to 9.
|
125
333
|
|
126
334
|
Returns:
|
127
335
|
None
|
128
336
|
"""
|
337
|
+
from .utils import print_progress, count_reads_in_fastq
|
338
|
+
|
339
|
+
# Ensure the file is deleted before starting
|
340
|
+
if os.path.exists(output_file):
|
341
|
+
os.remove(output_file)
|
342
|
+
|
343
|
+
# Validate and process barcode mapping
|
344
|
+
barcode_dicts = {}
|
345
|
+
barcode_coordinates = {}
|
346
|
+
reverse_complements = {}
|
347
|
+
|
348
|
+
for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
|
349
|
+
df = pd.read_csv(csv_path)
|
350
|
+
if 'name' not in df.columns or 'sequence' not in df.columns:
|
351
|
+
print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
|
352
|
+
return
|
353
|
+
barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
|
354
|
+
barcode_coordinates[key] = coordinates
|
355
|
+
reverse_complements[key] = reverse_comp
|
356
|
+
|
357
|
+
if n_jobs is None:
|
358
|
+
n_jobs = cpu_count() - 3 # Reserve one core for saving
|
129
359
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
Parameters:
|
135
|
-
- output_file_path (str): The path to the output HDF5 file.
|
136
|
-
- data_chunk (list): The data chunk to be saved.
|
137
|
-
- chunk_counter (int): The counter for the current chunk.
|
138
|
-
|
139
|
-
Returns:
|
140
|
-
None
|
141
|
-
"""
|
142
|
-
df = pd.DataFrame(data_chunk, columns=['combined_read', 'grna', 'plate_row', 'column', 'sample'])
|
143
|
-
with pd.HDFStore(output_file_path, mode='a', complevel=5, complib='blosc') as store:
|
144
|
-
store.put(
|
145
|
-
f'reads/chunk_{chunk_counter}',
|
146
|
-
df,
|
147
|
-
format='table',
|
148
|
-
append=True,
|
149
|
-
min_itemsize={'combined_read': 300, 'grna': 50, 'plate_row': 20, 'column': 20, 'sample': 50}
|
150
|
-
)
|
151
|
-
|
152
|
-
def reverse_complement(seq):
|
153
|
-
"""
|
154
|
-
Returns the reverse complement of a DNA sequence.
|
155
|
-
|
156
|
-
Args:
|
157
|
-
seq (str): The DNA sequence to be reversed and complemented.
|
158
|
-
|
159
|
-
Returns:
|
160
|
-
str: The reverse complement of the input DNA sequence.
|
161
|
-
|
162
|
-
Example:
|
163
|
-
>>> reverse_complement('ATCG')
|
164
|
-
'CGAT'
|
165
|
-
"""
|
166
|
-
complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
|
167
|
-
return ''.join(complement[base] for base in reversed(seq))
|
360
|
+
analyzed_chunks = 0
|
361
|
+
chunk_count = 0
|
362
|
+
time_ls = []
|
168
363
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
Args:
|
174
|
-
file_path (str): The path to the input file.
|
175
|
-
num_reads (int, optional): The number of reads to process. Defaults to 100.
|
176
|
-
|
177
|
-
Returns:
|
178
|
-
float: The average read length.
|
179
|
-
|
180
|
-
Raises:
|
181
|
-
FileNotFoundError: If the input file does not exist.
|
182
|
-
"""
|
183
|
-
if not file_path:
|
184
|
-
return 0
|
185
|
-
total_length = 0
|
186
|
-
count = 0
|
187
|
-
with gzip.open(file_path, 'rt') as f:
|
188
|
-
for _ in range(num_reads):
|
189
|
-
try:
|
190
|
-
f.readline() # Skip index line
|
191
|
-
read = f.readline().strip()
|
192
|
-
total_length += len(read)
|
193
|
-
f.readline() # Skip plus line
|
194
|
-
f.readline() # Skip quality line
|
195
|
-
count += 1
|
196
|
-
except StopIteration:
|
197
|
-
break
|
198
|
-
return total_length / count if count > 0 else 0
|
364
|
+
print(f'Calculating read count for {fastq} ...')
|
365
|
+
total_reads = count_reads_in_fastq(fastq)
|
366
|
+
chunks_nr = int(total_reads/chunk_size)
|
199
367
|
|
200
|
-
|
201
|
-
"""
|
202
|
-
Parses the .fastq.gz files in the specified folder path and returns a dictionary
|
203
|
-
containing the sample names and their corresponding file paths.
|
204
|
-
|
205
|
-
Args:
|
206
|
-
folder_path (str): The path to the folder containing the .fastq.gz files.
|
207
|
-
|
208
|
-
Returns:
|
209
|
-
dict: A dictionary where the keys are the sample names and the values are
|
210
|
-
dictionaries containing the file paths for the 'R1' and 'R2' read directions.
|
211
|
-
"""
|
212
|
-
files = os.listdir(folder_path)
|
213
|
-
gz_files = [f for f in files if f.endswith('.fastq.gz')]
|
214
|
-
|
215
|
-
samples_dict = {}
|
216
|
-
for gz_file in gz_files:
|
217
|
-
parts = gz_file.split('_')
|
218
|
-
sample_name = parts[0]
|
219
|
-
read_direction = parts[1]
|
220
|
-
|
221
|
-
if sample_name not in samples_dict:
|
222
|
-
samples_dict[sample_name] = {}
|
223
|
-
|
224
|
-
if read_direction == "R1":
|
225
|
-
samples_dict[sample_name]['R1'] = os.path.join(folder_path, gz_file)
|
226
|
-
elif read_direction == "R2":
|
227
|
-
samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
|
228
|
-
|
229
|
-
return samples_dict
|
368
|
+
print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
|
230
369
|
|
231
|
-
|
232
|
-
|
233
|
-
Find the best alignment between two DNA reads.
|
234
|
-
|
235
|
-
Parameters:
|
236
|
-
- r1_read_rc (str): The reverse complement of the first DNA read.
|
237
|
-
- r2_read (str): The second DNA read.
|
238
|
-
|
239
|
-
Returns:
|
240
|
-
- best_alignment (Alignment): The best alignment between the two DNA reads.
|
241
|
-
"""
|
242
|
-
aligner = PairwiseAligner()
|
243
|
-
alignments = aligner.align(r1_read_rc, r2_read)
|
244
|
-
best_alignment = alignments[0]
|
245
|
-
return best_alignment
|
246
|
-
|
247
|
-
def combine_reads(samples_dict, src, chunk_size, barecode_length_1, barecode_length_2, upstream, downstream):
|
248
|
-
"""
|
249
|
-
Combine reads from paired-end sequencing files and save the combined reads to a new file.
|
250
|
-
|
251
|
-
Args:
|
252
|
-
samples_dict (dict): A dictionary mapping sample names to file paths of paired-end sequencing files.
|
253
|
-
src (str): The source directory where the combined reads will be saved.
|
254
|
-
chunk_size (int): The number of reads to be processed and saved as a chunk.
|
255
|
-
barecode_length (int): The length of the barcode sequence.
|
256
|
-
upstream (str): The upstream sequence used for read splitting.
|
257
|
-
downstream (str): The downstream sequence used for read splitting.
|
258
|
-
|
259
|
-
Returns:
|
260
|
-
None
|
261
|
-
"""
|
262
|
-
dst = os.path.join(src, 'combined_reads')
|
263
|
-
if not os.path.exists(dst):
|
264
|
-
os.makedirs(dst)
|
265
|
-
|
266
|
-
for sample, paths in samples_dict.items():
|
267
|
-
print(f'Processing: {sample} with the files: {paths}')
|
268
|
-
r1_path = paths.get('R1')
|
269
|
-
r2_path = paths.get('R2')
|
270
|
-
|
271
|
-
output_file_path = os.path.join(dst, f"{sample}_combined.h5")
|
272
|
-
qc_file_path = os.path.join(dst, f"{sample}_qc.csv")
|
273
|
-
|
274
|
-
r1_file = gzip.open(r1_path, 'rt') if r1_path else None
|
275
|
-
r2_file = gzip.open(r2_path, 'rt') if r2_path else None
|
276
|
-
|
277
|
-
chunk_counter = 0
|
278
|
-
data_chunk = []
|
279
|
-
|
280
|
-
success = 0
|
281
|
-
fail = 0
|
370
|
+
# Create a queue to hold dataframes to be saved
|
371
|
+
save_queue = Queue()
|
282
372
|
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
avg_read_length = (avg_read_length_r1 + avg_read_length_r2) / 2 if avg_read_length_r1 and avg_read_length_r2 else 0
|
287
|
-
|
288
|
-
print(f'Initial avg_read_length: {avg_read_length}')
|
289
|
-
|
290
|
-
# Estimate the initial number of reads based on the file size
|
291
|
-
r1_size_est = os.path.getsize(r1_path) // (avg_read_length * 4) if r1_path else 0
|
292
|
-
r2_size_est = os.path.getsize(r2_path) // (avg_read_length * 4) if r2_path else 0
|
293
|
-
max_size = max(r1_size_est, r2_size_est) * 10
|
294
|
-
test10 =0
|
295
|
-
with tqdm(total=max_size, desc=f"Processing {sample}") as pbar:
|
296
|
-
total_length_processed = 0
|
297
|
-
read_count = 0
|
298
|
-
|
299
|
-
while True:
|
300
|
-
try:
|
301
|
-
r1_index = next(r1_file).strip() if r1_file else None
|
302
|
-
r1_read = next(r1_file).strip() if r1_file else None
|
303
|
-
r1_plus = next(r1_file).strip() if r1_file else None
|
304
|
-
r1_quality = next(r1_file).strip() if r1_file else None
|
305
|
-
|
306
|
-
r2_index = next(r2_file).strip() if r2_file else None
|
307
|
-
r2_read = next(r2_file).strip() if r2_file else None
|
308
|
-
r2_plus = next(r2_file).strip() if r2_file else None
|
309
|
-
r2_quality = next(r2_file).strip() if r2_file else None
|
310
|
-
|
311
|
-
pbar.update(1)
|
312
|
-
|
313
|
-
if r1_index and r2_index and r1_index.split(' ')[0] != r2_index.split(' ')[0]:
|
314
|
-
fail += 1
|
315
|
-
print(f"Index mismatch: {r1_index} != {r2_index}")
|
316
|
-
continue
|
317
|
-
|
318
|
-
r1_read_rc = reverse_complement(r1_read) if r1_read else ''
|
319
|
-
r1_quality_rc = r1_quality[::-1] if r1_quality else ''
|
320
|
-
|
321
|
-
r1_rc_split_index = r1_read_rc.find(upstream)
|
322
|
-
r2_split_index = r2_read.find(upstream)
|
323
|
-
|
324
|
-
if r1_rc_split_index == -1 or r2_split_index == -1:
|
325
|
-
fail += 1
|
326
|
-
continue
|
327
|
-
else:
|
328
|
-
success += 1
|
329
|
-
|
330
|
-
read1_fragment = r1_read_rc[:r1_rc_split_index]
|
331
|
-
read2_fragment = r2_read[r2_split_index:]
|
332
|
-
read_combo = read1_fragment + read2_fragment
|
333
|
-
|
334
|
-
combo_split_index_1 = read_combo.find(upstream)
|
335
|
-
combo_split_index_2 = read_combo.find(downstream)
|
336
|
-
|
337
|
-
barcode_1 = read_combo[combo_split_index_1 - barecode_length_1:combo_split_index_1]
|
338
|
-
grna = read_combo[combo_split_index_1 + len(upstream):combo_split_index_2]
|
339
|
-
barcode_2 = read_combo[combo_split_index_2 + len(downstream):combo_split_index_2 + len(downstream) + barecode_length_2]
|
340
|
-
barcode_2 = reverse_complement(barcode_2)
|
341
|
-
data_chunk.append((read_combo, grna, barcode_1, barcode_2, sample))
|
342
|
-
|
343
|
-
if settings['test']:
|
344
|
-
if read_count % 1000 == 0:
|
345
|
-
print(f"Read count: {read_count}")
|
346
|
-
print(f"Read 1: {r1_read_rc}")
|
347
|
-
print(f"Read 2: {r2_read}")
|
348
|
-
print(f"Read combo: {read_combo}")
|
349
|
-
print(f"Barcode 1: {barcode_1}")
|
350
|
-
print(f"gRNA: {grna}")
|
351
|
-
print(f"Barcode 2: {barcode_2}")
|
352
|
-
print()
|
353
|
-
test10 += 1
|
354
|
-
if test10 == 10:
|
355
|
-
break
|
356
|
-
|
357
|
-
read_count += 1
|
358
|
-
total_length_processed += len(r1_read) + len(r2_read)
|
359
|
-
|
360
|
-
# Periodically update the average read length and total
|
361
|
-
if read_count % 10000 == 0:
|
362
|
-
avg_read_length = total_length_processed / (read_count * 2)
|
363
|
-
max_size = (os.path.getsize(r1_path) + os.path.getsize(r2_path)) // (avg_read_length * 4)
|
364
|
-
pbar.total = max_size
|
365
|
-
|
366
|
-
if len(data_chunk) >= chunk_size:
|
367
|
-
save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
|
368
|
-
chunk_counter += 1
|
369
|
-
data_chunk = []
|
370
|
-
|
371
|
-
except StopIteration:
|
372
|
-
break
|
373
|
-
|
374
|
-
# Save any remaining data_chunk
|
375
|
-
if data_chunk:
|
376
|
-
save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
|
377
|
-
|
378
|
-
# Save QC metrics
|
379
|
-
qc = {'success': success, 'failed': fail}
|
380
|
-
qc_df = pd.DataFrame([qc])
|
381
|
-
qc_df.to_csv(qc_file_path, index=False)
|
382
|
-
|
383
|
-
from .settings import get_analyze_reads_default_settings
|
384
|
-
try:
|
385
|
-
settings = get_analyze_reads_default_settings(settings)
|
386
|
-
samples_dict = parse_gz_files(settings['src'])
|
387
|
-
combine_reads(samples_dict, settings['src'], settings['chunk_size'], settings['barecode_length_1'], settings['barecode_length_2'], settings['upstream'], settings['downstream'])
|
388
|
-
except Exception as e:
|
389
|
-
print(e)
|
390
|
-
Error = traceback.format_exc()
|
391
|
-
print(Error)
|
392
|
-
|
393
|
-
def map_barcodes(h5_file_path, settings={}):
|
394
|
-
"""
|
395
|
-
Maps barcodes and performs quality control on sequencing data.
|
373
|
+
# Start a separate process for saving the data
|
374
|
+
save_process = Process(target=save_to_hdf, args=(save_queue, output_file, complevel, compression))
|
375
|
+
save_process.start()
|
396
376
|
|
397
|
-
|
398
|
-
|
399
|
-
|
377
|
+
with gzip.open(fastq, "rt") as handle:
|
378
|
+
fastq_iter = SeqIO.parse(handle, "fastq")
|
379
|
+
pool = Pool(processes=n_jobs)
|
400
380
|
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
def get_read_qc(df, settings):
|
405
|
-
"""
|
406
|
-
Calculate quality control metrics for sequencing reads.
|
381
|
+
while True:
|
382
|
+
# Read n_jobs * chunk_size records into memory
|
383
|
+
records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
|
407
384
|
|
408
|
-
|
409
|
-
|
385
|
+
if not records:
|
386
|
+
break
|
410
387
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
df_cleaned = df.dropna()
|
417
|
-
|
418
|
-
qc_dict = {}
|
419
|
-
qc_dict['reads'] = len(df)
|
420
|
-
qc_dict['cleaned_reads'] = len(df_cleaned)
|
421
|
-
qc_dict['NaN_grna'] = df['grna_metadata'].isna().sum()
|
422
|
-
qc_dict['NaN_plate_row'] = df['plate_row_metadata'].isna().sum()
|
423
|
-
qc_dict['NaN_column'] = df['column_metadata'].isna().sum()
|
424
|
-
qc_dict['NaN_plate'] = df['plate_metadata'].isna().sum()
|
425
|
-
qc_dict['unique_grna'] = Counter(df['grna_metadata'].dropna().tolist())
|
426
|
-
qc_dict['unique_plate_row'] = Counter(df['plate_row_metadata'].dropna().tolist())
|
427
|
-
qc_dict['unique_column'] = Counter(df['column_metadata'].dropna().tolist())
|
428
|
-
qc_dict['unique_plate'] = Counter(df['plate_metadata'].dropna().tolist())
|
429
|
-
|
430
|
-
# Calculate control error rates using cleaned DataFrame
|
431
|
-
total_pc_non_nan = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc'])].shape[0]
|
432
|
-
total_nc_non_nan = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc'])].shape[0]
|
433
|
-
|
434
|
-
pc_count_pc = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] == settings['pc'])].shape[0]
|
435
|
-
nc_count_nc = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] == settings['nc'])].shape[0]
|
388
|
+
analyzed_chunks_1 = analyzed_chunks
|
389
|
+
start_time = time.time()
|
390
|
+
chunk_count += 1
|
391
|
+
analyzed_chunks = int(chunk_count*n_jobs)
|
392
|
+
analyzed_chunks_ls = list(range(analyzed_chunks_1, analyzed_chunks))
|
436
393
|
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
pc_in_nc_loc_count = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] == settings['pc'])].shape[0]
|
441
|
-
nc_in_pc_loc_count = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] == settings['nc'])].shape[0]
|
442
|
-
|
443
|
-
# Collect QC metrics into a dictionary
|
444
|
-
# PC
|
445
|
-
qc_dict['pc_total_count'] = total_pc_non_nan
|
446
|
-
qc_dict['pc_count_pc'] = pc_count_pc
|
447
|
-
qc_dict['nc_count_pc'] = pc_in_nc_loc_count
|
448
|
-
qc_dict['pc_error_count'] = pc_error_count
|
449
|
-
# NC
|
450
|
-
qc_dict['nc_total_count'] = total_nc_non_nan
|
451
|
-
qc_dict['nc_count_nc'] = nc_count_nc
|
452
|
-
qc_dict['pc_count_nc'] = nc_in_pc_loc_count
|
453
|
-
qc_dict['nc_error_count'] = nc_error_count
|
454
|
-
|
455
|
-
return df_cleaned, qc_dict
|
456
|
-
|
457
|
-
def get_per_row_qc(df, settings):
|
458
|
-
"""
|
459
|
-
Calculate quality control metrics for each unique row in the control columns.
|
460
|
-
|
461
|
-
Parameters:
|
462
|
-
- df: DataFrame containing the sequencing reads.
|
463
|
-
- settings: Dictionary containing the settings for control values.
|
464
|
-
|
465
|
-
Returns:
|
466
|
-
- dict: Dictionary containing the quality control metrics for each unique row.
|
467
|
-
"""
|
468
|
-
qc_dict_per_row = {}
|
469
|
-
unique_rows = df['plate_row_metadata'].dropna().unique().tolist()
|
470
|
-
unique_rows = list(set(unique_rows)) # Remove duplicates
|
471
|
-
|
472
|
-
for row in unique_rows:
|
473
|
-
df_row = df[(df['plate_row_metadata'] == row)]
|
474
|
-
_, qc_dict_row = get_read_qc(df_row, settings)
|
475
|
-
qc_dict_per_row[row] = qc_dict_row
|
476
|
-
|
477
|
-
return qc_dict_per_row
|
478
|
-
|
479
|
-
def mapping_dicts(df, settings):
|
480
|
-
"""
|
481
|
-
Maps the values in the DataFrame columns to corresponding metadata using dictionaries.
|
482
|
-
|
483
|
-
Args:
|
484
|
-
df (pandas.DataFrame): The DataFrame containing the data to be mapped.
|
485
|
-
settings (dict): A dictionary containing the settings for mapping.
|
486
|
-
|
487
|
-
Returns:
|
488
|
-
pandas.DataFrame: The DataFrame with the mapped metadata columns added.
|
489
|
-
"""
|
490
|
-
grna_df = pd.read_csv(settings['grna'])
|
491
|
-
barcode_df = pd.read_csv(settings['barcodes'])
|
492
|
-
|
493
|
-
grna_dict = {row['sequence']: row['name'] for _, row in grna_df.iterrows()}
|
494
|
-
plate_row_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('p')}
|
495
|
-
column_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('c')}
|
496
|
-
plate_dict = settings['plate_dict']
|
497
|
-
|
498
|
-
df['grna_metadata'] = df['grna'].map(grna_dict)
|
499
|
-
df['grna_length'] = df['grna'].apply(len)
|
500
|
-
df['plate_row_metadata'] = df['plate_row'].map(plate_row_dict)
|
501
|
-
df['column_metadata'] = df['column'].map(column_dict)
|
502
|
-
df['plate_metadata'] = df['sample'].map(plate_dict)
|
503
|
-
|
504
|
-
return df
|
505
|
-
|
506
|
-
def filter_combinations(df, settings):
|
507
|
-
"""
|
508
|
-
Takes the combination counts Data Frame, filters the rows based on specific conditions,
|
509
|
-
and removes rows with a count lower than the highest value of max_count_c1 and max_count_c2.
|
394
|
+
# Split the records into chunks to be processed by each core
|
395
|
+
chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
|
510
396
|
|
511
|
-
|
512
|
-
|
513
|
-
pc (str, optional): The positive control sequence. Defaults to 'TGGT1_220950_1'.
|
514
|
-
nc (str, optional): The negative control sequence. Defaults to 'TGGT1_233460_4'.
|
397
|
+
# Process each chunk in parallel
|
398
|
+
dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
|
515
399
|
|
516
|
-
|
517
|
-
pd.
|
518
|
-
|
400
|
+
# Queue the dataframes to be saved
|
401
|
+
df = pd.concat(dfs, ignore_index=True)
|
402
|
+
save_queue.put((chunk_count, df))
|
519
403
|
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
nc_loc = settings['nc_loc']
|
404
|
+
end_time = time.time()
|
405
|
+
chunk_time = end_time - start_time
|
406
|
+
time_ls.append(chunk_time)
|
524
407
|
|
525
|
-
|
526
|
-
|
408
|
+
for az_chunks in analyzed_chunks_ls:
|
409
|
+
print_progress(files_processed=az_chunks, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Mapping Barcodes")
|
527
410
|
|
528
|
-
|
529
|
-
max_count_c2 = filtered_c2['count'].max()
|
411
|
+
del records, chunked_records, dfs, df
|
530
412
|
|
531
|
-
|
532
|
-
|
413
|
+
pool.close()
|
414
|
+
pool.join()
|
533
415
|
|
534
|
-
|
535
|
-
|
416
|
+
# Send a sentinel value to indicate the saving process should stop
|
417
|
+
save_queue.put((None, None))
|
418
|
+
save_process.join()
|
536
419
|
|
537
|
-
|
538
|
-
|
420
|
+
def extract_barcodes_from_fastq_v1(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
|
421
|
+
"""
|
422
|
+
Extracts barcodes from a FASTQ file and saves the results to an output file.
|
423
|
+
|
424
|
+
Parameters:
|
425
|
+
- fastq (str): Path to the input FASTQ file.
|
426
|
+
- output_file (str): Path to the output file where the barcode data will be saved.
|
427
|
+
- chunk_size (int): Number of records to process in each chunk.
|
428
|
+
- barcode_mapping (dict): Mapping of barcode keys to CSV file paths, barcode coordinates, and reverse complement flags.
|
429
|
+
- n_jobs (int, optional): Number of parallel processes to use for barcode mapping. Defaults to None.
|
430
|
+
- compression (str, optional): Compression algorithm to use for the output file. Defaults to 'zlib'.
|
431
|
+
- complevel (int, optional): Compression level to use for the output file. Defaults to 9.
|
432
|
+
"""
|
539
433
|
|
540
|
-
|
541
|
-
filtered_df = df[df['count'] >= highest_max_count]
|
434
|
+
from .utils import print_progress, count_reads_in_fastq
|
542
435
|
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
# Calculate read fraction for each row
|
547
|
-
filtered_df['read_fraction'] = filtered_df['count'] / filtered_df['total_reads']
|
436
|
+
# Ensure the file is deleted before starting
|
437
|
+
if os.path.exists(output_file):
|
438
|
+
os.remove(output_file)
|
548
439
|
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
unique_column_file_path = os.path.join(fldr, f'{file_name}_unique_column.csv')
|
569
|
-
unique_plate_file_path = os.path.join(fldr, f'{file_name}_unique_plate.csv')
|
570
|
-
new_h5_file_path = os.path.join(fldr, f'{file_name}_cleaned.h5')
|
571
|
-
combination_counts_file_path = os.path.join(fldr, f'{file_name}_combination_counts.csv')
|
572
|
-
combination_counts_file_path_cleaned = os.path.join(fldr, f'{file_name}_combination_counts_cleaned.csv')
|
573
|
-
|
574
|
-
#qc_file_path = os.path.splitext(h5_file_path)[0] + '_qc_step_2.csv'
|
575
|
-
#unique_grna_file_path = os.path.splitext(h5_file_path)[0] + '_unique_grna.csv'
|
576
|
-
#unique_plate_row_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate_row.csv'
|
577
|
-
#unique_column_file_path = os.path.splitext(h5_file_path)[0] + '_unique_column.csv'
|
578
|
-
#unique_plate_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate.csv'
|
579
|
-
#new_h5_file_path = os.path.splitext(h5_file_path)[0] + '_cleaned.h5'
|
580
|
-
#combination_counts_file_path = os.path.splitext(h5_file_path)[0] + '_combination_counts.csv'
|
581
|
-
#combination_counts_file_path_cleaned = os.path.splitext(h5_file_path)[0] + '_combination_counts_cleaned.csv'
|
440
|
+
# Validate and process barcode mapping
|
441
|
+
barcode_dicts = {}
|
442
|
+
barcode_coordinates = {}
|
443
|
+
reverse_complements = {}
|
444
|
+
|
445
|
+
for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
|
446
|
+
df = pd.read_csv(csv_path)
|
447
|
+
if 'name' not in df.columns or 'sequence' not in df.columns:
|
448
|
+
print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
|
449
|
+
return
|
450
|
+
barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
|
451
|
+
barcode_coordinates[key] = coordinates
|
452
|
+
reverse_complements[key] = reverse_comp
|
453
|
+
|
454
|
+
if n_jobs is None:
|
455
|
+
n_jobs = cpu_count() - 2
|
456
|
+
|
457
|
+
chunk_count = 0
|
458
|
+
time_ls = []
|
582
459
|
|
583
|
-
|
584
|
-
|
460
|
+
print(f'Calculating read count for {fastq} ...')
|
461
|
+
total_reads = count_reads_in_fastq(fastq)
|
462
|
+
chunks_nr = (int(total_reads/chunk_size) + 1)
|
585
463
|
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
464
|
+
print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
|
465
|
+
with gzip.open(fastq, "rt") as handle:
|
466
|
+
fastq_iter = SeqIO.parse(handle, "fastq")
|
467
|
+
pool = Pool(processes=n_jobs)
|
468
|
+
|
469
|
+
while True:
|
470
|
+
# Read n_jobs * chunk_size records into memory
|
471
|
+
records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
|
472
|
+
|
473
|
+
if not records:
|
474
|
+
break
|
475
|
+
|
476
|
+
start_time = time.time()
|
477
|
+
chunk_count += 1
|
478
|
+
|
479
|
+
# Split the records into chunks to be processed by each core
|
480
|
+
chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
|
481
|
+
|
482
|
+
# Process each chunk in parallel
|
483
|
+
dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
|
484
|
+
|
485
|
+
# Join the results
|
486
|
+
df = pd.concat(dfs, ignore_index=True)
|
487
|
+
|
488
|
+
# Save to HDF5 with compression
|
489
|
+
print(f'Writing chunk {chunk_count} to H5PY ...')
|
490
|
+
df.to_hdf(output_file, key=f'chunk_{chunk_count}', mode='a', format='table', complevel=complevel, complib=compression)
|
491
|
+
|
492
|
+
end_time = time.time()
|
493
|
+
chunk_time = end_time - start_time
|
494
|
+
time_ls.append(chunk_time)
|
495
|
+
print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=None, operation_type=" Mapping Barcodes")
|
496
|
+
|
497
|
+
del records, chunked_records, dfs, df
|
498
|
+
|
499
|
+
pool.close()
|
500
|
+
pool.join()
|
501
|
+
|
502
|
+
def generate_barecode_mapping(settings={}):
|
503
|
+
from .settings import set_default_generate_barecode_mapping
|
504
|
+
|
505
|
+
settings = set_default_generate_barecode_mapping(settings)
|
506
|
+
|
507
|
+
samples_dict = parse_gz_files(settings['src'])
|
508
|
+
for key in samples_dict:
|
509
|
+
if samples_dict[key]['R1'] and samples_dict[key]['R2']:
|
510
|
+
R1 = samples_dict[key]['R1']
|
511
|
+
R2 = samples_dict[key]['R2']
|
512
|
+
consensus_dir = os.path.join(os.path.dirname(R1), 'consensus')
|
513
|
+
os.makedirs(consensus_dir, exist_ok=True)
|
514
|
+
consensus = os.path.join(consensus_dir, f"{key}_consensus.fastq.gz")
|
515
|
+
h5 = os.path.join(consensus_dir, f"{key}_barecodes.h5")
|
516
|
+
|
517
|
+
if not os.path.exists(consensus):
|
518
|
+
consensus_sequence(R1, R2, consensus, settings['chunk_size'])
|
519
|
+
else:
|
520
|
+
print(f"Consensus file {consensus} already exists. Mapping barecodes.")
|
521
|
+
|
522
|
+
extract_barcodes_from_fastq(fastq=consensus,
|
523
|
+
output_file=h5,
|
524
|
+
chunk_size=settings['chunk_size'],
|
525
|
+
barcode_mapping=settings['barcode_mapping'],
|
526
|
+
n_jobs=settings['n_jobs'],
|
527
|
+
compression=settings['compression'],
|
528
|
+
complevel=settings['complevel'])
|
529
|
+
|
530
|
+
|
531
|
+
|
532
|
+
|
533
|
+
|
534
|
+
|
535
|
+
|
536
|
+
|
537
|
+
|
538
|
+
|
539
|
+
|
540
|
+
|
541
|
+
|
542
|
+
|
543
|
+
|
544
|
+
|
630
545
|
|
631
|
-
if settings['test'] and settings['verbose']:
|
632
|
-
os.makedirs(os.path.join(os.path.splitext(h5_file_path)[0],'test'), exist_ok=True)
|
633
|
-
df.to_csv(os.path.join(os.path.splitext(h5_file_path)[0],'test','chunk_1_df.csv'), index=False)
|
634
|
-
df_cleaned.to_csv(os.path.join(os.path.splitext(h5_file_path)[0],'test','chunk_1_df_cleaned.csv'), index=False)
|
635
546
|
|
636
|
-
# Accumulate QC metrics for all rows
|
637
|
-
for metric in qc_dict:
|
638
|
-
if isinstance(overall_qc[metric], Counter):
|
639
|
-
overall_qc[metric].update(qc_dict[metric])
|
640
|
-
else:
|
641
|
-
overall_qc[metric] += qc_dict[metric]
|
642
547
|
|
643
|
-
# Update per_row_qc dictionary
|
644
|
-
chunk_per_row_qc = get_per_row_qc(df, settings)
|
645
|
-
for row in chunk_per_row_qc:
|
646
|
-
if row not in per_row_qc:
|
647
|
-
per_row_qc[row] = chunk_per_row_qc[row]
|
648
|
-
else:
|
649
|
-
for metric in chunk_per_row_qc[row]:
|
650
|
-
if isinstance(per_row_qc[row][metric], Counter):
|
651
|
-
per_row_qc[row][metric].update(chunk_per_row_qc[row][metric])
|
652
|
-
else:
|
653
|
-
per_row_qc[row][metric] += chunk_per_row_qc[row][metric]
|
654
|
-
|
655
|
-
# Ensure the DataFrame columns are in the desired order
|
656
|
-
df_cleaned = df_cleaned[['grna', 'plate_row', 'column', 'sample', 'grna_metadata', 'plate_row_metadata', 'column_metadata', 'plate_metadata']]
|
657
|
-
|
658
|
-
# Save cleaned data to the new HDF5 store
|
659
|
-
store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
|
660
|
-
|
661
|
-
del df_cleaned, df
|
662
|
-
gc.collect()
|
663
|
-
|
664
|
-
# Calculate overall fractions after accumulating all metrics
|
665
|
-
overall_qc['pc_fraction_pc'] = overall_qc['pc_count_pc'] / overall_qc['pc_total_count'] if overall_qc['pc_total_count'] else 0
|
666
|
-
overall_qc['nc_fraction_nc'] = overall_qc['nc_count_nc'] / overall_qc['nc_total_count'] if overall_qc['nc_total_count'] else 0
|
667
|
-
overall_qc['pc_fraction_nc'] = overall_qc['pc_count_nc'] / overall_qc['nc_total_count'] if overall_qc['nc_total_count'] else 0
|
668
|
-
overall_qc['nc_fraction_pc'] = overall_qc['nc_count_pc'] / overall_qc['pc_total_count'] if overall_qc['pc_total_count'] else 0
|
669
|
-
|
670
|
-
for row in per_row_qc:
|
671
|
-
if row != 'all_rows':
|
672
|
-
per_row_qc[row]['pc_fraction_pc'] = per_row_qc[row]['pc_count_pc'] / per_row_qc[row]['pc_total_count'] if per_row_qc[row]['pc_total_count'] else 0
|
673
|
-
per_row_qc[row]['nc_fraction_nc'] = per_row_qc[row]['nc_count_nc'] / per_row_qc[row]['nc_total_count'] if per_row_qc[row]['nc_total_count'] else 0
|
674
|
-
per_row_qc[row]['pc_fraction_nc'] = per_row_qc[row]['pc_count_nc'] / per_row_qc[row]['nc_total_count'] if per_row_qc[row]['nc_total_count'] else 0
|
675
|
-
per_row_qc[row]['nc_fraction_pc'] = per_row_qc[row]['nc_count_pc'] / per_row_qc[row]['pc_total_count'] if per_row_qc[row]['pc_total_count'] else 0
|
676
|
-
|
677
|
-
# Add overall_qc to per_row_qc with the key 'all_rows'
|
678
|
-
per_row_qc['all_rows'] = overall_qc
|
679
|
-
|
680
|
-
# Convert the Counter objects to DataFrames and save them to CSV files
|
681
|
-
unique_grna_df = pd.DataFrame(overall_qc['unique_grna'].items(), columns=['key', 'value'])
|
682
|
-
unique_plate_row_df = pd.DataFrame(overall_qc['unique_plate_row'].items(), columns=['key', 'value'])
|
683
|
-
unique_column_df = pd.DataFrame(overall_qc['unique_column'].items(), columns=['key', 'value'])
|
684
|
-
unique_plate_df = pd.DataFrame(overall_qc['unique_plate'].items(), columns=['key', 'value'])
|
685
|
-
|
686
|
-
unique_grna_df.to_csv(unique_grna_file_path, index=False)
|
687
|
-
unique_plate_row_df.to_csv(unique_plate_row_file_path, index=False)
|
688
|
-
unique_column_df.to_csv(unique_column_file_path, index=False)
|
689
|
-
unique_plate_df.to_csv(unique_plate_file_path, index=False)
|
690
|
-
|
691
|
-
# Remove the unique counts from overall_qc for the main QC CSV file
|
692
|
-
del overall_qc['unique_grna']
|
693
|
-
del overall_qc['unique_plate_row']
|
694
|
-
del overall_qc['unique_column']
|
695
|
-
del overall_qc['unique_plate']
|
696
|
-
|
697
|
-
# Combine all remaining QC metrics into a single DataFrame and save it to CSV
|
698
|
-
qc_df = pd.DataFrame([overall_qc])
|
699
|
-
qc_df.to_csv(qc_file_path, index=False)
|
700
|
-
|
701
|
-
# Convert per_row_qc to a DataFrame and save it to CSV
|
702
|
-
per_row_qc_df = pd.DataFrame.from_dict(per_row_qc, orient='index')
|
703
|
-
per_row_qc_df = per_row_qc_df.sort_values(by='reads', ascending=False)
|
704
|
-
per_row_qc_df = per_row_qc_df.drop(['unique_grna', 'unique_plate_row', 'unique_column', 'unique_plate'], axis=1, errors='ignore')
|
705
|
-
per_row_qc_df = per_row_qc_df.dropna(subset=['reads'])
|
706
|
-
per_row_qc_df.to_csv(os.path.splitext(h5_file_path)[0] + '_per_row_qc.csv', index=True)
|
707
|
-
|
708
|
-
if settings['verbose']:
|
709
|
-
display(per_row_qc_df)
|
710
|
-
|
711
|
-
# Save the combination counts to a CSV file
|
712
|
-
try:
|
713
|
-
combination_counts_df = pd.DataFrame(combination_counts.items(), columns=['combination', 'count'])
|
714
|
-
combination_counts_df[['plate_row', 'column', 'grna']] = pd.DataFrame(combination_counts_df['combination'].tolist(), index=combination_counts_df.index)
|
715
|
-
combination_counts_df = combination_counts_df.drop('combination', axis=1)
|
716
|
-
combination_counts_df.to_csv(combination_counts_file_path, index=False)
|
717
|
-
|
718
|
-
grna_plate_heatmap(combination_counts_file_path, specific_grna=None)
|
719
|
-
grna_plate_heatmap(combination_counts_file_path, specific_grna=settings['pc'])
|
720
|
-
grna_plate_heatmap(combination_counts_file_path, specific_grna=settings['nc'])
|
721
|
-
|
722
|
-
combination_counts_df_cleaned = filter_combinations(combination_counts_df, settings)
|
723
|
-
combination_counts_df_cleaned.to_csv(combination_counts_file_path_cleaned, index=False)
|
724
|
-
|
725
|
-
grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=None)
|
726
|
-
grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=settings['pc'])
|
727
|
-
grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=settings['nc'])
|
728
|
-
except Exception as e:
|
729
|
-
print(e)
|
730
|
-
|
731
|
-
# Close the HDF5 store
|
732
|
-
store_cleaned.close()
|
733
|
-
gc.collect()
|
734
|
-
return
|
735
548
|
|
736
549
|
def grna_plate_heatmap(path, specific_grna=None, min_max='all', cmap='viridis', min_count=0, save=True):
|
737
550
|
"""
|
@@ -820,19 +633,6 @@ def grna_plate_heatmap(path, specific_grna=None, min_max='all', cmap='viridis',
|
|
820
633
|
|
821
634
|
return fig
|
822
635
|
|
823
|
-
def map_barcodes_folder(settings={}):
|
824
|
-
from .settings import get_map_barcodes_default_settings
|
825
|
-
settings = get_map_barcodes_default_settings(settings)
|
826
|
-
|
827
|
-
print(settings)
|
828
|
-
src = settings['src']
|
829
|
-
for file in os.listdir(src):
|
830
|
-
if file.endswith('.h5'):
|
831
|
-
print(file)
|
832
|
-
path = os.path.join(src, file)
|
833
|
-
map_barcodes(path, settings)
|
834
|
-
gc.collect()
|
835
|
-
|
836
636
|
def reverse_complement(dna_sequence):
|
837
637
|
complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N':'N'}
|
838
638
|
reverse_seq = dna_sequence[::-1]
|