spacr 0.2.67__py3-none-any.whl → 0.2.81__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +1 -1
- spacr/gui.py +35 -20
- spacr/gui_core.py +152 -105
- spacr/gui_elements.py +190 -18
- spacr/gui_utils.py +28 -17
- spacr/sequencing.py +234 -422
- spacr/settings.py +13 -9
- spacr/utils.py +4 -1
- {spacr-0.2.67.dist-info → spacr-0.2.81.dist-info}/METADATA +4 -1
- {spacr-0.2.67.dist-info → spacr-0.2.81.dist-info}/RECORD +14 -14
- {spacr-0.2.67.dist-info → spacr-0.2.81.dist-info}/LICENSE +0 -0
- {spacr-0.2.67.dist-info → spacr-0.2.81.dist-info}/WHEEL +0 -0
- {spacr-0.2.67.dist-info → spacr-0.2.81.dist-info}/entry_points.txt +0 -0
- {spacr-0.2.67.dist-info → spacr-0.2.81.dist-info}/top_level.txt +0 -0
spacr/sequencing.py
CHANGED
@@ -25,6 +25,18 @@ from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 
+from collections import defaultdict
+
+import gzip, re
+from Bio.Seq import Seq
+import pandas as pd
+import numpy as np
+import gzip, re
+from Bio.Seq import Seq
+import pandas as pd
+import numpy as np
+from multiprocessing import Pool, cpu_count
+
 def parse_gz_files(folder_path):
     """
     Parses the .fastq.gz files in the specified folder path and returns a dictionary
@@ -55,474 +67,274 @@ def parse_gz_files(folder_path):
             samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
     return samples_dict
 
-
-
-
-
-
-
-        r2_chunk (list): List of SeqRecord objects representing the second read in each pair.
-
-    Returns:
-        list: List of SeqRecord objects representing the consensus sequences.
-
-    """
-    consensus_records = []
+# Function to map sequences to names (same as your original)
+def map_sequences_to_names(csv_file, sequences, rc):
+    def rev_comp(dna_sequence):
+        complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
+        reverse_seq = dna_sequence[::-1]
+        return ''.join([complement_dict[base] for base in reverse_seq])
 
-
-
-
-    for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
-        if qual1 >= qual2:
-            best_sequence.append(base1)
-            best_quality.append(qual1)
-        else:
-            best_sequence.append(base2)
-            best_quality.append(qual2)
-
-    consensus_seq = Seq("".join(best_sequence))
-
-    # Create a new SeqRecord for the consensus sequence
-    consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
-
-    # Add the consensus record to the list
-    consensus_records.append(consensus_record)
+    df = pd.read_csv(csv_file)
+    if rc:
+        df['sequence'] = df['sequence'].apply(rev_comp)
 
-
-
-def consensus_sequence(fastq_r1, fastq_r2, output_file, chunk_size=1000000, n_jobs=None):
-    """
-    Calculate the consensus sequence from two FASTQ files (R1 and R2) and write the result to an output file.
-
-    Parameters:
-    - fastq_r1 (str): Path to the R1 FASTQ file.
-    - fastq_r2 (str): Path to the R2 FASTQ file.
-    - output_file (str): Path to the output file where the consensus sequence will be written.
-    - chunk_size (int): Number of reads to process in each chunk. Default is 1000000.
-    - n_jobs (int): Number of parallel processes to use. If None, it will use the number of available CPUs minus 2.
-
-    Returns:
-    None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    print(f'Calculating read count for {fastq_r1} ...')
-    total_reads = count_reads_in_fastq(fastq_r1)
-    chunks_nr = (int(total_reads / chunk_size) + 1) // (n_jobs if n_jobs else cpu_count())
-
-    total_reads_processed = 0
-    chunk_count = 0
-    time_ls = []
-
-    if n_jobs is None:
-        n_jobs = cpu_count() - 2
+    csv_sequences = pd.Series(df['name'].values, index=df['sequence']).to_dict()
+    return [csv_sequences.get(sequence, pd.NA) for sequence in sequences]
 
-
-
-
-
+# Functions to save data (same as your original)
+def save_df_to_hdf5(df, hdf5_file, key='df', comp_type='zlib', comp_level=5):
+    try:
+        with pd.HDFStore(hdf5_file, 'a', complib=comp_type, complevel=comp_level) as store:
+            if key in store:
+                existing_df = store[key]
+                df = pd.concat([existing_df, df], ignore_index=True)
+            store.put(key, df, format='table')
+    except Exception as e:
+        print(f"Error while saving DataFrame to HDF5: {e}")
+
+def save_unique_combinations_to_csv(unique_combinations, csv_file):
+    try:
+        try:
+            existing_df = pd.read_csv(csv_file)
+        except FileNotFoundError:
+            existing_df = pd.DataFrame()
 
-
-
+        if not existing_df.empty:
+            unique_combinations = pd.concat([existing_df, unique_combinations])
+            unique_combinations = unique_combinations.groupby(
+                ['row_name', 'column_name', 'grna_name'], as_index=False).sum()
 
-
-
-
-        # If either chunk is empty, we have reached the end of one or both files
-        if not r1_chunk or not r2_chunk:
-            break
+        unique_combinations.to_csv(csv_file, index=False)
+    except Exception as e:
+        print(f"Error while saving unique combinations to CSV: {e}")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    print(f'Calculating read count for {fastq_r1} ...')
-    total_reads = count_reads_in_fastq(fastq_r1)
-    chunks_nr = int(total_reads/chunk_size) + 1
+def save_qc_df_to_csv(qc_df, qc_csv_file):
+    try:
+        try:
+            existing_qc_df = pd.read_csv(qc_csv_file)
+        except FileNotFoundError:
+            existing_qc_df = pd.DataFrame()
+
+        if not existing_qc_df.empty:
+            qc_df = qc_df.add(existing_qc_df, fill_value=0)
+
+        qc_df.to_csv(qc_csv_file, index=False)
+    except Exception as e:
+        print(f"Error while saving QC DataFrame to CSV: {e}")
+
+def extract_sequence_and_quality(sequence, quality, start, end):
+    return sequence[start:end], quality[start:end]
+
+def create_consensus(seq1, qual1, seq2, qual2):
+    consensus_seq = []
+    for i in range(len(seq1)):
+        bases = [(seq1[i], qual1[i]), (seq2[i], qual2[i])]
+        consensus_seq.append(get_consensus_base(bases))
+    return ''.join(consensus_seq)
+
+def get_consensus_base(bases):
+    # Prefer non-'N' bases, if 'N' exists, pick the other one.
+    if bases[0][0] == 'N':
+        return bases[1][0]
+    elif bases[1][0] == 'N':
+        return bases[0][0]
+    else:
+        # Return the base with the highest quality score
+        return bases[0][0] if bases[0][1] >= bases[1][1] else bases[1][0]
 
-
-
-    time_ls = []
+def reverse_complement(seq):
+    return str(Seq(seq).reverse_complement())
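Note on the consensus helpers added above: create_consensus walks R1 and R2 base by base, and get_consensus_base keeps whichever base is not 'N', otherwise the base whose quality character is higher (FASTQ Phred+33 characters sort in the same order as the scores they encode). A minimal sketch, assuming spacr 0.2.81 is installed so the new helpers are importable; the reads and quality strings below are invented for illustration:

    from spacr.sequencing import create_consensus, get_consensus_base

    r1_seq, r1_qual = "ACGTN", "IIII!"   # '!' is the lowest Phred+33 quality
    r2_seq, r2_qual = "ACCTG", "II#II"   # '#' is lower than 'I'

    print(get_consensus_base([("G", "I"), ("C", "#")]))        # 'G' (higher quality wins)
    print(create_consensus(r1_seq, r1_qual, r2_seq, r2_qual))  # 'ACGTG' (the 'N' is replaced)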
 
-
-
-
+# Core logic for processing a chunk (same as your original)
+def process_chunk(chunk_data):
+
+    def find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end):
+        i = 0
+        fail_count = 0
+        failed_cases = []
+        regex = r"^(?P<column>.{8})TGCTG.*TAAAC(?P<grna>.{20,21})AACTT.*AGAAG(?P<row>.{8}).*"
+        consensus_sequences, columns, grnas, rows = [], [], [], []
 
-
-
-
-
-            r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(chunk_size)) if rec is not None]
-
-            # If either chunk is empty, we have reached the end of one or both files
-            if not r1_chunk or not r2_chunk:
-                break
-
-            chunk_count += 1
-            total_reads += len(r1_chunk)
-
-            for r1_record, r2_record in zip(r1_chunk, r2_chunk):
-                best_sequence = []
-                best_quality = []
-                for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
-                    if qual1 >= qual2:
-                        best_sequence.append(base1)
-                        best_quality.append(qual1)
-                    else:
-                        best_sequence.append(base2)
-                        best_quality.append(qual2)
-
-                consensus_seq = Seq("".join(best_sequence))
-
-                # Create a new SeqRecord for the consensus sequence
-                consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
-
-                # Write the consensus sequence to the output file
-                SeqIO.write(consensus_record, output_handle, "fastq")
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=1, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
-
-def save_to_hdf(queue, output_file, complevel=9, compression='zlib'):
-    """
-    Save data from a queue to an HDF file.
-
-    Parameters:
-    - queue: Queue object containing chunks of data to be saved
-    - output_file: Path to the output HDF file
-    - complevel: Compression level (default: 9)
-    - compression: Compression algorithm (default: 'zlib')
-
-    Returns:
-    None
-    """
-    with pd.HDFStore(output_file, mode='a', complevel=complevel, complib=compression) as store:
-        while True:
-            chunk_count, df = queue.get()
-            if df is None:
-                break
-            print(f'Writing chunks to H5PY ...')
-            store.append(f'chunk_{chunk_count}', df, format='table', data_columns=True)
-
-def get_top_two_matches(seq, barcode_dict):
-    """
-    Finds the top two closest matches for a given sequence in a barcode dictionary.
-
-    Args:
-        seq (str): The sequence to find the closest matches for.
-        barcode_dict (dict): A dictionary containing barcodes as keys and their corresponding values.
-
-    Returns:
-        list of tuples: A list containing up to two tuples, each with a barcode match and its score.
-    """
-    results = process.extract(seq, barcode_dict.keys(), scorer=fuzz.ratio, limit=2)
-    matches = [(barcode_dict[result[0]], result[1] / 100.0) for result in results]
-    # Pad the matches list if there are fewer than two results
-    if len(matches) < 2:
-        matches.append((None, 0.0))
-    return matches
-
-def process_chunk_for_mapping(records, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements):
-    """
-    Process a chunk of records for barcode mapping, including highest and second-highest scores.
-
-    Args:
-        records (list): A list of records to process.
-        barcode_mapping (dict): A dictionary mapping barcodes to their corresponding keys.
-        barcode_dicts (dict): A dictionary of barcode dictionaries.
-        barcode_coordinates (dict): A dictionary mapping barcode keys to their start and end coordinates.
-        reverse_complements (dict): A dictionary indicating whether to reverse complement the extracted sequences for each barcode key.
+        for r1_lines, r2_lines in zip(r1_chunk, r2_chunk):
+            r1_header, r1_sequence, r1_plus, r1_quality = r1_lines.split('\n')
+            r2_header, r2_sequence, r2_plus, r2_quality = r2_lines.split('\n')
+            r2_sequence = reverse_complement(r2_sequence)
 
-
-
-
-
-
-
-
-
+            r1_pos = r1_sequence.find(target_sequence)
+            r2_pos = r2_sequence.find(target_sequence)
+
+            if r1_pos != -1 and r2_pos != -1:
+                r1_start = max(r1_pos + offset_start, 0)
+                r1_end = min(r1_start + expected_end, len(r1_sequence))
+                r2_start = max(r2_pos + offset_start, 0)
+                r2_end = min(r2_start + expected_end, len(r2_sequence))
+
+                r1_seq, r1_qual = extract_sequence_and_quality(r1_sequence, r1_quality, r1_start, r1_end)
+                r2_seq, r2_qual = extract_sequence_and_quality(r2_sequence, r2_quality, r2_start, r2_end)
+
+                if len(r1_seq) < expected_end:
+                    r1_seq += 'N' * (expected_end - len(r1_seq))
+                    r1_qual += '!' * (expected_end - len(r1_qual))
+
+                if len(r2_seq) < expected_end:
+                    r2_seq += 'N' * (expected_end - len(r2_seq))
+                    r2_qual += '!' * (expected_end - len(r2_qual))
+
+                consensus_seq = create_consensus(r1_seq, r1_qual, r2_seq, r2_qual)
+                if len(consensus_seq) >= expected_end:
+                    match = re.match(regex, consensus_seq)
+                    if match:
+                        consensus_sequences.append(consensus_seq)
+                        column_sequence = match.group('column')
+                        grna_sequence = match.group('grna')
+                        row_sequence = match.group('row')
+                        columns.append(column_sequence)
+                        grnas.append(grna_sequence)
+                        rows.append(row_sequence)
+
+        return consensus_sequences, columns, grnas, rows, fail_count
+
+    r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv = chunk_data
+    consensus_sequences, columns, grnas, rows, _ = find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    df = pd.concat([df, df_seq, df_score_1, df_score_2], axis=1)
-    return df
-
-def extract_barcodes_from_fastq(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
-    """
-    Extracts barcodes from a FASTQ file and maps them based on a barcode mapping.
+    column_names = map_sequences_to_names(column_csv, columns, rc=False)
+    grna_names = map_sequences_to_names(grna_csv, grnas, rc=True)
+    row_names = map_sequences_to_names(row_csv, rows, rc=True)
+
+    df = pd.DataFrame({
+        'read': consensus_sequences,
+        'column_sequence': columns,
+        'column_name': column_names,
+        'row_sequence': rows,
+        'row_name': row_names,
+        'grna_sequence': grnas,
+        'grna_name': grna_names
+    })
+
+    qc_df = df.isna().sum().to_frame().T
+    qc_df.columns = df.columns
+    qc_df.index = ["NaN_Counts"]
+    qc_df['total_reads'] = len(df)
+
+    unique_combinations = df.groupby(['row_name', 'column_name', 'grna_name']).size().reset_index(name='count')
+    return df, unique_combinations, qc_df
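Note on process_chunk above: after building a consensus read it applies the regex ^(?P<column>.{8})TGCTG.*TAAAC(?P<grna>.{20,21})AACTT.*AGAAG(?P<row>.{8}).* to pull out an 8 nt column barcode, a 20-21 nt gRNA barcode and an 8 nt row barcode between fixed anchor sequences, then looks the gRNA and row barcodes up against reverse-complemented reference CSVs (rc=True). A minimal sketch of the regex alone; the read below is made up for illustration:

    import re

    regex = r"^(?P<column>.{8})TGCTG.*TAAAC(?P<grna>.{20,21})AACTT.*AGAAG(?P<row>.{8}).*"
    read = "AAAAAAAA" + "TGCTG" + "CCC" + "TAAAC" + "G" * 20 + "AACTT" + "CCC" + "AGAAG" + "TTTTTTTT" + "CC"

    m = re.match(regex, read)
    if m:
        print(m.group('column'), m.group('grna'), m.group('row'))
        # AAAAAAAA GGGGGGGGGGGGGGGGGGGG TTTTTTTT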
+
+# Function to save data from the queue
+def saver_process(save_queue, hdf5_file, unique_combinations_csv, qc_csv_file, comp_type, comp_level):
+    while True:
+        item = save_queue.get()
+        if item == "STOP":
+            break
+        df, unique_combinations, qc_df = item
+        save_df_to_hdf5(df, hdf5_file, key='df', comp_type=comp_type, comp_level=comp_level)
+        save_unique_combinations_to_csv(unique_combinations, unique_combinations_csv)
+        save_qc_df_to_csv(qc_df, qc_csv_file)
 
-
-
-        output_file (str): Path to the output file where the mapped barcodes will be saved.
-        chunk_size (int): Number of records to process in each chunk.
-        barcode_mapping (dict): Dictionary containing barcode mapping information.
-                                The keys are the names of the barcode sets, and the values are tuples
-                                containing the path to the CSV file, barcode coordinates, and reverse complement flag.
-        n_jobs (int, optional): Number of parallel processes to use for mapping. Defaults to None.
-        compression (str, optional): Compression algorithm to use for saving the output file. Defaults to 'zlib'.
-        complevel (int, optional): Compression level to use for saving the output file. Defaults to 9.
+# Updated chunked_processing with improved multiprocessing logic
+def chunked_processing(r1_file, r2_file, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None):
 
-
-    None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    # Ensure the file is deleted before starting
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    # Validate and process barcode mapping
-    barcode_dicts = {}
-    barcode_coordinates = {}
-    reverse_complements = {}
-
-    for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
-        df = pd.read_csv(csv_path)
-        if 'name' not in df.columns or 'sequence' not in df.columns:
-            print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
-            return
-        barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
-        barcode_coordinates[key] = coordinates
-        reverse_complements[key] = reverse_comp
+    from .utils import count_reads_in_fastq, print_progress
 
+    # Use cpu_count minus 3 cores if n_jobs isn't specified
     if n_jobs is None:
-        n_jobs = cpu_count() - 3
-
+        n_jobs = cpu_count() - 3
+
     analyzed_chunks = 0
     chunk_count = 0
     time_ls = []
-
-    print(f'Calculating read count for {fastq} ...')
-    total_reads = count_reads_in_fastq(fastq)
-    chunks_nr = int(total_reads/chunk_size)
-
-
-
-    # Create a queue to hold dataframes to be saved
+
+    print(f'Calculating read count for {r1_file}...')
+    total_reads = count_reads_in_fastq(r1_file)
+    chunks_nr = int(total_reads / chunk_size)
+    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {r1_file}...')
+
+    # Queue for saving
     save_queue = Queue()
 
-    # Start
-    save_process = Process(target=
+    # Start the saving process
+    save_process = Process(target=saver_process, args=(save_queue, hdf5_file, unique_combinations_csv, qc_csv_file, comp_type, comp_level))
     save_process.start()
 
-
-        fastq_iter = SeqIO.parse(handle, "fastq")
-        pool = Pool(processes=n_jobs)
+    pool = Pool(n_jobs)
 
+    with gzip.open(r1_file, 'rt') as r1, gzip.open(r2_file, 'rt') as r2:
+        fastq_iter = zip(r1, r2)
        while True:
-            # Read n_jobs * chunk_size records into memory
-            records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
-
-            if not records:
-                break
-
-            analyzed_chunks_1 = analyzed_chunks
            start_time = time.time()
-
-
-
-
-
-
-
-
-
-
-
-            df = pd.concat(dfs, ignore_index=True)
-            save_queue.put((chunk_count, df))
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-
-            for az_chunks in analyzed_chunks_ls:
-                print_progress(files_processed=az_chunks, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Mapping Barcodes")
-
-            del records, chunked_records, dfs, df
-
-    pool.close()
-    pool.join()
-
-    # Send a sentinel value to indicate the saving process should stop
-    save_queue.put((None, None))
-    save_process.join()
-
-def extract_barcodes_from_fastq_v1(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
-    """
-    Extracts barcodes from a FASTQ file and saves the results to an output file.
-
-    Parameters:
-    - fastq (str): Path to the input FASTQ file.
-    - output_file (str): Path to the output file where the barcode data will be saved.
-    - chunk_size (int): Number of records to process in each chunk.
-    - barcode_mapping (dict): Mapping of barcode keys to CSV file paths, barcode coordinates, and reverse complement flags.
-    - n_jobs (int, optional): Number of parallel processes to use for barcode mapping. Defaults to None.
-    - compression (str, optional): Compression algorithm to use for the output file. Defaults to 'zlib'.
-    - complevel (int, optional): Compression level to use for the output file. Defaults to 9.
-    """
-
-    from .utils import print_progress, count_reads_in_fastq
-
-    # Ensure the file is deleted before starting
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    # Validate and process barcode mapping
-    barcode_dicts = {}
-    barcode_coordinates = {}
-    reverse_complements = {}
-
-    for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
-        df = pd.read_csv(csv_path)
-        if 'name' not in df.columns or 'sequence' not in df.columns:
-            print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
-            return
-        barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
-        barcode_coordinates[key] = coordinates
-        reverse_complements[key] = reverse_comp
-
-    if n_jobs is None:
-        n_jobs = cpu_count() - 2
-
-    chunk_count = 0
-    time_ls = []
-
-    print(f'Calculating read count for {fastq} ...')
-    total_reads = count_reads_in_fastq(fastq)
-    chunks_nr = (int(total_reads/chunk_size) + 1)
-
-    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
-    with gzip.open(fastq, "rt") as handle:
-        fastq_iter = SeqIO.parse(handle, "fastq")
-        pool = Pool(processes=n_jobs)
-
-        while True:
-            # Read n_jobs * chunk_size records into memory
-            records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
+            r1_chunk = []
+            r2_chunk = []
+
+            for _ in range(chunk_size):
+                try:
+                    r1_lines = [r1.readline().strip() for _ in range(4)]
+                    r2_lines = [r2.readline().strip() for _ in range(4)]
+                    r1_chunk.append('\n'.join(r1_lines))
+                    r2_chunk.append('\n'.join(r2_lines))
+                except StopIteration:
+                    break
 
-            if not records:
+            if not r1_chunk:
                break
 
-            start_time = time.time()
            chunk_count += 1
+            chunk_data = (r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv)
 
-            #
-
-
-            # Process each chunk in parallel
-            dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
+            # Process chunks in parallel
+            result = pool.apply_async(process_chunk, (chunk_data,))
+            df, unique_combinations, qc_df = result.get()
 
-            #
-
-
-            # Save to HDF5 with compression
-            print(f'Writing chunk {chunk_count} to H5PY ...')
-            df.to_hdf(output_file, key=f'chunk_{chunk_count}', mode='a', format='table', complevel=complevel, complib=compression)
+            # Queue the results for saving
+            save_queue.put((df, unique_combinations, qc_df))
 
            end_time = time.time()
            chunk_time = end_time - start_time
            time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=
+            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type="Mapping Barcodes")
 
-
+    # Cleanup the pool
+    pool.close()
+    pool.join()
 
-
-
+    # Send stop signal to saver process
+    save_queue.put("STOP")
+    save_process.join()
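Note on the structure of chunked_processing above: all file writes go through a single saver_process running in its own Process, fed result tuples over a multiprocessing Queue and terminated with a "STOP" sentinel, which keeps the HDF5 and CSV outputs owned by one writer. A stripped-down sketch of the same pattern (not package code):

    from multiprocessing import Process, Queue

    def writer(queue):
        while True:
            item = queue.get()
            if item == "STOP":
                break
            print("would save:", item)  # stand-in for the save_* helpers above

    if __name__ == "__main__":
        q = Queue()
        w = Process(target=writer, args=(q,))
        w.start()
        for chunk_id in range(3):
            q.put(f"chunk {chunk_id}")
        q.put("STOP")
        w.join()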
 
 def generate_barecode_mapping(settings={}):
+
     from .settings import set_default_generate_barecode_mapping
 
     settings = set_default_generate_barecode_mapping(settings)
 
     samples_dict = parse_gz_files(settings['src'])
+
     for key in samples_dict:
-        if samples_dict[key]['R1'] and samples_dict[key]['R2']:
-            R1 = samples_dict[key]['R1']
-            R2 = samples_dict[key]['R2']
-            consensus_dir = os.path.join(os.path.dirname(R1), 'consensus')
-            os.makedirs(consensus_dir, exist_ok=True)
-            consensus = os.path.join(consensus_dir, f"{key}_consensus.fastq.gz")
-            h5 = os.path.join(consensus_dir, f"{key}_barecodes.h5")
-
-            if not os.path.exists(consensus):
-                consensus_sequence(R1, R2, consensus, settings['chunk_size'])
-            else:
-                print(f"Consensus file {consensus} already exists. Mapping barecodes.")
-
-            extract_barcodes_from_fastq(fastq=consensus,
-                                        output_file=h5,
-                                        chunk_size=settings['chunk_size'],
-                                        barcode_mapping=settings['barcode_mapping'],
-                                        n_jobs=settings['n_jobs'],
-                                        compression=settings['compression'],
-                                        complevel=settings['complevel'])
 
+        if samples_dict[key]['R1'] and samples_dict[key]['R2']:
+
+            dst = os.path.join(settings['src'], key)
+            hdf5_file = os.path.join(dst, 'annotated_reads.h5')
+            unique_combinations_csv = os.path.join(dst, 'unique_combinations.csv')
+            qc_csv_file = os.path.join(dst, 'qc.csv')
+            os.makedirs(dst, exist_ok=True)
+
+            print(f'Analyzing reads from sample {key}')
+
+            chunked_processing(r1_file=samples_dict[key]['R1'],
+                               r2_file=samples_dict[key]['R2'],
+                               target_sequence=settings['target_sequence'],
+                               offset_start=settings['offset_start'],
+                               expected_end=settings['expected_end'],
+                               column_csv=settings['column_csv'],
+                               grna_csv=settings['grna_csv'],
+                               row_csv=settings['row_csv'],
+                               save_h5 = settings['save_h5'],
+                               comp_type = settings['comp_type'],
+                               comp_level=settings['comp_level'],
+                               hdf5_file=hdf5_file,
+                               unique_combinations_csv=unique_combinations_csv,
+                               qc_csv_file=qc_csv_file,
+                               chunk_size=settings['chunk_size'],
+                               n_jobs=settings['n_jobs'])
 
 
 
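Note on calling the new pipeline: generate_barecode_mapping normalizes its settings dict via spacr.settings.set_default_generate_barecode_mapping (changed in this release but not shown here) and then, per sample, writes annotated_reads.h5, unique_combinations.csv and qc.csv into a subfolder of settings['src']. The keys below are the ones read in this diff; every value is a placeholder for illustration, not a documented default:

    from spacr.sequencing import generate_barecode_mapping

    settings = {
        'src': '/path/to/folder_with_fastq_gz',        # scanned by parse_gz_files
        'chunk_size': 10000,
        'n_jobs': None,                                # None -> cpu_count() - 3
        'target_sequence': 'TGCTG',                    # anchor searched in R1/R2 (placeholder)
        'offset_start': 0,                             # window offset from the anchor hit (placeholder)
        'expected_end': 89,                            # window length kept per read (placeholder)
        'column_csv': '/path/to/column_barcodes.csv',  # barcode CSVs need 'name' and 'sequence' columns
        'grna_csv': '/path/to/grna_barcodes.csv',
        'row_csv': '/path/to/row_barcodes.csv',
        'save_h5': True,
        'comp_type': 'zlib',
        'comp_level': 5,
    }
    generate_barecode_mapping(settings)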