spacr 0.2.46__py3-none-any.whl → 0.2.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. spacr/core.py +306 -21
  2. spacr/deep_spacr.py +101 -41
  3. spacr/gui.py +1 -3
  4. spacr/gui_core.py +78 -65
  5. spacr/gui_elements.py +437 -152
  6. spacr/gui_utils.py +84 -73
  7. spacr/io.py +14 -7
  8. spacr/measure.py +196 -145
  9. spacr/plot.py +2 -42
  10. spacr/resources/font/open_sans/OFL.txt +93 -0
  11. spacr/resources/font/open_sans/OpenSans-Italic-VariableFont_wdth,wght.ttf +0 -0
  12. spacr/resources/font/open_sans/OpenSans-VariableFont_wdth,wght.ttf +0 -0
  13. spacr/resources/font/open_sans/README.txt +100 -0
  14. spacr/resources/font/open_sans/static/OpenSans-Bold.ttf +0 -0
  15. spacr/resources/font/open_sans/static/OpenSans-BoldItalic.ttf +0 -0
  16. spacr/resources/font/open_sans/static/OpenSans-ExtraBold.ttf +0 -0
  17. spacr/resources/font/open_sans/static/OpenSans-ExtraBoldItalic.ttf +0 -0
  18. spacr/resources/font/open_sans/static/OpenSans-Italic.ttf +0 -0
  19. spacr/resources/font/open_sans/static/OpenSans-Light.ttf +0 -0
  20. spacr/resources/font/open_sans/static/OpenSans-LightItalic.ttf +0 -0
  21. spacr/resources/font/open_sans/static/OpenSans-Medium.ttf +0 -0
  22. spacr/resources/font/open_sans/static/OpenSans-MediumItalic.ttf +0 -0
  23. spacr/resources/font/open_sans/static/OpenSans-Regular.ttf +0 -0
  24. spacr/resources/font/open_sans/static/OpenSans-SemiBold.ttf +0 -0
  25. spacr/resources/font/open_sans/static/OpenSans-SemiBoldItalic.ttf +0 -0
  26. spacr/resources/font/open_sans/static/OpenSans_Condensed-Bold.ttf +0 -0
  27. spacr/resources/font/open_sans/static/OpenSans_Condensed-BoldItalic.ttf +0 -0
  28. spacr/resources/font/open_sans/static/OpenSans_Condensed-ExtraBold.ttf +0 -0
  29. spacr/resources/font/open_sans/static/OpenSans_Condensed-ExtraBoldItalic.ttf +0 -0
  30. spacr/resources/font/open_sans/static/OpenSans_Condensed-Italic.ttf +0 -0
  31. spacr/resources/font/open_sans/static/OpenSans_Condensed-Light.ttf +0 -0
  32. spacr/resources/font/open_sans/static/OpenSans_Condensed-LightItalic.ttf +0 -0
  33. spacr/resources/font/open_sans/static/OpenSans_Condensed-Medium.ttf +0 -0
  34. spacr/resources/font/open_sans/static/OpenSans_Condensed-MediumItalic.ttf +0 -0
  35. spacr/resources/font/open_sans/static/OpenSans_Condensed-Regular.ttf +0 -0
  36. spacr/resources/font/open_sans/static/OpenSans_Condensed-SemiBold.ttf +0 -0
  37. spacr/resources/font/open_sans/static/OpenSans_Condensed-SemiBoldItalic.ttf +0 -0
  38. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Bold.ttf +0 -0
  39. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-BoldItalic.ttf +0 -0
  40. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-ExtraBold.ttf +0 -0
  41. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-ExtraBoldItalic.ttf +0 -0
  42. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Italic.ttf +0 -0
  43. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Light.ttf +0 -0
  44. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-LightItalic.ttf +0 -0
  45. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Medium.ttf +0 -0
  46. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-MediumItalic.ttf +0 -0
  47. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Regular.ttf +0 -0
  48. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-SemiBold.ttf +0 -0
  49. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-SemiBoldItalic.ttf +0 -0
  50. spacr/sequencing.py +481 -587
  51. spacr/settings.py +197 -122
  52. spacr/utils.py +21 -13
  53. {spacr-0.2.46.dist-info → spacr-0.2.56.dist-info}/METADATA +7 -4
  54. spacr-0.2.56.dist-info/RECORD +100 -0
  55. spacr-0.2.46.dist-info/RECORD +0 -60
  56. {spacr-0.2.46.dist-info → spacr-0.2.56.dist-info}/LICENSE +0 -0
  57. {spacr-0.2.46.dist-info → spacr-0.2.56.dist-info}/WHEEL +0 -0
  58. {spacr-0.2.46.dist-info → spacr-0.2.56.dist-info}/entry_points.txt +0 -0
  59. {spacr-0.2.46.dist-info → spacr-0.2.56.dist-info}/top_level.txt +0 -0
spacr/sequencing.py CHANGED
@@ -1,8 +1,6 @@
- import os, gc, gzip, re, time, math, subprocess
+ import os, gzip, re, time, math, subprocess, gzip
  import pandas as pd
  import numpy as np
- from tqdm import tqdm
- from Bio.Align import PairwiseAligner
  import matplotlib.pyplot as plt
  import seaborn as sns
  from Bio import pairwise2
@@ -14,6 +12,8 @@ from scipy import stats
  from difflib import SequenceMatcher
  from collections import Counter
  from IPython.display import display
+ from multiprocessing import Pool, cpu_count, Queue, Process
+ from rapidfuzz import process, fuzz
 
  from sklearn.linear_model import LinearRegression, Lasso, Ridge
  from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
@@ -21,626 +21,530 @@ from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
  from scipy.stats import shapiro
  from patsy import dmatrices
 
- def analyze_reads(settings):
+ from Bio import SeqIO
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+
+ def parse_gz_files(folder_path):
  """
- Analyzes reads from gzipped fastq files and combines them based on specified settings.
+ Parses the .fastq.gz files in the specified folder path and returns a dictionary
+ containing the sample names and their corresponding file paths.
 
  Args:
- settings (dict): A dictionary containing the following keys:
- - 'src' (str): The path to the folder containing the input fastq files.
- - 'upstream' (str, optional): The upstream sequence used for read combination. Defaults to 'CTTCTGGTAAATGGGGATGTCAAGTT'.
- - 'downstream' (str, optional): The downstream sequence used for read combination. Defaults to 'GTTTAAGAGCTATGCTGGAAACAGCA'.
- - 'barecode_length' (int, optional): The length of the barcode sequence. Defaults to 8.
- - 'chunk_size' (int, optional): The number of reads to process and save at a time. Defaults to 1000000.
+ folder_path (str): The path to the folder containing the .fastq.gz files.
 
  Returns:
- None
+ dict: A dictionary where the keys are the sample names and the values are
+ dictionaries containing the file paths for the 'R1' and 'R2' read directions.
  """
-
- def save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter):
- """
- Save a data chunk to an HDF5 file.
+ files = os.listdir(folder_path)
+ gz_files = [f for f in files if f.endswith('.fastq.gz')]
 
- Parameters:
- - output_file_path (str): The path to the output HDF5 file.
- - data_chunk (list): The data chunk to be saved.
- - chunk_counter (int): The counter for the current chunk.
+ samples_dict = {}
+ for gz_file in gz_files:
+ parts = gz_file.split('_')
+ sample_name = parts[0]
+ read_direction = parts[1]
 
- Returns:
- None
- """
- df = pd.DataFrame(data_chunk, columns=['combined_read', 'grna', 'plate_row', 'column', 'sample'])
- with pd.HDFStore(output_file_path, mode='a', complevel=5, complib='blosc') as store:
- store.put(
- f'reads/chunk_{chunk_counter}',
- df,
- format='table',
- append=True,
- min_itemsize={'combined_read': 300, 'grna': 50, 'plate_row': 20, 'column': 20, 'sample': 50}
- )
-
- def reverse_complement(seq):
- """
- Returns the reverse complement of a DNA sequence.
-
- Args:
- seq (str): The DNA sequence to be reversed and complemented.
-
- Returns:
- str: The reverse complement of the input DNA sequence.
-
- Example:
- >>> reverse_complement('ATCG')
- 'CGAT'
- """
- complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
- return ''.join(complement[base] for base in reversed(seq))
-
- def get_avg_read_length(file_path, num_reads=100):
- """
- Calculate the average read length from a given file.
-
- Args:
- file_path (str): The path to the input file.
- num_reads (int, optional): The number of reads to process. Defaults to 100.
-
- Returns:
- float: The average read length.
-
- Raises:
- FileNotFoundError: If the input file does not exist.
- """
- if not file_path:
- return 0
- total_length = 0
- count = 0
- with gzip.open(file_path, 'rt') as f:
- for _ in range(num_reads):
- try:
- f.readline() # Skip index line
- read = f.readline().strip()
- total_length += len(read)
- f.readline() # Skip plus line
- f.readline() # Skip quality line
- count += 1
- except StopIteration:
- break
- return total_length / count if count > 0 else 0
-
- def parse_gz_files(folder_path):
- """
- Parses the .fastq.gz files in the specified folder path and returns a dictionary
- containing the sample names and their corresponding file paths.
-
- Args:
- folder_path (str): The path to the folder containing the .fastq.gz files.
-
- Returns:
- dict: A dictionary where the keys are the sample names and the values are
- dictionaries containing the file paths for the 'R1' and 'R2' read directions.
- """
- files = os.listdir(folder_path)
- gz_files = [f for f in files if f.endswith('.fastq.gz')]
-
- samples_dict = {}
- for gz_file in gz_files:
- parts = gz_file.split('_')
- sample_name = parts[0]
- read_direction = parts[1]
-
- if sample_name not in samples_dict:
- samples_dict[sample_name] = {}
-
- if read_direction == "R1":
- samples_dict[sample_name]['R1'] = os.path.join(folder_path, gz_file)
- elif read_direction == "R2":
- samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
-
- return samples_dict
+ if sample_name not in samples_dict:
+ samples_dict[sample_name] = {}
+
+ if read_direction == "R1":
+ samples_dict[sample_name]['R1'] = os.path.join(folder_path, gz_file)
+ elif read_direction == "R2":
+ samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
+ return samples_dict
+
+ def process_chunk_for_consensus(r1_chunk, r2_chunk):
+ """
+ Process a chunk of paired-end sequencing reads to generate consensus sequences.
+
+ Args:
+ r1_chunk (list): List of SeqRecord objects representing the first read in each pair.
+ r2_chunk (list): List of SeqRecord objects representing the second read in each pair.
+
+ Returns:
+ list: List of SeqRecord objects representing the consensus sequences.
+
+ """
+ consensus_records = []
 
- def find_overlap(r1_read_rc, r2_read):
- """
- Find the best alignment between two DNA reads.
-
- Parameters:
- - r1_read_rc (str): The reverse complement of the first DNA read.
- - r2_read (str): The second DNA read.
-
- Returns:
- - best_alignment (Alignment): The best alignment between the two DNA reads.
- """
- aligner = PairwiseAligner()
- alignments = aligner.align(r1_read_rc, r2_read)
- best_alignment = alignments[0]
- return best_alignment
-
- def combine_reads(samples_dict, src, chunk_size, barecode_length_1, barecode_length_2, upstream, downstream):
- """
- Combine reads from paired-end sequencing files and save the combined reads to a new file.
+ for r1_record, r2_record in zip(r1_chunk, r2_chunk):
+ best_sequence = []
+ best_quality = []
+ for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
+ if qual1 >= qual2:
+ best_sequence.append(base1)
+ best_quality.append(qual1)
+ else:
+ best_sequence.append(base2)
+ best_quality.append(qual2)
+
+ consensus_seq = Seq("".join(best_sequence))
 
- Args:
- samples_dict (dict): A dictionary mapping sample names to file paths of paired-end sequencing files.
- src (str): The source directory where the combined reads will be saved.
- chunk_size (int): The number of reads to be processed and saved as a chunk.
- barecode_length (int): The length of the barcode sequence.
- upstream (str): The upstream sequence used for read splitting.
- downstream (str): The downstream sequence used for read splitting.
+ # Create a new SeqRecord for the consensus sequence
+ consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
 
- Returns:
- None
- """
- dst = os.path.join(src, 'combined_reads')
- if not os.path.exists(dst):
- os.makedirs(dst)
-
- for sample, paths in samples_dict.items():
- print(f'Processing: {sample} with the files: {paths}')
- r1_path = paths.get('R1')
- r2_path = paths.get('R2')
-
- output_file_path = os.path.join(dst, f"{sample}_combined.h5")
- qc_file_path = os.path.join(dst, f"{sample}_qc.csv")
-
- r1_file = gzip.open(r1_path, 'rt') if r1_path else None
- r2_file = gzip.open(r2_path, 'rt') if r2_path else None
-
- chunk_counter = 0
- data_chunk = []
+ # Add the consensus record to the list
+ consensus_records.append(consensus_record)
+
+ return consensus_records
+
+ def consensus_sequence(fastq_r1, fastq_r2, output_file, chunk_size=1000000, n_jobs=None):
+ """
+ Calculate the consensus sequence from two FASTQ files (R1 and R2) and write the result to an output file.
+
+ Parameters:
+ - fastq_r1 (str): Path to the R1 FASTQ file.
+ - fastq_r2 (str): Path to the R2 FASTQ file.
+ - output_file (str): Path to the output file where the consensus sequence will be written.
+ - chunk_size (int): Number of reads to process in each chunk. Default is 1000000.
+ - n_jobs (int): Number of parallel processes to use. If None, it will use the number of available CPUs minus 2.
+
+ Returns:
+ None
+ """
+ from .utils import print_progress, count_reads_in_fastq
+
+ print(f'Calculating read count for {fastq_r1} ...')
+ total_reads = count_reads_in_fastq(fastq_r1)
+ chunks_nr = (int(total_reads / chunk_size) + 1) // (n_jobs if n_jobs else cpu_count())
+
+ total_reads_processed = 0
+ chunk_count = 0
+ time_ls = []
+
+ if n_jobs is None:
+ n_jobs = cpu_count() - 2
+
+ with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
+ r1_iter = SeqIO.parse(r1_handle, "fastq")
+ r2_iter = SeqIO.parse(r2_handle, "fastq")
+ pool = Pool(processes=n_jobs)
+
+ while True:
+ start_time = time.time()
+
+ r1_chunk = [rec for rec in (next(r1_iter, None) for _ in range(n_jobs * chunk_size)) if rec is not None]
+ r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(n_jobs * chunk_size)) if rec is not None]
 
- success = 0
- fail = 0
+ # If either chunk is empty, we have reached the end of one or both files
+ if not r1_chunk or not r2_chunk:
+ break
 
- # Calculate initial average read length
- avg_read_length_r1 = get_avg_read_length(r1_path, 100)
- avg_read_length_r2 = get_avg_read_length(r2_path, 100)
- avg_read_length = (avg_read_length_r1 + avg_read_length_r2) / 2 if avg_read_length_r1 and avg_read_length_r2 else 0
+ chunk_count += 1
+ total_reads_processed += len(r1_chunk)
 
- print(f'Initial avg_read_length: {avg_read_length}')
+ # Split the records into chunks to be processed by each core
+ r1_chunked = [r1_chunk[i:i + chunk_size] for i in range(0, len(r1_chunk), chunk_size)]
+ r2_chunked = [r2_chunk[i:i + chunk_size] for i in range(0, len(r2_chunk), chunk_size)]
+
+ # Process each chunk in parallel
+ results = pool.starmap(process_chunk_for_consensus, zip(r1_chunked, r2_chunked))
 
- # Estimate the initial number of reads based on the file size
- r1_size_est = os.path.getsize(r1_path) // (avg_read_length * 4) if r1_path else 0
- r2_size_est = os.path.getsize(r2_path) // (avg_read_length * 4) if r2_path else 0
- max_size = max(r1_size_est, r2_size_est) * 10
- test10 =0
- with tqdm(total=max_size, desc=f"Processing {sample}") as pbar:
- total_length_processed = 0
- read_count = 0
-
- while True:
- try:
- r1_index = next(r1_file).strip() if r1_file else None
- r1_read = next(r1_file).strip() if r1_file else None
- r1_plus = next(r1_file).strip() if r1_file else None
- r1_quality = next(r1_file).strip() if r1_file else None
-
- r2_index = next(r2_file).strip() if r2_file else None
- r2_read = next(r2_file).strip() if r2_file else None
- r2_plus = next(r2_file).strip() if r2_file else None
- r2_quality = next(r2_file).strip() if r2_file else None
-
- pbar.update(1)
-
- if r1_index and r2_index and r1_index.split(' ')[0] != r2_index.split(' ')[0]:
- fail += 1
- print(f"Index mismatch: {r1_index} != {r2_index}")
- continue
-
- r1_read_rc = reverse_complement(r1_read) if r1_read else ''
- r1_quality_rc = r1_quality[::-1] if r1_quality else ''
-
- r1_rc_split_index = r1_read_rc.find(upstream)
- r2_split_index = r2_read.find(upstream)
-
- if r1_rc_split_index == -1 or r2_split_index == -1:
- fail += 1
- continue
- else:
- success += 1
-
- read1_fragment = r1_read_rc[:r1_rc_split_index]
- read2_fragment = r2_read[r2_split_index:]
- read_combo = read1_fragment + read2_fragment
-
- combo_split_index_1 = read_combo.find(upstream)
- combo_split_index_2 = read_combo.find(downstream)
-
- barcode_1 = read_combo[combo_split_index_1 - barecode_length_1:combo_split_index_1]
- grna = read_combo[combo_split_index_1 + len(upstream):combo_split_index_2]
- barcode_2 = read_combo[combo_split_index_2 + len(downstream):combo_split_index_2 + len(downstream) + barecode_length_2]
- barcode_2 = reverse_complement(barcode_2)
- data_chunk.append((read_combo, grna, barcode_1, barcode_2, sample))
-
- if settings['test']:
- if read_count % 1000 == 0:
- print(f"Read count: {read_count}")
- print(f"Read 1: {r1_read_rc}")
- print(f"Read 2: {r2_read}")
- print(f"Read combo: {read_combo}")
- print(f"Barcode 1: {barcode_1}")
- print(f"gRNA: {grna}")
- print(f"Barcode 2: {barcode_2}")
- print()
- test10 += 1
- if test10 == 10:
- break
-
- read_count += 1
- total_length_processed += len(r1_read) + len(r2_read)
-
- # Periodically update the average read length and total
- if read_count % 10000 == 0:
- avg_read_length = total_length_processed / (read_count * 2)
- max_size = (os.path.getsize(r1_path) + os.path.getsize(r2_path)) // (avg_read_length * 4)
- pbar.total = max_size
-
- if len(data_chunk) >= chunk_size:
- save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
- chunk_counter += 1
- data_chunk = []
-
- except StopIteration:
- break
-
- # Save any remaining data_chunk
- if data_chunk:
- save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
-
- # Save QC metrics
- qc = {'success': success, 'failed': fail}
- qc_df = pd.DataFrame([qc])
- qc_df.to_csv(qc_file_path, index=False)
-
- from .settings import get_analyze_reads_default_settings
+ # Write the results to the output file
+ for consensus_records in results:
+ SeqIO.write(consensus_records, output_handle, "fastq")
 
- settings = get_analyze_reads_default_settings(settings)
+ end_time = time.time()
+ chunk_time = end_time - start_time
+ time_ls.append(chunk_time)
+ print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
 
- samples_dict = parse_gz_files(settings['src'])
- combine_reads(samples_dict, settings['src'], settings['chunk_size'], settings['barecode_length_1'], settings['barecode_length_2'], settings['upstream'], settings['downstream'])
+ pool.close()
+ pool.join()
 
- def map_barcodes(h5_file_path, settings={}):
+ def consensus_sequence_v1(fastq_r1, fastq_r2, output_file, chunk_size=1000000):
  """
- Maps barcodes and performs quality control on sequencing data.
+ Generate a consensus sequence from paired-end FASTQ files.
 
  Args:
- h5_file_path (str): The file path to the HDF5 file containing the sequencing data.
- settings (dict, optional): Additional settings for the mapping and quality control process. Defaults to {}.
+ fastq_r1 (str): Path to the first input FASTQ file.
+ fastq_r2 (str): Path to the second input FASTQ file.
+ output_file (str): Path to the output FASTQ file.
+ chunk_size (int, optional): Number of reads to process in each iteration. Defaults to 1000000.
 
  Returns:
  None
  """
- def get_read_qc(df, settings):
- """
- Calculate quality control metrics for sequencing reads.
+ from .utils import print_progress, count_reads_in_fastq
 
- Parameters:
- - df: DataFrame containing the sequencing reads.
+ print(f'Calculating read count for {fastq_r1} ...')
+ total_reads = count_reads_in_fastq(fastq_r1)
+ chunks_nr = int(total_reads/chunk_size) + 1
 
- Returns:
- - df_cleaned: DataFrame containing the cleaned sequencing reads.
- - qc_dict: Dictionary containing the quality control metrics.
- """
-
- df_cleaned = df.dropna()
-
- qc_dict = {}
- qc_dict['reads'] = len(df)
- qc_dict['cleaned_reads'] = len(df_cleaned)
- qc_dict['NaN_grna'] = df['grna_metadata'].isna().sum()
- qc_dict['NaN_plate_row'] = df['plate_row_metadata'].isna().sum()
- qc_dict['NaN_column'] = df['column_metadata'].isna().sum()
- qc_dict['NaN_plate'] = df['plate_metadata'].isna().sum()
- qc_dict['unique_grna'] = Counter(df['grna_metadata'].dropna().tolist())
- qc_dict['unique_plate_row'] = Counter(df['plate_row_metadata'].dropna().tolist())
- qc_dict['unique_column'] = Counter(df['column_metadata'].dropna().tolist())
- qc_dict['unique_plate'] = Counter(df['plate_metadata'].dropna().tolist())
-
- # Calculate control error rates using cleaned DataFrame
- total_pc_non_nan = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc'])].shape[0]
- total_nc_non_nan = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc'])].shape[0]
-
- pc_count_pc = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] == settings['pc'])].shape[0]
- nc_count_nc = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] == settings['nc'])].shape[0]
+ total_reads = 0
+ chunk_count = 0
+ time_ls = []
 
- pc_error_count = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] != settings['pc'])].shape[0]
- nc_error_count = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] != settings['nc'])].shape[0]
-
- pc_in_nc_loc_count = df_cleaned[(df_cleaned['column_metadata'] == settings['nc_loc']) & (df_cleaned['grna_metadata'] == settings['pc'])].shape[0]
- nc_in_pc_loc_count = df_cleaned[(df_cleaned['column_metadata'] == settings['pc_loc']) & (df_cleaned['grna_metadata'] == settings['nc'])].shape[0]
-
- # Collect QC metrics into a dictionary
- # PC
- qc_dict['pc_total_count'] = total_pc_non_nan
- qc_dict['pc_count_pc'] = pc_count_pc
- qc_dict['nc_count_pc'] = pc_in_nc_loc_count
- qc_dict['pc_error_count'] = pc_error_count
- # NC
- qc_dict['nc_total_count'] = total_nc_non_nan
- qc_dict['nc_count_nc'] = nc_count_nc
- qc_dict['pc_count_nc'] = nc_in_pc_loc_count
- qc_dict['nc_error_count'] = nc_error_count
+ with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
+ r1_iter = SeqIO.parse(r1_handle, "fastq")
+ r2_iter = SeqIO.parse(r2_handle, "fastq")
 
- return df_cleaned, qc_dict
-
- def get_per_row_qc(df, settings):
- """
- Calculate quality control metrics for each unique row in the control columns.
-
- Parameters:
- - df: DataFrame containing the sequencing reads.
- - settings: Dictionary containing the settings for control values.
-
- Returns:
- - dict: Dictionary containing the quality control metrics for each unique row.
- """
- qc_dict_per_row = {}
- unique_rows = df['plate_row_metadata'].dropna().unique().tolist()
- unique_rows = list(set(unique_rows)) # Remove duplicates
-
- for row in unique_rows:
- df_row = df[(df['plate_row_metadata'] == row)]
- _, qc_dict_row = get_read_qc(df_row, settings)
- qc_dict_per_row[row] = qc_dict_row
-
- return qc_dict_per_row
-
- def mapping_dicts(df, settings):
- """
- Maps the values in the DataFrame columns to corresponding metadata using dictionaries.
-
- Args:
- df (pandas.DataFrame): The DataFrame containing the data to be mapped.
- settings (dict): A dictionary containing the settings for mapping.
-
- Returns:
- pandas.DataFrame: The DataFrame with the mapped metadata columns added.
- """
- grna_df = pd.read_csv(settings['grna'])
- barcode_df = pd.read_csv(settings['barcodes'])
-
- grna_dict = {row['sequence']: row['name'] for _, row in grna_df.iterrows()}
- plate_row_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('p')}
- column_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('c')}
- plate_dict = settings['plate_dict']
-
- df['grna_metadata'] = df['grna'].map(grna_dict)
- df['grna_length'] = df['grna'].apply(len)
- df['plate_row_metadata'] = df['plate_row'].map(plate_row_dict)
- df['column_metadata'] = df['column'].map(column_dict)
- df['plate_metadata'] = df['sample'].map(plate_dict)
-
- return df
-
- def filter_combinations(df, settings):
- """
- Takes the combination counts Data Frame, filters the rows based on specific conditions,
- and removes rows with a count lower than the highest value of max_count_c1 and max_count_c2.
+ while True:
+ start_time = time.time()
 
- Args:
- combination_counts_file_path (str): The file path to the CSV file containing the combination counts.
- pc (str, optional): The positive control sequence. Defaults to 'TGGT1_220950_1'.
- nc (str, optional): The negative control sequence. Defaults to 'TGGT1_233460_4'.
+ r1_chunk = [rec for rec in (next(r1_iter, None) for _ in range(chunk_size)) if rec is not None]
+ r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(chunk_size)) if rec is not None]
+
+ # If either chunk is empty, we have reached the end of one or both files
+ if not r1_chunk or not r2_chunk:
+ break
+
+ chunk_count += 1
+ total_reads += len(r1_chunk)
+
+ for r1_record, r2_record in zip(r1_chunk, r2_chunk):
+ best_sequence = []
+ best_quality = []
+ for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
+ if qual1 >= qual2:
+ best_sequence.append(base1)
+ best_quality.append(qual1)
+ else:
+ best_sequence.append(base2)
+ best_quality.append(qual2)
+
+ consensus_seq = Seq("".join(best_sequence))
+
+ # Create a new SeqRecord for the consensus sequence
+ consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
+
+ # Write the consensus sequence to the output file
+ SeqIO.write(consensus_record, output_handle, "fastq")
+
+ end_time = time.time()
+ chunk_time = end_time - start_time
+ time_ls.append(chunk_time)
+ print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=1, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
 
- Returns:
- pd.DataFrame: The filtered DataFrame.
- """
+ def save_to_hdf(queue, output_file, complevel=9, compression='zlib'):
+ """
+ Save data from a queue to an HDF file.
+
+ Parameters:
+ - queue: Queue object
+ The queue containing the data to be saved.
+ - output_file: strs
+ The path to the output HDF file.
+ - complevel: int, optional
+ The compression level to use (default is 9).
+ - compression: str, optional
+ The compression algorithm to use (default is 'zlib').
 
- pc = settings['pc']
- nc = settings['nc']
- pc_loc = settings['pc_loc']
- nc_loc = settings['nc_loc']
+ Returns:
+ None
+ """
+ with pd.HDFStore(output_file, mode='a', complevel=complevel, complib=compression) as store:
+ while True:
+ chunk_count, df = queue.get()
+ if df is None:
+ break
+ print(f'Writing chunks to H5PY ...')
+ store.append(f'chunk_{chunk_count}', df, format='table', data_columns=True)
 
- filtered_c1 = df[(df['column'] == nc_loc) & (df['grna'] != nc)]
- max_count_c1 = filtered_c1['count'].max()
+ def get_top_two_matches(seq, barcode_dict):
+ """
+ Finds the top two closest matches for a given sequence in a barcode dictionary.
 
- filtered_c2 = df[(df['column'] == pc_loc) & (df['grna'] != pc)]
- max_count_c2 = filtered_c2['count'].max()
+ Args:
+ seq (str): The sequence to find the closest matches for.
+ barcode_dict (dict): A dictionary containing barcodes as keys and their corresponding values.
 
- #filtered_c3 = df[(df['column'] != nc_loc) & (df['grna'] == nc)]
- #max_count_c3 = filtered_c3['count'].max()
+ Returns:
+ list of tuples: A list containing up to two tuples, each with a barcode match and its score.
+ """
+ results = process.extract(seq, barcode_dict.keys(), scorer=fuzz.ratio, limit=2)
+ matches = [(barcode_dict[result[0]], result[1] / 100.0) for result in results]
+ # Pad the matches list if there are fewer than two results
+ if len(matches) < 2:
+ matches.append((None, 0.0))
+ return matches
+
+ def process_chunk_for_mapping(records, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements):
+ """
+ Process a chunk of records for barcode mapping, including highest and second-highest scores.
 
- #filtered_c4 = df[(df['column'] != pc_loc) & (df['grna'] == pc)]
- #max_count_c4 = filtered_c4['count'].max()
+ Args:
+ records (list): A list of records to process.
+ barcode_mapping (dict): A dictionary mapping barcodes to their corresponding keys.
+ barcode_dicts (dict): A dictionary of barcode dictionaries.
+ barcode_coordinates (dict): A dictionary mapping barcode keys to their start and end coordinates.
+ reverse_complements (dict): A dictionary indicating whether to reverse complement the extracted sequences for each barcode key.
 
- # Find the highest value between max_count_c1 and max_count_c2
- highest_max_count = max(max_count_c1, max_count_c2)
+ Returns:
+ pandas.DataFrame: A DataFrame containing the processed data.
+ """
+ data = {key: [] for key in barcode_mapping.keys()}
+ seq_data = {f"{key}_seq": [] for key in barcode_mapping.keys()}
+ score_data_1 = {f"{key}_score_1": [] for key in barcode_mapping.keys()}
+ score_data_2 = {f"{key}_score_2": [] for key in barcode_mapping.keys()}
+ sequences = []
+
+ for record in records:
+ sequences.append(str(record.seq))
+ for key, coord in barcode_coordinates.items():
+ start, end = coord
+ extracted_seq = str(record.seq[start:end])
+
+ if reverse_complements[key]:
+ extracted_seq = str(Seq(extracted_seq).reverse_complement())
+
+ seq_data[f"{key}_seq"].append(extracted_seq)
+
+ if key in barcode_dicts:
+ exact_match = barcode_dicts[key].get(extracted_seq, None)
+ if exact_match:
+ data[key].append(exact_match)
+ score_data_1[f"{key}_score_1"].append(1.0)
+ score_data_2[f"{key}_score_2"].append(0.0)
+ else:
+ matches = get_top_two_matches(extracted_seq, barcode_dicts[key])
+ data[key].append(matches[0][0])
+ score_data_1[f"{key}_score_1"].append(matches[0][1])
+ score_data_2[f"{key}_score_2"].append(matches[1][1])
+ else:
+ data[key].append(extracted_seq)
+ score_data_1[f"{key}_score_1"].append(0.0)
+ score_data_2[f"{key}_score_2"].append(0.0)
+
+ df = pd.DataFrame(data)
+ df_seq = pd.DataFrame(seq_data)
+ df_score_1 = pd.DataFrame(score_data_1)
+ df_score_2 = pd.DataFrame(score_data_2)
+ df['sequence'] = sequences
+ df = pd.concat([df, df_seq, df_score_1, df_score_2], axis=1)
+ return df
 
- # Filter the DataFrame to remove rows with a count lower than the highest_max_count
- filtered_df = df[df['count'] >= highest_max_count]
+ def extract_barcodes_from_fastq(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
+ """
+ Extracts barcodes from a FASTQ file and maps them based on a barcode mapping.
 
- # Calculate total read counts for each unique combination of plate_row and column
- filtered_df['total_reads'] = filtered_df.groupby(['plate_row', 'column'])['count'].transform('sum')
-
- # Calculate read fraction for each row
- filtered_df['read_fraction'] = filtered_df['count'] / filtered_df['total_reads']
+ Args:
+ fastq (str): Path to the input FASTQ file.
+ output_file (str): Path to the output file where the mapped barcodes will be saved.
+ chunk_size (int): Number of records to process in each chunk.
+ barcode_mapping (dict): Dictionary containing barcode mapping information.
+ The keys are the names of the barcode sets, and the values are tuples
+ containing the path to the CSV file, barcode coordinates, and reverse complement flag.
+ n_jobs (int, optional): Number of parallel processes to use for mapping. Defaults to None.
+ compression (str, optional): Compression algorithm to use for saving the output file. Defaults to 'zlib'.
+ complevel (int, optional): Compression level to use for saving the output file. Defaults to 9.
 
- if settings['verbose']:
- print(f"Max count for non {nc} in {nc_loc}: {max_count_c1}")
- print(f"Max count for non {pc} in {pc_loc}: {max_count_c2}")
- #print(f"Max count for {nc} in other columns: {max_count_c3}")
-
- return filtered_df
+ Returns:
+ None
+ """
+ from .utils import print_progress, count_reads_in_fastq
+
+ # Ensure the file is deleted before starting
+ if os.path.exists(output_file):
+ os.remove(output_file)
+
+ # Validate and process barcode mapping
+ barcode_dicts = {}
+ barcode_coordinates = {}
+ reverse_complements = {}
+
+ for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
+ df = pd.read_csv(csv_path)
+ if 'name' not in df.columns or 'sequence' not in df.columns:
+ print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
+ return
+ barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
+ barcode_coordinates[key] = coordinates
+ reverse_complements[key] = reverse_comp
+
+ if n_jobs is None:
+ n_jobs = cpu_count() - 3 # Reserve one core for saving
 
- from .settings import get_map_barcodes_default_settings
-
- settings = get_map_barcodes_default_settings(settings)
-
- fldr = os.path.splitext(h5_file_path)[0]
- file_name = os.path.basename(fldr)
-
- if settings['test']:
- fldr = os.path.join(fldr, 'test')
- os.makedirs(fldr, exist_ok=True)
-
- qc_file_path = os.path.join(fldr, f'{file_name}_qc_step_2.csv')
- unique_grna_file_path = os.path.join(fldr, f'{file_name}_unique_grna.csv')
- unique_plate_row_file_path = os.path.join(fldr, f'{file_name}_unique_plate_row.csv')
- unique_column_file_path = os.path.join(fldr, f'{file_name}_unique_column.csv')
- unique_plate_file_path = os.path.join(fldr, f'{file_name}_unique_plate.csv')
- new_h5_file_path = os.path.join(fldr, f'{file_name}_cleaned.h5')
- combination_counts_file_path = os.path.join(fldr, f'{file_name}_combination_counts.csv')
- combination_counts_file_path_cleaned = os.path.join(fldr, f'{file_name}_combination_counts_cleaned.csv')
-
- #qc_file_path = os.path.splitext(h5_file_path)[0] + '_qc_step_2.csv'
- #unique_grna_file_path = os.path.splitext(h5_file_path)[0] + '_unique_grna.csv'
- #unique_plate_row_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate_row.csv'
- #unique_column_file_path = os.path.splitext(h5_file_path)[0] + '_unique_column.csv'
- #unique_plate_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate.csv'
- #new_h5_file_path = os.path.splitext(h5_file_path)[0] + '_cleaned.h5'
- #combination_counts_file_path = os.path.splitext(h5_file_path)[0] + '_combination_counts.csv'
- #combination_counts_file_path_cleaned = os.path.splitext(h5_file_path)[0] + '_combination_counts_cleaned.csv'
+ analyzed_chunks = 0
+ chunk_count = 0
+ time_ls = []
 
- # Initialize the HDF5 store for cleaned data
- store_cleaned = pd.HDFStore(new_h5_file_path, mode='a', complevel=5, complib='blosc')
+ print(f'Calculating read count for {fastq} ...')
+ total_reads = count_reads_in_fastq(fastq)
+ chunks_nr = int(total_reads/chunk_size)
 
- # Initialize the overall QC metrics
- overall_qc = {
- 'reads': 0,
- 'cleaned_reads': 0,
- 'NaN_grna': 0,
- 'NaN_plate_row': 0,
- 'NaN_column': 0,
- 'NaN_plate': 0,
- 'unique_grna': Counter(),
- 'unique_plate_row': Counter(),
- 'unique_column': Counter(),
- 'unique_plate': Counter(),
- 'pc_total_count': 0,
- 'pc_count_pc': 0,
- 'nc_total_count': 0,
- 'nc_count_nc': 0,
- 'pc_count_nc': 0,
- 'nc_count_pc': 0,
- 'pc_error_count': 0,
- 'nc_error_count': 0,
- 'pc_fraction_pc': 0,
- 'nc_fraction_nc': 0,
- 'pc_fraction_nc': 0,
- 'nc_fraction_pc': 0
- }
-
- per_row_qc = {}
- combination_counts = Counter()
-
- with pd.HDFStore(h5_file_path, mode='r') as store:
- keys = [key for key in store.keys() if key.startswith('/reads/chunk_')]
-
- if settings['test']:
- keys = keys[:3] # Only read the first chunks if in test mode
-
- for key in keys:
- df = store.get(key)
- df = mapping_dicts(df, settings)
- df_cleaned, qc_dict = get_read_qc(df, settings)
-
- # Accumulate counts for unique combinations
- combinations = df_cleaned[['plate_row_metadata', 'column_metadata', 'grna_metadata']].apply(tuple, axis=1)
-
- combination_counts.update(combinations)
+ print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
+
+ # Create a queue to hold dataframes to be saved
+ save_queue = Queue()
 
- if settings['test'] and settings['verbose']:
- os.makedirs(os.path.join(os.path.splitext(h5_file_path)[0],'test'), exist_ok=True)
- df.to_csv(os.path.join(os.path.splitext(h5_file_path)[0],'test','chunk_1_df.csv'), index=False)
- df_cleaned.to_csv(os.path.join(os.path.splitext(h5_file_path)[0],'test','chunk_1_df_cleaned.csv'), index=False)
+ # Start a separate process for saving the data
+ save_process = Process(target=save_to_hdf, args=(save_queue, output_file, complevel, compression))
+ save_process.start()
 
- # Accumulate QC metrics for all rows
- for metric in qc_dict:
- if isinstance(overall_qc[metric], Counter):
- overall_qc[metric].update(qc_dict[metric])
- else:
- overall_qc[metric] += qc_dict[metric]
+ with gzip.open(fastq, "rt") as handle:
+ fastq_iter = SeqIO.parse(handle, "fastq")
+ pool = Pool(processes=n_jobs)
 
- # Update per_row_qc dictionary
- chunk_per_row_qc = get_per_row_qc(df, settings)
- for row in chunk_per_row_qc:
- if row not in per_row_qc:
- per_row_qc[row] = chunk_per_row_qc[row]
- else:
- for metric in chunk_per_row_qc[row]:
- if isinstance(per_row_qc[row][metric], Counter):
- per_row_qc[row][metric].update(chunk_per_row_qc[row][metric])
- else:
- per_row_qc[row][metric] += chunk_per_row_qc[row][metric]
-
- # Ensure the DataFrame columns are in the desired order
- df_cleaned = df_cleaned[['grna', 'plate_row', 'column', 'sample', 'grna_metadata', 'plate_row_metadata', 'column_metadata', 'plate_metadata']]
-
- # Save cleaned data to the new HDF5 store
- store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
-
- del df_cleaned, df
- gc.collect()
-
- # Calculate overall fractions after accumulating all metrics
- overall_qc['pc_fraction_pc'] = overall_qc['pc_count_pc'] / overall_qc['pc_total_count'] if overall_qc['pc_total_count'] else 0
- overall_qc['nc_fraction_nc'] = overall_qc['nc_count_nc'] / overall_qc['nc_total_count'] if overall_qc['nc_total_count'] else 0
- overall_qc['pc_fraction_nc'] = overall_qc['pc_count_nc'] / overall_qc['nc_total_count'] if overall_qc['nc_total_count'] else 0
- overall_qc['nc_fraction_pc'] = overall_qc['nc_count_pc'] / overall_qc['pc_total_count'] if overall_qc['pc_total_count'] else 0
-
- for row in per_row_qc:
- if row != 'all_rows':
- per_row_qc[row]['pc_fraction_pc'] = per_row_qc[row]['pc_count_pc'] / per_row_qc[row]['pc_total_count'] if per_row_qc[row]['pc_total_count'] else 0
- per_row_qc[row]['nc_fraction_nc'] = per_row_qc[row]['nc_count_nc'] / per_row_qc[row]['nc_total_count'] if per_row_qc[row]['nc_total_count'] else 0
- per_row_qc[row]['pc_fraction_nc'] = per_row_qc[row]['pc_count_nc'] / per_row_qc[row]['nc_total_count'] if per_row_qc[row]['nc_total_count'] else 0
- per_row_qc[row]['nc_fraction_pc'] = per_row_qc[row]['nc_count_pc'] / per_row_qc[row]['pc_total_count'] if per_row_qc[row]['pc_total_count'] else 0
-
- # Add overall_qc to per_row_qc with the key 'all_rows'
- per_row_qc['all_rows'] = overall_qc
-
- # Convert the Counter objects to DataFrames and save them to CSV files
- unique_grna_df = pd.DataFrame(overall_qc['unique_grna'].items(), columns=['key', 'value'])
- unique_plate_row_df = pd.DataFrame(overall_qc['unique_plate_row'].items(), columns=['key', 'value'])
- unique_column_df = pd.DataFrame(overall_qc['unique_column'].items(), columns=['key', 'value'])
- unique_plate_df = pd.DataFrame(overall_qc['unique_plate'].items(), columns=['key', 'value'])
-
- unique_grna_df.to_csv(unique_grna_file_path, index=False)
- unique_plate_row_df.to_csv(unique_plate_row_file_path, index=False)
- unique_column_df.to_csv(unique_column_file_path, index=False)
- unique_plate_df.to_csv(unique_plate_file_path, index=False)
-
- # Remove the unique counts from overall_qc for the main QC CSV file
- del overall_qc['unique_grna']
- del overall_qc['unique_plate_row']
- del overall_qc['unique_column']
- del overall_qc['unique_plate']
-
- # Combine all remaining QC metrics into a single DataFrame and save it to CSV
- qc_df = pd.DataFrame([overall_qc])
- qc_df.to_csv(qc_file_path, index=False)
-
- # Convert per_row_qc to a DataFrame and save it to CSV
- per_row_qc_df = pd.DataFrame.from_dict(per_row_qc, orient='index')
- per_row_qc_df = per_row_qc_df.sort_values(by='reads', ascending=False)
- per_row_qc_df = per_row_qc_df.drop(['unique_grna', 'unique_plate_row', 'unique_column', 'unique_plate'], axis=1, errors='ignore')
- per_row_qc_df = per_row_qc_df.dropna(subset=['reads'])
- per_row_qc_df.to_csv(os.path.splitext(h5_file_path)[0] + '_per_row_qc.csv', index=True)
-
- if settings['verbose']:
- display(per_row_qc_df)
-
- # Save the combination counts to a CSV file
- try:
- combination_counts_df = pd.DataFrame(combination_counts.items(), columns=['combination', 'count'])
- combination_counts_df[['plate_row', 'column', 'grna']] = pd.DataFrame(combination_counts_df['combination'].tolist(), index=combination_counts_df.index)
- combination_counts_df = combination_counts_df.drop('combination', axis=1)
- combination_counts_df.to_csv(combination_counts_file_path, index=False)
-
- grna_plate_heatmap(combination_counts_file_path, specific_grna=None)
- grna_plate_heatmap(combination_counts_file_path, specific_grna=settings['pc'])
- grna_plate_heatmap(combination_counts_file_path, specific_grna=settings['nc'])
-
- combination_counts_df_cleaned = filter_combinations(combination_counts_df, settings)
- combination_counts_df_cleaned.to_csv(combination_counts_file_path_cleaned, index=False)
-
- grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=None)
- grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=settings['pc'])
- grna_plate_heatmap(combination_counts_file_path_cleaned, specific_grna=settings['nc'])
- except Exception as e:
- print(e)
+ while True:
+ # Read n_jobs * chunk_size records into memory
+ records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
+
+ if not records:
+ break
+
+ analyzed_chunks_1 = analyzed_chunks
+ start_time = time.time()
+ chunk_count += 1
+ analyzed_chunks = int(chunk_count*n_jobs)
+ analyzed_chunks_ls = list(range(analyzed_chunks_1, analyzed_chunks))
+
+ # Split the records into chunks to be processed by each core
+ chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
+
+ # Process each chunk in parallel
+ dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
+
+ # Queue the dataframes to be saved
+ df = pd.concat(dfs, ignore_index=True)
+ save_queue.put((chunk_count, df))
+
+ end_time = time.time()
+ chunk_time = end_time - start_time
+ time_ls.append(chunk_time)
+
+ for az_chunks in analyzed_chunks_ls:
+ print_progress(files_processed=az_chunks, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Mapping Barcodes")
+
+ del records, chunked_records, dfs, df
+
+ pool.close()
+ pool.join()
+
+ # Send a sentinel value to indicate the saving process should stop
+ save_queue.put((None, None))
+ save_process.join()
+
+ def extract_barcodes_from_fastq_v1(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
+ """
+ Extracts barcodes from a FASTQ file and saves the results to an output file.
+
+ Parameters:
+ - fastq (str): Path to the input FASTQ file.
+ - output_file (str): Path to the output file where the barcode data will be saved.
+ - chunk_size (int): Number of records to process in each chunk.
+ - barcode_mapping (dict): Mapping of barcode keys to CSV file paths, barcode coordinates, and reverse complement flags.
+ - n_jobs (int, optional): Number of parallel processes to use for barcode mapping. Defaults to None.
+ - compression (str, optional): Compression algorithm to use for the output file. Defaults to 'zlib'.
+ - complevel (int, optional): Compression level to use for the output file. Defaults to 9.
+ """
+
+ from .utils import print_progress, count_reads_in_fastq
+
+ # Ensure the file is deleted before starting
+ if os.path.exists(output_file):
+ os.remove(output_file)
+
+ # Validate and process barcode mapping
+ barcode_dicts = {}
+ barcode_coordinates = {}
+ reverse_complements = {}
+
+ for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
+ df = pd.read_csv(csv_path)
+ if 'name' not in df.columns or 'sequence' not in df.columns:
+ print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
+ return
+ barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
+ barcode_coordinates[key] = coordinates
+ reverse_complements[key] = reverse_comp
+
+ if n_jobs is None:
+ n_jobs = cpu_count() - 2
+
+ chunk_count = 0
+ time_ls = []
 
- # Close the HDF5 store
- store_cleaned.close()
- gc.collect()
- return
+ print(f'Calculating read count for {fastq} ...')
+ total_reads = count_reads_in_fastq(fastq)
+ chunks_nr = (int(total_reads/chunk_size) + 1)
+
+ print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
+ with gzip.open(fastq, "rt") as handle:
+ fastq_iter = SeqIO.parse(handle, "fastq")
+ pool = Pool(processes=n_jobs)
+
+ while True:
+ # Read n_jobs * chunk_size records into memory
+ records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
+
+ if not records:
+ break
+
+ start_time = time.time()
+ chunk_count += 1
+
+ # Split the records into chunks to be processed by each core
+ chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
+
+ # Process each chunk in parallel
+ dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
+
+ # Join the results
+ df = pd.concat(dfs, ignore_index=True)
+
+ # Save to HDF5 with compression
+ print(f'Writing chunk {chunk_count} to H5PY ...')
+ df.to_hdf(output_file, key=f'chunk_{chunk_count}', mode='a', format='table', complevel=complevel, complib=compression)
+
+ end_time = time.time()
+ chunk_time = end_time - start_time
+ time_ls.append(chunk_time)
+ print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=None, operation_type=" Mapping Barcodes")
+
+ del records, chunked_records, dfs, df
+
+ pool.close()
+ pool.join()
+
+ def generate_barecode_mapping(settings={}):
+ from .settings import set_default_generate_barecode_mapping
+
+ settings = set_default_generate_barecode_mapping(settings)
+
+ samples_dict = parse_gz_files(settings['src'])
+ for key in samples_dict:
+ if samples_dict[key]['R1'] and samples_dict[key]['R2']:
+ R1 = samples_dict[key]['R1']
+ R2 = samples_dict[key]['R2']
+ consensus_dir = os.path.join(os.path.dirname(R1), 'consensus')
+ os.makedirs(consensus_dir, exist_ok=True)
+ consensus = os.path.join(consensus_dir, f"{key}_consensus.fastq.gz")
+ h5 = os.path.join(consensus_dir, f"{key}_barecodes.h5")
+
+ if not os.path.exists(consensus):
+ consensus_sequence(R1, R2, consensus, settings['chunk_size'])
+ else:
+ print(f"Consensus file {consensus} already exists. Mapping barecodes.")
+
+ extract_barcodes_from_fastq(fastq=consensus,
+ output_file=h5,
+ chunk_size=settings['chunk_size'],
+ barcode_mapping=settings['barcode_mapping'],
+ n_jobs=settings['n_jobs'],
+ compression=settings['compression'],
+ complevel=settings['complevel'])
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
  def grna_plate_heatmap(path, specific_grna=None, min_max='all', cmap='viridis', min_count=0, save=True):
  """
@@ -729,14 +633,6 @@ def grna_plate_heatmap(path, specific_grna=None, min_max='all', cmap='viridis',
 
  return fig
 
- def map_barcodes_folder(src, settings={}):
- for file in os.listdir(src):
- if file.endswith('.h5'):
- print(file)
- path = os.path.join(src, file)
- map_barcodes(path, settings)
- gc.collect()
-
  def reverse_complement(dna_sequence):
  complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N':'N'}
  reverse_seq = dna_sequence[::-1]
@@ -1846,6 +1742,4 @@ def perform_regression(df, settings):
 
  print('Significant Genes')
  display(significant)
- return coef_df
-
-
+ return coef_df
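
For reference, a minimal sketch of how the consensus-and-barcode-mapping pipeline introduced in 0.2.56 might be driven from Python. The settings keys mirror what generate_barecode_mapping reads in the diff above; the folder path, CSV paths, barcode-set names, and coordinate ranges are hypothetical placeholders, and the defaults normally filled in by set_default_generate_barecode_mapping are not shown in this diff.

# Hypothetical usage sketch; paths, barcode-set names, and coordinates are placeholders.
from spacr.sequencing import generate_barecode_mapping

settings = {
    'src': '/data/screen_run1/fastq',   # folder with <sample>_R1_*.fastq.gz and <sample>_R2_*.fastq.gz
    'chunk_size': 1000000,              # reads per worker chunk (1000000 is the default used by consensus_sequence)
    'n_jobs': None,                     # None lets extract_barcodes_from_fastq fall back to cpu_count() - 3
    'compression': 'zlib',
    'complevel': 9,
    # key -> (CSV with 'name' and 'sequence' columns, (start, end) slice of the consensus read, reverse-complement flag)
    'barcode_mapping': {
        'grna':   ('/data/screen_run1/grna_barcodes.csv',   (30, 50), False),
        'row':    ('/data/screen_run1/row_barcodes.csv',    (0, 8),   False),
        'column': ('/data/screen_run1/column_barcodes.csv', (80, 88), True),
    },
}

# Builds a consensus FASTQ from each R1/R2 pair, then extracts the configured barcode windows,
# matches them against the CSVs (exact match first, rapidfuzz fallback with top-two scores),
# and appends one table per chunk to <sample>_barecodes.h5 via the separate saver process.
generate_barecode_mapping(settings)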