spacr 0.2.68__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. spacr/__init__.py +2 -1
  2. spacr/core.py +107 -12
  3. spacr/gui.py +3 -2
  4. spacr/gui_core.py +160 -109
  5. spacr/gui_elements.py +190 -18
  6. spacr/gui_utils.py +4 -1
  7. spacr/io.py +1 -1
  8. spacr/measure.py +4 -4
  9. spacr/mediar.py +366 -0
  10. spacr/plot.py +4 -1
  11. spacr/resources/MEDIAR/.git +1 -0
  12. spacr/resources/MEDIAR/.gitignore +18 -0
  13. spacr/resources/MEDIAR/LICENSE +21 -0
  14. spacr/resources/MEDIAR/README.md +189 -0
  15. spacr/resources/MEDIAR/SetupDict.py +39 -0
  16. spacr/resources/MEDIAR/config/baseline.json +60 -0
  17. spacr/resources/MEDIAR/config/mediar_example.json +72 -0
  18. spacr/resources/MEDIAR/config/pred/pred_mediar.json +17 -0
  19. spacr/resources/MEDIAR/config/step1_pretraining/phase1.json +55 -0
  20. spacr/resources/MEDIAR/config/step1_pretraining/phase2.json +58 -0
  21. spacr/resources/MEDIAR/config/step2_finetuning/finetuning1.json +66 -0
  22. spacr/resources/MEDIAR/config/step2_finetuning/finetuning2.json +66 -0
  23. spacr/resources/MEDIAR/config/step3_prediction/base_prediction.json +16 -0
  24. spacr/resources/MEDIAR/config/step3_prediction/ensemble_tta.json +23 -0
  25. spacr/resources/MEDIAR/core/BasePredictor.py +120 -0
  26. spacr/resources/MEDIAR/core/BaseTrainer.py +240 -0
  27. spacr/resources/MEDIAR/core/Baseline/Predictor.py +59 -0
  28. spacr/resources/MEDIAR/core/Baseline/Trainer.py +113 -0
  29. spacr/resources/MEDIAR/core/Baseline/__init__.py +2 -0
  30. spacr/resources/MEDIAR/core/Baseline/utils.py +80 -0
  31. spacr/resources/MEDIAR/core/MEDIAR/EnsemblePredictor.py +105 -0
  32. spacr/resources/MEDIAR/core/MEDIAR/Predictor.py +234 -0
  33. spacr/resources/MEDIAR/core/MEDIAR/Trainer.py +172 -0
  34. spacr/resources/MEDIAR/core/MEDIAR/__init__.py +3 -0
  35. spacr/resources/MEDIAR/core/MEDIAR/utils.py +429 -0
  36. spacr/resources/MEDIAR/core/__init__.py +2 -0
  37. spacr/resources/MEDIAR/core/utils.py +40 -0
  38. spacr/resources/MEDIAR/evaluate.py +71 -0
  39. spacr/resources/MEDIAR/generate_mapping.py +121 -0
  40. spacr/resources/MEDIAR/image/examples/img1.tiff +0 -0
  41. spacr/resources/MEDIAR/image/examples/img2.tif +0 -0
  42. spacr/resources/MEDIAR/image/failure_cases.png +0 -0
  43. spacr/resources/MEDIAR/image/mediar_framework.png +0 -0
  44. spacr/resources/MEDIAR/image/mediar_model.PNG +0 -0
  45. spacr/resources/MEDIAR/image/mediar_results.png +0 -0
  46. spacr/resources/MEDIAR/main.py +125 -0
  47. spacr/resources/MEDIAR/predict.py +70 -0
  48. spacr/resources/MEDIAR/requirements.txt +14 -0
  49. spacr/resources/MEDIAR/train_tools/__init__.py +3 -0
  50. spacr/resources/MEDIAR/train_tools/data_utils/__init__.py +1 -0
  51. spacr/resources/MEDIAR/train_tools/data_utils/custom/CellAware.py +88 -0
  52. spacr/resources/MEDIAR/train_tools/data_utils/custom/LoadImage.py +161 -0
  53. spacr/resources/MEDIAR/train_tools/data_utils/custom/NormalizeImage.py +77 -0
  54. spacr/resources/MEDIAR/train_tools/data_utils/custom/__init__.py +3 -0
  55. spacr/resources/MEDIAR/train_tools/data_utils/custom/modalities.pkl +0 -0
  56. spacr/resources/MEDIAR/train_tools/data_utils/datasetter.py +208 -0
  57. spacr/resources/MEDIAR/train_tools/data_utils/transforms.py +148 -0
  58. spacr/resources/MEDIAR/train_tools/data_utils/utils.py +84 -0
  59. spacr/resources/MEDIAR/train_tools/measures.py +200 -0
  60. spacr/resources/MEDIAR/train_tools/models/MEDIARFormer.py +102 -0
  61. spacr/resources/MEDIAR/train_tools/models/__init__.py +1 -0
  62. spacr/resources/MEDIAR/train_tools/utils.py +70 -0
  63. spacr/resources/MEDIAR_weights/.DS_Store +0 -0
  64. spacr/resources/icons/.DS_Store +0 -0
  65. spacr/resources/icons/plaque.png +0 -0
  66. spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif +0 -0
  67. spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif +0 -0
  68. spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif +0 -0
  69. spacr/sequencing.py +234 -422
  70. spacr/settings.py +16 -10
  71. spacr/utils.py +14 -11
  72. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/METADATA +10 -2
  73. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/RECORD +77 -18
  74. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/LICENSE +0 -0
  75. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/WHEEL +0 -0
  76. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/entry_points.txt +0 -0
  77. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/top_level.txt +0 -0
spacr/sequencing.py CHANGED
@@ -25,6 +25,18 @@ from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord

+from collections import defaultdict
+
+import gzip, re
+from Bio.Seq import Seq
+import pandas as pd
+import numpy as np
+import gzip, re
+from Bio.Seq import Seq
+import pandas as pd
+import numpy as np
+from multiprocessing import Pool, cpu_count
+
 def parse_gz_files(folder_path):
     """
     Parses the .fastq.gz files in the specified folder path and returns a dictionary
@@ -55,474 +67,274 @@ def parse_gz_files(folder_path):
             samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
     return samples_dict

-def process_chunk_for_consensus(r1_chunk, r2_chunk):
-    """
-    Process a chunk of paired-end sequencing reads to generate consensus sequences.
-
-    Args:
-        r1_chunk (list): List of SeqRecord objects representing the first read in each pair.
-        r2_chunk (list): List of SeqRecord objects representing the second read in each pair.
-
-    Returns:
-        list: List of SeqRecord objects representing the consensus sequences.
-
-    """
-    consensus_records = []
+# Function to map sequences to names (same as your original)
+def map_sequences_to_names(csv_file, sequences, rc):
+    def rev_comp(dna_sequence):
+        complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
+        reverse_seq = dna_sequence[::-1]
+        return ''.join([complement_dict[base] for base in reverse_seq])

-    for r1_record, r2_record in zip(r1_chunk, r2_chunk):
-        best_sequence = []
-        best_quality = []
-        for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
-            if qual1 >= qual2:
-                best_sequence.append(base1)
-                best_quality.append(qual1)
-            else:
-                best_sequence.append(base2)
-                best_quality.append(qual2)
-
-        consensus_seq = Seq("".join(best_sequence))
-
-        # Create a new SeqRecord for the consensus sequence
-        consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
-
-        # Add the consensus record to the list
-        consensus_records.append(consensus_record)
+    df = pd.read_csv(csv_file)
+    if rc:
+        df['sequence'] = df['sequence'].apply(rev_comp)

-    return consensus_records
-
-def consensus_sequence(fastq_r1, fastq_r2, output_file, chunk_size=1000000, n_jobs=None):
-    """
-    Calculate the consensus sequence from two FASTQ files (R1 and R2) and write the result to an output file.
-
-    Parameters:
-    - fastq_r1 (str): Path to the R1 FASTQ file.
-    - fastq_r2 (str): Path to the R2 FASTQ file.
-    - output_file (str): Path to the output file where the consensus sequence will be written.
-    - chunk_size (int): Number of reads to process in each chunk. Default is 1000000.
-    - n_jobs (int): Number of parallel processes to use. If None, it will use the number of available CPUs minus 2.
-
-    Returns:
-    None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    print(f'Calculating read count for {fastq_r1} ...')
-    total_reads = count_reads_in_fastq(fastq_r1)
-    chunks_nr = (int(total_reads / chunk_size) + 1) // (n_jobs if n_jobs else cpu_count())
-
-    total_reads_processed = 0
-    chunk_count = 0
-    time_ls = []
-
-    if n_jobs is None:
-        n_jobs = cpu_count() - 2
+    csv_sequences = pd.Series(df['name'].values, index=df['sequence']).to_dict()
+    return [csv_sequences.get(sequence, pd.NA) for sequence in sequences]

-    with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
-        r1_iter = SeqIO.parse(r1_handle, "fastq")
-        r2_iter = SeqIO.parse(r2_handle, "fastq")
-        pool = Pool(processes=n_jobs)
+# Functions to save data (same as your original)
+def save_df_to_hdf5(df, hdf5_file, key='df', comp_type='zlib', comp_level=5):
+    try:
+        with pd.HDFStore(hdf5_file, 'a', complib=comp_type, complevel=comp_level) as store:
+            if key in store:
+                existing_df = store[key]
+                df = pd.concat([existing_df, df], ignore_index=True)
+            store.put(key, df, format='table')
+    except Exception as e:
+        print(f"Error while saving DataFrame to HDF5: {e}")
+
+def save_unique_combinations_to_csv(unique_combinations, csv_file):
+    try:
+        try:
+            existing_df = pd.read_csv(csv_file)
+        except FileNotFoundError:
+            existing_df = pd.DataFrame()

-        while True:
-            start_time = time.time()
+        if not existing_df.empty:
+            unique_combinations = pd.concat([existing_df, unique_combinations])
+            unique_combinations = unique_combinations.groupby(
+                ['row_name', 'column_name', 'grna_name'], as_index=False).sum()

-            r1_chunk = [rec for rec in (next(r1_iter, None) for _ in range(n_jobs * chunk_size)) if rec is not None]
-            r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(n_jobs * chunk_size)) if rec is not None]
-
-            # If either chunk is empty, we have reached the end of one or both files
-            if not r1_chunk or not r2_chunk:
-                break
+        unique_combinations.to_csv(csv_file, index=False)
+    except Exception as e:
+        print(f"Error while saving unique combinations to CSV: {e}")

-            chunk_count += 1
-            total_reads_processed += len(r1_chunk)
-
-            # Split the records into chunks to be processed by each core
-            r1_chunked = [r1_chunk[i:i + chunk_size] for i in range(0, len(r1_chunk), chunk_size)]
-            r2_chunked = [r2_chunk[i:i + chunk_size] for i in range(0, len(r2_chunk), chunk_size)]
-
-            # Process each chunk in parallel
-            results = pool.starmap(process_chunk_for_consensus, zip(r1_chunked, r2_chunked))
-
-            # Write the results to the output file
-            for consensus_records in results:
-                SeqIO.write(consensus_records, output_handle, "fastq")
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
-
-    pool.close()
-    pool.join()
-
-def consensus_sequence_v1(fastq_r1, fastq_r2, output_file, chunk_size=1000000):
-    """
-    Generate a consensus sequence from paired-end FASTQ files.
-
-    Args:
-        fastq_r1 (str): Path to the first input FASTQ file.
-        fastq_r2 (str): Path to the second input FASTQ file.
-        output_file (str): Path to the output FASTQ file.
-        chunk_size (int, optional): Number of reads to process in each iteration. Defaults to 1000000.
-
-    Returns:
-        None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    print(f'Calculating read count for {fastq_r1} ...')
-    total_reads = count_reads_in_fastq(fastq_r1)
-    chunks_nr = int(total_reads/chunk_size) + 1
+def save_qc_df_to_csv(qc_df, qc_csv_file):
+    try:
+        try:
+            existing_qc_df = pd.read_csv(qc_csv_file)
+        except FileNotFoundError:
+            existing_qc_df = pd.DataFrame()
+
+        if not existing_qc_df.empty:
+            qc_df = qc_df.add(existing_qc_df, fill_value=0)
+
+        qc_df.to_csv(qc_csv_file, index=False)
+    except Exception as e:
+        print(f"Error while saving QC DataFrame to CSV: {e}")
+
+def extract_sequence_and_quality(sequence, quality, start, end):
+    return sequence[start:end], quality[start:end]
+
+def create_consensus(seq1, qual1, seq2, qual2):
+    consensus_seq = []
+    for i in range(len(seq1)):
+        bases = [(seq1[i], qual1[i]), (seq2[i], qual2[i])]
+        consensus_seq.append(get_consensus_base(bases))
+    return ''.join(consensus_seq)
+
+def get_consensus_base(bases):
+    # Prefer non-'N' bases, if 'N' exists, pick the other one.
+    if bases[0][0] == 'N':
+        return bases[1][0]
+    elif bases[1][0] == 'N':
+        return bases[0][0]
+    else:
+        # Return the base with the highest quality score
+        return bases[0][0] if bases[0][1] >= bases[1][1] else bases[1][0]

-    total_reads = 0
-    chunk_count = 0
-    time_ls = []
+def reverse_complement(seq):
+    return str(Seq(seq).reverse_complement())

-    with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
-        r1_iter = SeqIO.parse(r1_handle, "fastq")
-        r2_iter = SeqIO.parse(r2_handle, "fastq")
+# Core logic for processing a chunk (same as your original)
+def process_chunk(chunk_data):
+
+    def find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end):
+        i = 0
+        fail_count = 0
+        failed_cases = []
+        regex = r"^(?P<column>.{8})TGCTG.*TAAAC(?P<grna>.{20,21})AACTT.*AGAAG(?P<row>.{8}).*"
+        consensus_sequences, columns, grnas, rows = [], [], [], []

-        while True:
-            start_time = time.time()
-
-            r1_chunk = [rec for rec in (next(r1_iter, None) for _ in range(chunk_size)) if rec is not None]
-            r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(chunk_size)) if rec is not None]
-
-            # If either chunk is empty, we have reached the end of one or both files
-            if not r1_chunk or not r2_chunk:
-                break
-
-            chunk_count += 1
-            total_reads += len(r1_chunk)
-
-            for r1_record, r2_record in zip(r1_chunk, r2_chunk):
-                best_sequence = []
-                best_quality = []
-                for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
-                    if qual1 >= qual2:
-                        best_sequence.append(base1)
-                        best_quality.append(qual1)
-                    else:
-                        best_sequence.append(base2)
-                        best_quality.append(qual2)
-
-                consensus_seq = Seq("".join(best_sequence))
-
-                # Create a new SeqRecord for the consensus sequence
-                consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
-
-                # Write the consensus sequence to the output file
-                SeqIO.write(consensus_record, output_handle, "fastq")
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=1, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
-
-def save_to_hdf(queue, output_file, complevel=9, compression='zlib'):
-    """
-    Save data from a queue to an HDF file.
-
-    Parameters:
-    - queue: Queue object containing chunks of data to be saved
-    - output_file: Path to the output HDF file
-    - complevel: Compression level (default: 9)
-    - compression: Compression algorithm (default: 'zlib')
-
-    Returns:
-    None
-    """
-    with pd.HDFStore(output_file, mode='a', complevel=complevel, complib=compression) as store:
-        while True:
-            chunk_count, df = queue.get()
-            if df is None:
-                break
-            print(f'Writing chunks to H5PY ...')
-            store.append(f'chunk_{chunk_count}', df, format='table', data_columns=True)
-
-def get_top_two_matches(seq, barcode_dict):
-    """
-    Finds the top two closest matches for a given sequence in a barcode dictionary.
-
-    Args:
-        seq (str): The sequence to find the closest matches for.
-        barcode_dict (dict): A dictionary containing barcodes as keys and their corresponding values.
-
-    Returns:
-        list of tuples: A list containing up to two tuples, each with a barcode match and its score.
-    """
-    results = process.extract(seq, barcode_dict.keys(), scorer=fuzz.ratio, limit=2)
-    matches = [(barcode_dict[result[0]], result[1] / 100.0) for result in results]
-    # Pad the matches list if there are fewer than two results
-    if len(matches) < 2:
-        matches.append((None, 0.0))
-    return matches
-
-def process_chunk_for_mapping(records, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements):
-    """
-    Process a chunk of records for barcode mapping, including highest and second-highest scores.
-
-    Args:
-        records (list): A list of records to process.
-        barcode_mapping (dict): A dictionary mapping barcodes to their corresponding keys.
-        barcode_dicts (dict): A dictionary of barcode dictionaries.
-        barcode_coordinates (dict): A dictionary mapping barcode keys to their start and end coordinates.
-        reverse_complements (dict): A dictionary indicating whether to reverse complement the extracted sequences for each barcode key.
+        for r1_lines, r2_lines in zip(r1_chunk, r2_chunk):
+            r1_header, r1_sequence, r1_plus, r1_quality = r1_lines.split('\n')
+            r2_header, r2_sequence, r2_plus, r2_quality = r2_lines.split('\n')
+            r2_sequence = reverse_complement(r2_sequence)

-    Returns:
-        pandas.DataFrame: A DataFrame containing the processed data.
-    """
-    data = {key: [] for key in barcode_mapping.keys()}
-    seq_data = {f"{key}_seq": [] for key in barcode_mapping.keys()}
-    score_data_1 = {f"{key}_score_1": [] for key in barcode_mapping.keys()}
-    score_data_2 = {f"{key}_score_2": [] for key in barcode_mapping.keys()}
-    sequences = []
+            r1_pos = r1_sequence.find(target_sequence)
+            r2_pos = r2_sequence.find(target_sequence)
+
+            if r1_pos != -1 and r2_pos != -1:
+                r1_start = max(r1_pos + offset_start, 0)
+                r1_end = min(r1_start + expected_end, len(r1_sequence))
+                r2_start = max(r2_pos + offset_start, 0)
+                r2_end = min(r2_start + expected_end, len(r2_sequence))
+
+                r1_seq, r1_qual = extract_sequence_and_quality(r1_sequence, r1_quality, r1_start, r1_end)
+                r2_seq, r2_qual = extract_sequence_and_quality(r2_sequence, r2_quality, r2_start, r2_end)
+
+                if len(r1_seq) < expected_end:
+                    r1_seq += 'N' * (expected_end - len(r1_seq))
+                    r1_qual += '!' * (expected_end - len(r1_qual))
+
+                if len(r2_seq) < expected_end:
+                    r2_seq += 'N' * (expected_end - len(r2_seq))
+                    r2_qual += '!' * (expected_end - len(r2_qual))
+
+                consensus_seq = create_consensus(r1_seq, r1_qual, r2_seq, r2_qual)
+                if len(consensus_seq) >= expected_end:
+                    match = re.match(regex, consensus_seq)
+                    if match:
+                        consensus_sequences.append(consensus_seq)
+                        column_sequence = match.group('column')
+                        grna_sequence = match.group('grna')
+                        row_sequence = match.group('row')
+                        columns.append(column_sequence)
+                        grnas.append(grna_sequence)
+                        rows.append(row_sequence)
+
+        return consensus_sequences, columns, grnas, rows, fail_count
+
+    r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv = chunk_data
+    consensus_sequences, columns, grnas, rows, _ = find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end)

-    for record in records:
-        sequences.append(str(record.seq))
-        for key, coord in barcode_coordinates.items():
-            start, end = coord
-            extracted_seq = str(record.seq[start:end])
-
-            if reverse_complements[key]:
-                extracted_seq = str(Seq(extracted_seq).reverse_complement())
-
-            seq_data[f"{key}_seq"].append(extracted_seq)
-
-            if key in barcode_dicts:
-                exact_match = barcode_dicts[key].get(extracted_seq, None)
-                if exact_match:
-                    data[key].append(exact_match)
-                    score_data_1[f"{key}_score_1"].append(1.0)
-                    score_data_2[f"{key}_score_2"].append(0.0)
-                else:
-                    matches = get_top_two_matches(extracted_seq, barcode_dicts[key])
-                    data[key].append(matches[0][0])
-                    score_data_1[f"{key}_score_1"].append(matches[0][1])
-                    score_data_2[f"{key}_score_2"].append(matches[1][1])
-            else:
-                data[key].append(extracted_seq)
-                score_data_1[f"{key}_score_1"].append(0.0)
-                score_data_2[f"{key}_score_2"].append(0.0)
-
-    df = pd.DataFrame(data)
-    df_seq = pd.DataFrame(seq_data)
-    df_score_1 = pd.DataFrame(score_data_1)
-    df_score_2 = pd.DataFrame(score_data_2)
-    df['sequence'] = sequences
-    df = pd.concat([df, df_seq, df_score_1, df_score_2], axis=1)
-    return df
-
-def extract_barcodes_from_fastq(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
-    """
-    Extracts barcodes from a FASTQ file and maps them based on a barcode mapping.
+    column_names = map_sequences_to_names(column_csv, columns, rc=False)
+    grna_names = map_sequences_to_names(grna_csv, grnas, rc=True)
+    row_names = map_sequences_to_names(row_csv, rows, rc=True)
+
+    df = pd.DataFrame({
+        'read': consensus_sequences,
+        'column_sequence': columns,
+        'column_name': column_names,
+        'row_sequence': rows,
+        'row_name': row_names,
+        'grna_sequence': grnas,
+        'grna_name': grna_names
+    })
+
+    qc_df = df.isna().sum().to_frame().T
+    qc_df.columns = df.columns
+    qc_df.index = ["NaN_Counts"]
+    qc_df['total_reads'] = len(df)
+
+    unique_combinations = df.groupby(['row_name', 'column_name', 'grna_name']).size().reset_index(name='count')
+    return df, unique_combinations, qc_df
+
+# Function to save data from the queue
+def saver_process(save_queue, hdf5_file, unique_combinations_csv, qc_csv_file, comp_type, comp_level):
+    while True:
+        item = save_queue.get()
+        if item == "STOP":
+            break
+        df, unique_combinations, qc_df = item
+        save_df_to_hdf5(df, hdf5_file, key='df', comp_type=comp_type, comp_level=comp_level)
+        save_unique_combinations_to_csv(unique_combinations, unique_combinations_csv)
+        save_qc_df_to_csv(qc_df, qc_csv_file)

-    Args:
-        fastq (str): Path to the input FASTQ file.
-        output_file (str): Path to the output file where the mapped barcodes will be saved.
-        chunk_size (int): Number of records to process in each chunk.
-        barcode_mapping (dict): Dictionary containing barcode mapping information.
-            The keys are the names of the barcode sets, and the values are tuples
-            containing the path to the CSV file, barcode coordinates, and reverse complement flag.
-        n_jobs (int, optional): Number of parallel processes to use for mapping. Defaults to None.
-        compression (str, optional): Compression algorithm to use for saving the output file. Defaults to 'zlib'.
-        complevel (int, optional): Compression level to use for saving the output file. Defaults to 9.
+# Updated chunked_processing with improved multiprocessing logic
+def chunked_processing(r1_file, r2_file, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None):

-    Returns:
-    None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    # Ensure the file is deleted before starting
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    # Validate and process barcode mapping
-    barcode_dicts = {}
-    barcode_coordinates = {}
-    reverse_complements = {}
-
-    for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
-        df = pd.read_csv(csv_path)
-        if 'name' not in df.columns or 'sequence' not in df.columns:
-            print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
-            return
-        barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
-        barcode_coordinates[key] = coordinates
-        reverse_complements[key] = reverse_comp
+    from .utils import count_reads_in_fastq, print_progress

+    # Use cpu_count minus 3 cores if n_jobs isn't specified
     if n_jobs is None:
-        n_jobs = cpu_count() - 3 # Reserve one core for saving
-
+        n_jobs = cpu_count() - 3
+
     analyzed_chunks = 0
     chunk_count = 0
     time_ls = []
-
-    print(f'Calculating read count for {fastq} ...')
-    total_reads = count_reads_in_fastq(fastq)
-    chunks_nr = int(total_reads/chunk_size)
-
-    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
-
-    # Create a queue to hold dataframes to be saved
+
+    print(f'Calculating read count for {r1_file}...')
+    total_reads = count_reads_in_fastq(r1_file)
+    chunks_nr = int(total_reads / chunk_size)
+    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {r1_file}...')
+
+    # Queue for saving
     save_queue = Queue()

-    # Start a separate process for saving the data
-    save_process = Process(target=save_to_hdf, args=(save_queue, output_file, complevel, compression))
+    # Start the saving process
+    save_process = Process(target=saver_process, args=(save_queue, hdf5_file, unique_combinations_csv, qc_csv_file, comp_type, comp_level))
     save_process.start()

-    with gzip.open(fastq, "rt") as handle:
-        fastq_iter = SeqIO.parse(handle, "fastq")
-        pool = Pool(processes=n_jobs)
+    pool = Pool(n_jobs)

+    with gzip.open(r1_file, 'rt') as r1, gzip.open(r2_file, 'rt') as r2:
+        fastq_iter = zip(r1, r2)
         while True:
-            # Read n_jobs * chunk_size records into memory
-            records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
-
-            if not records:
-                break
-
-            analyzed_chunks_1 = analyzed_chunks
             start_time = time.time()
-            chunk_count += 1
-            analyzed_chunks = int(chunk_count*n_jobs)
-            analyzed_chunks_ls = list(range(analyzed_chunks_1, analyzed_chunks))
-
-            # Split the records into chunks to be processed by each core
-            chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
-
-            # Process each chunk in parallel
-            dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
-
-            # Queue the dataframes to be saved
-            df = pd.concat(dfs, ignore_index=True)
-            save_queue.put((chunk_count, df))
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-
-            for az_chunks in analyzed_chunks_ls:
-                print_progress(files_processed=az_chunks, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Mapping Barcodes")
-
-            del records, chunked_records, dfs, df
-
-    pool.close()
-    pool.join()
-
-    # Send a sentinel value to indicate the saving process should stop
-    save_queue.put((None, None))
-    save_process.join()
-
-def extract_barcodes_from_fastq_v1(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
-    """
-    Extracts barcodes from a FASTQ file and saves the results to an output file.
-
-    Parameters:
-    - fastq (str): Path to the input FASTQ file.
-    - output_file (str): Path to the output file where the barcode data will be saved.
-    - chunk_size (int): Number of records to process in each chunk.
-    - barcode_mapping (dict): Mapping of barcode keys to CSV file paths, barcode coordinates, and reverse complement flags.
-    - n_jobs (int, optional): Number of parallel processes to use for barcode mapping. Defaults to None.
-    - compression (str, optional): Compression algorithm to use for the output file. Defaults to 'zlib'.
-    - complevel (int, optional): Compression level to use for the output file. Defaults to 9.
-    """
-
-    from .utils import print_progress, count_reads_in_fastq
-
-    # Ensure the file is deleted before starting
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    # Validate and process barcode mapping
-    barcode_dicts = {}
-    barcode_coordinates = {}
-    reverse_complements = {}
-
-    for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
-        df = pd.read_csv(csv_path)
-        if 'name' not in df.columns or 'sequence' not in df.columns:
-            print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
-            return
-        barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
-        barcode_coordinates[key] = coordinates
-        reverse_complements[key] = reverse_comp
-
-    if n_jobs is None:
-        n_jobs = cpu_count() - 2
-
-    chunk_count = 0
-    time_ls = []
-
-    print(f'Calculating read count for {fastq} ...')
-    total_reads = count_reads_in_fastq(fastq)
-    chunks_nr = (int(total_reads/chunk_size) + 1)
-
-    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
-    with gzip.open(fastq, "rt") as handle:
-        fastq_iter = SeqIO.parse(handle, "fastq")
-        pool = Pool(processes=n_jobs)
-
-        while True:
-            # Read n_jobs * chunk_size records into memory
-            records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
+            r1_chunk = []
+            r2_chunk = []
+
+            for _ in range(chunk_size):
+                try:
+                    r1_lines = [r1.readline().strip() for _ in range(4)]
+                    r2_lines = [r2.readline().strip() for _ in range(4)]
+                    r1_chunk.append('\n'.join(r1_lines))
+                    r2_chunk.append('\n'.join(r2_lines))
+                except StopIteration:
+                    break

-            if not records:
+            if not r1_chunk:
                 break

-            start_time = time.time()
             chunk_count += 1
+            chunk_data = (r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv)

-            # Split the records into chunks to be processed by each core
-            chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
-
-            # Process each chunk in parallel
-            dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
+            # Process chunks in parallel
+            result = pool.apply_async(process_chunk, (chunk_data,))
+            df, unique_combinations, qc_df = result.get()

-            # Join the results
-            df = pd.concat(dfs, ignore_index=True)
-
-            # Save to HDF5 with compression
-            print(f'Writing chunk {chunk_count} to H5PY ...')
-            df.to_hdf(output_file, key=f'chunk_{chunk_count}', mode='a', format='table', complevel=complevel, complib=compression)
+            # Queue the results for saving
+            save_queue.put((df, unique_combinations, qc_df))

             end_time = time.time()
             chunk_time = end_time - start_time
             time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=None, operation_type=" Mapping Barcodes")
+            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type="Mapping Barcodes")

-            del records, chunked_records, dfs, df
+    # Cleanup the pool
+    pool.close()
+    pool.join()

-        pool.close()
-        pool.join()
+    # Send stop signal to saver process
+    save_queue.put("STOP")
+    save_process.join()

 def generate_barecode_mapping(settings={}):
+
     from .settings import set_default_generate_barecode_mapping

     settings = set_default_generate_barecode_mapping(settings)

     samples_dict = parse_gz_files(settings['src'])
+
     for key in samples_dict:
-        if samples_dict[key]['R1'] and samples_dict[key]['R2']:
-            R1 = samples_dict[key]['R1']
-            R2 = samples_dict[key]['R2']
-            consensus_dir = os.path.join(os.path.dirname(R1), 'consensus')
-            os.makedirs(consensus_dir, exist_ok=True)
-            consensus = os.path.join(consensus_dir, f"{key}_consensus.fastq.gz")
-            h5 = os.path.join(consensus_dir, f"{key}_barecodes.h5")
-
-            if not os.path.exists(consensus):
-                consensus_sequence(R1, R2, consensus, settings['chunk_size'])
-            else:
-                print(f"Consensus file {consensus} already exists. Mapping barecodes.")
-
-            extract_barcodes_from_fastq(fastq=consensus,
-                                        output_file=h5,
-                                        chunk_size=settings['chunk_size'],
-                                        barcode_mapping=settings['barcode_mapping'],
-                                        n_jobs=settings['n_jobs'],
-                                        compression=settings['compression'],
-                                        complevel=settings['complevel'])

+        if samples_dict[key]['R1'] and samples_dict[key]['R2']:
+
+            dst = os.path.join(settings['src'], key)
+            hdf5_file = os.path.join(dst, 'annotated_reads.h5')
+            unique_combinations_csv = os.path.join(dst, 'unique_combinations.csv')
+            qc_csv_file = os.path.join(dst, 'qc.csv')
+            os.makedirs(dst, exist_ok=True)
+
+            print(f'Analyzing reads from sample {key}')
+
+            chunked_processing(r1_file=samples_dict[key]['R1'],
+                               r2_file=samples_dict[key]['R2'],
+                               target_sequence=settings['target_sequence'],
+                               offset_start=settings['offset_start'],
+                               expected_end=settings['expected_end'],
+                               column_csv=settings['column_csv'],
+                               grna_csv=settings['grna_csv'],
+                               row_csv=settings['row_csv'],
+                               save_h5 = settings['save_h5'],
+                               comp_type = settings['comp_type'],
+                               comp_level=settings['comp_level'],
+                               hdf5_file=hdf5_file,
+                               unique_combinations_csv=unique_combinations_csv,
+                               qc_csv_file=qc_csv_file,
+                               chunk_size=settings['chunk_size'],
+                               n_jobs=settings['n_jobs'])
