speconsense-0.7.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/summarize/io.py
@@ -0,0 +1,723 @@
"""File input/output operations for speconsense-summarize.

Provides functions for loading consensus sequences, writing output files,
and managing the output directory structure.
"""

import os
import re
import glob
import csv
import json
import shutil
import logging
import datetime
from typing import List, Dict, Tuple, Optional
from collections import defaultdict

from Bio import SeqIO

from speconsense.types import ConsensusInfo

from .fields import FastaField, format_fasta_header
from .clustering import select_variants


def parse_consensus_header(header: str) -> Tuple[Optional[str], Optional[int], Optional[int],
                                                 Optional[List[str]], Optional[float], Optional[float]]:
    """
    Extract information from a Speconsense consensus FASTA header.

    Parses the sample name, RiC, size, primers, and read identity metrics.

    Returns:
        Tuple of (sample_name, ric, size, primers, rid, rid_min)
    """
    sample_match = re.match(r'>([^ ]+) (.+)', header)
    if not sample_match:
        return None, None, None, None, None, None

    sample_name = sample_match.group(1)
    info_string = sample_match.group(2)

    # Extract RiC value
    ric_match = re.search(r'ric=(\d+)', info_string)
    ric = int(ric_match.group(1)) if ric_match else 0

    # Extract size value
    size_match = re.search(r'size=(\d+)', info_string)
    size = int(size_match.group(1)) if size_match else 0

    # Extract primers value
    primers_match = re.search(r'primers=([^,\s]+(?:,[^,\s]+)*)', info_string)
    primers = primers_match.group(1).split(',') if primers_match else None

    # Extract read identity metrics (percentages in headers, convert to fractions)
    rid_match = re.search(r'rid=([\d.]+)', info_string)
    rid = float(rid_match.group(1)) / 100.0 if rid_match else None

    rid_min_match = re.search(r'rid_min=([\d.]+)', info_string)
    rid_min = float(rid_min_match.group(1)) / 100.0 if rid_min_match else None

    return sample_name, ric, size, primers, rid, rid_min

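For orientation, a minimal usage sketch of the parser above; the header is hypothetical and only illustrates the ric/size/primers/rid tags the regexes look for, it is not part of io.py:

# Hypothetical header, illustrative only:
header = ">HB123-c1 ric=42 size=120 primers=ITS1F,ITS4 rid=98.7 rid_min=95.2"
name, ric, size, primers, rid, rid_min = parse_consensus_header(header)
# name == "HB123-c1", ric == 42, size == 120, primers == ["ITS1F", "ITS4"],
# rid -> 0.987 and rid_min -> 0.952 (percentages converted to fractions)
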
def load_consensus_sequences(
        source_folder: str,
        min_ric: int,
        min_len: int = 0,
        max_len: int = 0
) -> List[ConsensusInfo]:
    """Load all consensus sequences from speconsense output files.

    Args:
        source_folder: Directory containing speconsense output files
        min_ric: Minimum Reads in Consensus threshold
        min_len: Minimum sequence length (0 = disabled)
        max_len: Maximum sequence length (0 = disabled)

    Returns:
        List of ConsensusInfo objects passing all filters
    """
    consensus_list = []
    filtered_by_ric = 0
    filtered_by_len = 0

    # Find all consensus FASTA files matching the new naming pattern
    fasta_pattern = os.path.join(source_folder, "*-all.fasta")
    fasta_files = sorted(glob.glob(fasta_pattern))

    for fasta_file in fasta_files:
        logging.debug(f"Processing consensus file: {fasta_file}")

        with open(fasta_file, 'r') as f:
            for record in SeqIO.parse(f, "fasta"):
                sample_name, ric, size, primers, rid, rid_min = \
                    parse_consensus_header(f">{record.description}")

                if not sample_name:
                    continue

                # RiC filter
                if ric < min_ric:
                    filtered_by_ric += 1
                    continue

                # Length filters (applied before merging to avoid chimeric contamination)
                seq_len = len(record.seq)
                if min_len > 0 and seq_len < min_len:
                    logging.debug(f"Filtered {sample_name}: length {seq_len} < min_len {min_len}")
                    filtered_by_len += 1
                    continue
                if max_len > 0 and seq_len > max_len:
                    logging.debug(f"Filtered {sample_name}: length {seq_len} > max_len {max_len}")
                    filtered_by_len += 1
                    continue

                # Extract cluster ID from sample name (e.g., "sample-c1" -> "c1")
                cluster_match = re.search(r'-c(\d+)$', sample_name)
                cluster_id = cluster_match.group(0) if cluster_match else sample_name

                consensus_info = ConsensusInfo(
                    sample_name=sample_name,
                    cluster_id=cluster_id,
                    sequence=str(record.seq),
                    ric=ric,
                    size=size,
                    file_path=fasta_file,
                    snp_count=None,  # No SNP info from original speconsense output
                    primers=primers,
                    raw_ric=None,  # Not available in original speconsense output
                    rid=rid,  # Mean read identity if available
                    rid_min=rid_min,  # Minimum read identity if available
                )
                consensus_list.append(consensus_info)

    # Log loading summary
    filter_parts = [f"Loaded {len(consensus_list)} consensus sequences from {len(fasta_files)} files"]
    if filtered_by_ric > 0:
        filter_parts.append(f"filtered {filtered_by_ric} by RiC")
    if filtered_by_len > 0:
        filter_parts.append(f"filtered {filtered_by_len} by length")
    logging.info(", ".join(filter_parts))

    return consensus_list

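A minimal illustrative call of the loader above; the directory name and thresholds are hypothetical:

# Keep only clusters with at least 5 reads and lengths between 400 and 1200 bp.
sequences = load_consensus_sequences("run01_output", min_ric=5, min_len=400, max_len=1200)
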
def load_metadata_from_json(source_folder: str, sample_name: str) -> Optional[Dict]:
    """Load metadata JSON file for a consensus sequence.

    Args:
        source_folder: Source directory containing cluster_debug folder
        sample_name: Sample name (e.g., "sample-c1")

    Returns:
        Dictionary with metadata, or None if file not found or error
    """
    # Construct path to metadata file
    debug_dir = os.path.join(source_folder, "cluster_debug")
    metadata_file = os.path.join(debug_dir, f"{sample_name}-metadata.json")

    if not os.path.exists(metadata_file):
        logging.debug(f"Metadata file not found: {metadata_file}")
        return None

    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
            return metadata
    except Exception as e:
        logging.warning(f"Failed to load metadata from {metadata_file}: {e}")
        return None

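Illustrative usage (the specimen name and the metadata key are hypothetical); the resolved path is <source_folder>/cluster_debug/<sample_name>-metadata.json:

meta = load_metadata_from_json("run01_output", "HB123-c1")
reads_used = meta.get("reads_used") if meta else None  # key name is hypothetical
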
def create_output_structure(groups: Dict[int, List[ConsensusInfo]],
                            max_variants: int,
                            variant_selection: str,
                            summary_folder: str) -> Tuple[List[ConsensusInfo], Dict]:
    """
    Create the final output structure with proper naming.
    Returns final consensus list and naming information.
    """
    os.makedirs(summary_folder, exist_ok=True)
    os.makedirs(os.path.join(summary_folder, 'FASTQ Files'), exist_ok=True)
    os.makedirs(os.path.join(summary_folder, 'variants'), exist_ok=True)
    os.makedirs(os.path.join(summary_folder, 'variants', 'FASTQ Files'), exist_ok=True)

    final_consensus = []
    naming_info = {}

    # Sort groups by size of largest member (descending)
    sorted_groups = sorted(groups.items(),
                           key=lambda x: max(m.size for m in x[1]),
                           reverse=True)

    for group_idx, (_, group_members) in enumerate(sorted_groups, 1):
        # Select variants for this group
        selected_variants = select_variants(group_members, max_variants, variant_selection, group_number=group_idx)

        # Create naming for this group
        group_naming = []

        for variant_idx, variant in enumerate(selected_variants):
            # All variants get .v suffix (primary is .v1, additional are .v2, .v3, etc.)
            # Use rsplit to split on the LAST '-c' (specimen names may contain '-c')
            specimen_base = variant.sample_name.rsplit('-c', 1)[0]
            new_name = f"{specimen_base}-{group_idx}.v{variant_idx + 1}"

            # Use _replace to preserve all fields while updating sample_name
            renamed_variant = variant._replace(sample_name=new_name)

            final_consensus.append(renamed_variant)
            group_naming.append((variant.sample_name, new_name))

        naming_info[group_idx] = group_naming

    return final_consensus, naming_info

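A small sketch of the renaming convention applied above (the specimen name is hypothetical): the group index replaces the original cluster id and each selected variant gets a .v ordinal.

specimen_base = "HB123-c3".rsplit('-c', 1)[0]   # "HB123"
new_name = f"{specimen_base}-1.v2"              # second variant of group 1 -> "HB123-1.v2"
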
def write_consensus_fastq(consensus: ConsensusInfo,
                          merge_traceability: Dict[str, List[str]],
                          naming_info: Dict,
                          fastq_dir: str,
                          fastq_lookup: Dict[str, List[str]],
                          original_consensus_lookup: Dict[str, ConsensusInfo]):
    """Write FASTQ file for a consensus by concatenating existing FASTQ files."""
    # Find the original cluster name(s) by looking through naming_info
    original_clusters = []
    for group_naming in naming_info.values():
        for original_name, final_name in group_naming:
            if final_name == consensus.sample_name:
                # This original cluster contributed to our final consensus
                if original_name in merge_traceability:
                    # This was a merged cluster, get all original contributors
                    original_clusters.extend(merge_traceability[original_name])
                else:
                    # This was not merged, just add it directly
                    original_clusters.append(original_name)
                break

    if not original_clusters:
        logging.warning(f"Could not find contributing clusters for {consensus.sample_name}")
        return

    # Find FASTQ files for these clusters using lookup table
    # Track cluster metadata alongside files: [(cluster_name, ric, [files]), ...]
    fastq_output_path = os.path.join(fastq_dir, f"{consensus.sample_name}-RiC{consensus.ric}.fastq")
    cluster_files = []

    for cluster_name in original_clusters:
        # Look for specimen name from cluster name (e.g., "sample-c1" -> "sample")
        if '-c' in cluster_name:
            specimen_name = cluster_name.rsplit('-c', 1)[0]
            debug_files = fastq_lookup.get(specimen_name, [])

            # Get the original RiC value for this cluster
            original_ric = original_consensus_lookup.get(cluster_name)
            if not original_ric:
                logging.warning(f"Could not find original consensus info for {cluster_name}")
                continue

            # Filter files that match this specific cluster with exact RiC value
            # Match the full pattern: {specimen}-c{cluster}-RiC{exact_ric}-{stage}.fastq
            # This prevents matching multiple RiC values for the same cluster
            cluster_ric_pattern = f"{cluster_name}-RiC{original_ric.ric}-"
            matching_files = [f for f in debug_files if cluster_ric_pattern in f]

            # Validate that matched files exist and log any issues
            valid_files = []
            for mf in matching_files:
                if not os.path.exists(mf):
                    logging.warning(f"Matched file does not exist: {mf}")
                elif os.path.getsize(mf) == 0:
                    logging.warning(f"Matched file is empty: {mf}")
                else:
                    valid_files.append(mf)

            if valid_files:
                cluster_files.append((cluster_name, original_ric.ric, valid_files))

    if not cluster_files:
        logging.warning(f"No FASTQ files found for {consensus.sample_name} from clusters: {original_clusters}")
        return

    # Concatenate files with cluster boundary delimiters
    # Each cluster gets a synthetic FASTQ record as a delimiter before its reads
    files_processed = 0
    try:
        with open(fastq_output_path, 'w') as outf:
            for idx, (cluster_name, ric, files) in enumerate(cluster_files, 1):
                # Count reads in this cluster's files
                cluster_reads = 0
                for f in files:
                    with open(f, 'r') as rf:
                        cluster_reads += sum(1 for _ in rf) // 4

                # Write cluster boundary delimiter
                outf.write(f"@CLUSTER_BOUNDARY_{idx}:{cluster_name}:RiC={ric}:reads={cluster_reads}\n")
                outf.write("NNNNNNNNNN\n")
                outf.write("+\n")
                outf.write("!!!!!!!!!!\n")

                # Write cluster reads
                for input_file in files:
                    try:
                        with open(input_file, 'r') as inf:
                            shutil.copyfileobj(inf, outf)
                        files_processed += 1
                    except Exception as e:
                        logging.debug(f"Could not concatenate {input_file}: {e}")

        # Check if the output file has content
        output_size = os.path.getsize(fastq_output_path)
        total_files = sum(len(files) for _, _, files in cluster_files)
        if output_size > 0:
            # Count reads for logging by quickly counting lines and dividing by 4
            with open(fastq_output_path, 'r') as f:
                line_count = sum(1 for line in f)
            read_count = line_count // 4
            logging.debug(f"Concatenated {files_processed}/{total_files} files from {len(cluster_files)} clusters ({output_size:,} bytes) with ~{read_count} reads to {fastq_output_path}")
        else:
            # Debug: check what files were supposed to be concatenated
            file_info = []
            for _, _, files in cluster_files:
                for input_file in files:
                    size = os.path.getsize(input_file) if os.path.exists(input_file) else 0
                    file_info.append(f"{os.path.basename(input_file)}:{size}B")

            logging.warning(f"No data written for {consensus.sample_name} - input files: {', '.join(file_info)}")
            # Remove empty output file
            try:
                os.unlink(fastq_output_path)
            except OSError:
                pass

    except Exception as e:
        logging.error(f"Failed to write concatenated FASTQ file {fastq_output_path}: {e}")

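The synthetic delimiter records above make per-cluster boundaries recoverable later. A minimal sketch (a hypothetical helper, not part of the package) for skipping them when re-reading a combined FASTQ with Biopython:

from Bio import SeqIO

def iter_real_reads(path):
    """Yield sequencing reads, skipping the synthetic CLUSTER_BOUNDARY delimiter records."""
    for rec in SeqIO.parse(path, "fastq"):
        if rec.id.startswith("CLUSTER_BOUNDARY_"):
            continue  # delimiter inserted by write_consensus_fastq, not a real read
        yield rec
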
def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
                              merge_traceability: Dict[str, List[str]],
                              naming_info: Dict,
                              summary_folder: str,
                              fastq_dir: str,
                              fastq_lookup: Dict[str, List[str]],
                              original_consensus_lookup: Dict[str, ConsensusInfo],
                              fasta_fields: List[FastaField]
                              ) -> List[Tuple[ConsensusInfo, str]]:
    """
    Write individual FASTA and FASTQ files for a single specimen.
    Does NOT write summary files (summary.fasta, summary.txt).

    Args:
        fasta_fields: List of FastaField objects defining header format

    Returns:
        List of (raw_consensus, original_cluster_name) tuples for later use in summary.fasta
    """
    # Generate .raw file consensuses for merged variants
    raw_file_consensuses = []
    for consensus in specimen_consensus:
        # Only create .raw files if this consensus was actually merged
        if consensus.raw_ric and len(consensus.raw_ric) > 1:
            # Find the original cluster name from naming_info
            original_cluster_name = None
            for group_naming in naming_info.values():
                for orig_name, final_name in group_naming:
                    if final_name == consensus.sample_name:
                        original_cluster_name = orig_name
                        break
                if original_cluster_name:
                    break

            # Get contributing clusters from merge_traceability
            if original_cluster_name and original_cluster_name in merge_traceability:
                contributing_clusters = merge_traceability[original_cluster_name]

                # Sort by size (descending) to match .raw1, .raw2 ordering
                contributing_infos = []
                for cluster_name in contributing_clusters:
                    if cluster_name in original_consensus_lookup:
                        contributing_infos.append(original_consensus_lookup[cluster_name])

                contributing_infos.sort(key=lambda x: x.size, reverse=True)

                # Create .raw file entries
                for raw_idx, raw_info in enumerate(contributing_infos, 1):
                    raw_name = f"{consensus.sample_name}.raw{raw_idx}"

                    # Create new ConsensusInfo with .raw name but original sequence/metadata
                    raw_consensus = ConsensusInfo(
                        sample_name=raw_name,
                        cluster_id=raw_info.cluster_id,
                        sequence=raw_info.sequence,
                        ric=raw_info.ric,
                        size=raw_info.size,
                        file_path=raw_info.file_path,
                        snp_count=None,  # Pre-merge, no SNPs from merging
                        primers=raw_info.primers,
                        raw_ric=None,  # Pre-merge, not merged
                        rid=raw_info.rid,  # Preserve read identity metrics
                        rid_min=raw_info.rid_min,
                    )
                    raw_file_consensuses.append((raw_consensus, raw_info.sample_name))

    # Write individual FASTA files with custom field formatting
    for consensus in specimen_consensus:
        output_file = os.path.join(summary_folder, f"{consensus.sample_name}-RiC{consensus.ric}.fasta")
        with open(output_file, 'w') as f:
            header = format_fasta_header(consensus, fasta_fields)
            f.write(f">{header}\n")
            f.write(f"{consensus.sequence}\n")

    # Write FASTQ files for each final consensus containing all contributing reads
    for consensus in specimen_consensus:
        write_consensus_fastq(consensus, merge_traceability, naming_info, fastq_dir, fastq_lookup, original_consensus_lookup)

    # Write .raw files (individual FASTA and FASTQ for pre-merge variants)
    for raw_consensus, original_cluster_name in raw_file_consensuses:
        # Write individual FASTA file with custom field formatting
        output_file = os.path.join(summary_folder, 'variants', f"{raw_consensus.sample_name}-RiC{raw_consensus.ric}.fasta")
        with open(output_file, 'w') as f:
            header = format_fasta_header(raw_consensus, fasta_fields)
            f.write(f">{header}\n")
            f.write(f"{raw_consensus.sequence}\n")

        # Write FASTQ file by finding the original cluster's FASTQ
        # Look for specimen name from original cluster name
        if '-c' in original_cluster_name:
            specimen_name = original_cluster_name.rsplit('-c', 1)[0]
            debug_files = fastq_lookup.get(specimen_name, []) if fastq_lookup else []

            # Filter files that match this specific cluster with exact RiC value
            # Use the raw_consensus.ric which came from the original cluster
            cluster_ric_pattern = f"{original_cluster_name}-RiC{raw_consensus.ric}-"
            matching_files = [f for f in debug_files if cluster_ric_pattern in f]

            if matching_files:
                fastq_output_path = os.path.join(summary_folder, 'variants', 'FASTQ Files', f"{raw_consensus.sample_name}-RiC{raw_consensus.ric}.fastq")
                try:
                    with open(fastq_output_path, 'wb') as outf:
                        for input_file in matching_files:
                            if os.path.exists(input_file) and os.path.getsize(input_file) > 0:
                                with open(input_file, 'rb') as inf:
                                    shutil.copyfileobj(inf, outf)
                    logging.debug(f"Wrote .raw FASTQ: {os.path.basename(fastq_output_path)}")
                except Exception as e:
                    logging.debug(f"Could not write .raw FASTQ for {raw_consensus.sample_name}: {e}")

    return raw_file_consensuses

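A small worked example of the .raw ordering used above (cluster names and sizes are made up): contributors are sorted by size, largest first, and numbered under the merged consensus name.

contributors = [("HB123-c4", 15), ("HB123-c1", 80)]   # (cluster, size), hypothetical
ordered = sorted(contributors, key=lambda c: c[1], reverse=True)
raw_names = [f"HB123-1.v1.raw{i}" for i, _ in enumerate(ordered, 1)]
# raw_names == ["HB123-1.v1.raw1", "HB123-1.v1.raw2"]; .raw1 corresponds to HB123-c1
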
def build_fastq_lookup_table(source_dir: str = ".") -> Dict[str, List[str]]:
    """
    Build a lookup table mapping specimen base names to their cluster FASTQ files.
    This avoids repeated directory scanning during file copying.
    """
    lookup = defaultdict(list)

    # Initialize variables before conditional block
    debug_files = []
    selected_stage = None

    # Scan cluster_debug directory once to build lookup table
    cluster_debug_path = os.path.join(source_dir, "cluster_debug")
    if os.path.exists(cluster_debug_path):
        # Define priority order for stage types (first match wins)
        # This prevents including multiple versions of the same cluster
        stage_priority = ['sampled', 'reads', 'untrimmed']

        # Try each stage type in priority order until we find files
        for stage in stage_priority:
            debug_files = glob.glob(os.path.join(cluster_debug_path, f"*-{stage}.fastq"))
            if debug_files:
                selected_stage = stage
                break

        # If no files found with known stage types, try generic pattern
        if not debug_files:
            debug_files = glob.glob(os.path.join(cluster_debug_path, "*.fastq"))
            selected_stage = "unknown"

        # Use regex to robustly parse the filename pattern
        # Pattern: {specimen}-c{cluster}-RiC{size}-{stage}.fastq
        # Where stage can be: sampled, reads, untrimmed, or other variants
        pattern = re.compile(r'^(.+)-c(\d+)-RiC(\d+)-([a-z]+)\.fastq$')

        for fastq_path in debug_files:
            filename = os.path.basename(fastq_path)
            match = pattern.match(filename)
            if match:
                specimen_name = match.group(1)  # Extract specimen name
                # cluster_num = match.group(2)  # Available if needed
                # ric_value = match.group(3)    # Available if needed
                # stage = match.group(4)        # Stage: sampled, reads, untrimmed, etc.
                lookup[specimen_name].append(fastq_path)
            else:
                logging.warning(f"Skipping file with unexpected name pattern: {filename}")

    if debug_files:
        logging.debug(f"Built FASTQ lookup table for {len(lookup)} specimens with {sum(len(files) for files in lookup.values())} {selected_stage} files")
    else:
        logging.debug("No FASTQ files found in cluster_debug directory")
    return dict(lookup)

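An illustrative parse with the filename pattern used above (the specimen name is hypothetical):

import re

pattern = re.compile(r'^(.+)-c(\d+)-RiC(\d+)-([a-z]+)\.fastq$')
m = pattern.match("HB123-c2-RiC37-sampled.fastq")
# m.group(1) == "HB123"    (specimen -> lookup key)
# m.group(2) == "2"        (cluster number)
# m.group(3) == "37"       (RiC)
# m.group(4) == "sampled"  (stage)
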
def write_position_debug_file(
        sequences_with_pos_outliers: List[Tuple],
        summary_folder: str,
        threshold: float
):
    """Write detailed debug information about high-error positions.

    Creates a separate file with per-position base composition and error details
    to help validate positional phasing and quality analysis.

    Args:
        sequences_with_pos_outliers: List of (ConsensusInfo, result_dict) tuples
        summary_folder: Output directory for the debug file
        threshold: Error rate threshold used for flagging positions
    """
    debug_path = os.path.join(summary_folder, 'position_errors_debug.txt')

    with open(debug_path, 'w') as f:
        f.write("POSITION ERROR DETAILED DEBUG REPORT\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Threshold: {threshold:.1%} (positions with error rate above this are flagged)\n")
        f.write(f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        if not sequences_with_pos_outliers:
            f.write("No sequences with high-error positions found.\n")
            return

        # Sort by total nucleotide errors descending
        sorted_seqs = sorted(
            sequences_with_pos_outliers,
            key=lambda x: x[1].get('total_nucleotide_errors', 0),
            reverse=True
        )

        for cons, result in sorted_seqs:
            # Handle merged sequences (component_name in result)
            if 'component_name' in result:
                display_name = f"{cons.sample_name} (component: {result['component_name']})"
                ric = result.get('component_ric', cons.ric)
            else:
                display_name = cons.sample_name
                ric = result.get('ric', cons.ric)

            f.write("=" * 80 + "\n")
            f.write(f"SEQUENCE: {display_name}\n")
            f.write(f"RiC: {ric}\n")
            f.write(f"High-error positions: {result['num_outlier_positions']}\n")
            f.write(f"Mean error rate at flagged positions: {result['mean_outlier_error_rate']:.1%}\n")
            f.write(f"Total nucleotide errors: {result['total_nucleotide_errors']}\n")
            f.write("-" * 80 + "\n\n")

            outlier_details = result.get('outlier_details', [])
            if not outlier_details:
                # Fall back to basic info if detailed stats not available
                for pos, rate, count in result.get('outlier_positions', []):
                    f.write(f" Position {pos+1}: error_rate={rate:.1%}, error_count={count}\n")
                f.write("\n")
                continue

            for detail in outlier_details:
                cons_pos = detail['consensus_position']
                msa_pos = detail.get('msa_position')
                # Display as 1-indexed for user-friendliness
                cons_pos_display = cons_pos + 1 if cons_pos is not None else "?"
                msa_pos_display = msa_pos + 1 if msa_pos is not None else "?"

                f.write(f"Position {cons_pos_display} (MSA: {msa_pos_display}):\n")
                f.write(f" Consensus base: {detail['consensus_nucleotide']}\n")
                f.write(f" Coverage: {detail['coverage']}\n")
                f.write(f" Error rate: {detail['error_rate']:.1%}\n")
                f.write(f" Error count: {detail['error_count']}\n")
                f.write(f" Substitutions: {detail['sub_count']}, Insertions: {detail['ins_count']}, Deletions: {detail['del_count']}\n")

                # Format base composition (raw counts from MSA)
                base_comp = detail['base_composition']
                hp_comp = detail.get('homopolymer_composition', {})

                if base_comp:
                    total = sum(base_comp.values())
                    comp_str = ", ".join(
                        f"{base}:{count}({count/total*100:.0f}%)"
                        for base, count in sorted(base_comp.items(), key=lambda x: -x[1])
                        if count > 0
                    )
                    f.write(f" Raw base composition: {comp_str}\n")

                # Format homopolymer composition if present
                if hp_comp and any(v > 0 for v in hp_comp.values()):
                    hp_str = ", ".join(
                        f"{base}:{count}"
                        for base, count in sorted(hp_comp.items(), key=lambda x: -x[1])
                        if count > 0
                    )
                    f.write(f" Homopolymer length variants: {hp_str}\n")

                # Calculate and show effective composition (raw - HP adjustments)
                # HP variants are normalized away in error calculation
                if base_comp:
                    effective_comp = {}
                    for base in base_comp:
                        raw = base_comp.get(base, 0)
                        hp_adj = hp_comp.get(base, 0)
                        effective = raw - hp_adj
                        if effective > 0:
                            effective_comp[base] = effective

                    if effective_comp:
                        eff_total = sum(effective_comp.values())
                        eff_str = ", ".join(
                            f"{base}:{count}({count/eff_total*100:.0f}%)"
                            for base, count in sorted(effective_comp.items(), key=lambda x: -x[1])
                            if count > 0
                        )
                        f.write(f" Effective composition (HP-normalized): {eff_str}\n")

                f.write("\n")

            # Show context: consensus sequence around flagged positions
            consensus_seq = result.get('consensus_seq', '')
            if consensus_seq and outlier_details:
                f.write("Consensus sequence context (flagged positions marked with *):\n")
                # Mark positions in the sequence
                marked_positions = set()
                for detail in outlier_details:
                    if detail['consensus_position'] is not None:
                        marked_positions.add(detail['consensus_position'])

                # Show sequence in chunks of 60 with position markers
                chunk_size = 60
                for chunk_start in range(0, len(consensus_seq), chunk_size):
                    chunk_end = min(chunk_start + chunk_size, len(consensus_seq))
                    chunk = consensus_seq[chunk_start:chunk_end]

                    # Position line
                    f.write(f" {chunk_start+1:>5} ")
                    f.write(chunk)
                    f.write(f" {chunk_end}\n")

                    # Marker line (padded to align under the sequence characters)
                    f.write("       ")
                    for i in range(chunk_start, chunk_end):
                        if i in marked_positions:
                            f.write("*")
                        else:
                            f.write(" ")
                    f.write("\n")

                f.write("\n")

    logging.info(f"Position error debug file written to: {debug_path}")

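A tiny worked example of the HP-normalized ("effective") composition computed above, with made-up counts:

base_comp = {"A": 30, "G": 6, "-": 4}   # raw counts at a flagged position (hypothetical)
hp_comp = {"A": 3, "-": 4}              # counts attributed to homopolymer-length variation
effective_comp = {b: base_comp[b] - hp_comp.get(b, 0)
                  for b in base_comp if base_comp[b] - hp_comp.get(b, 0) > 0}
# effective_comp == {"A": 27, "G": 6}
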
def write_output_files(final_consensus: List[ConsensusInfo],
                       all_raw_consensuses: List[Tuple[ConsensusInfo, str]],
                       summary_folder: str,
                       temp_log_file: str,
                       fasta_fields: List[FastaField]):
    """
    Write summary files only. Individual data files are already written per-specimen.

    Args:
        fasta_fields: List of FastaField objects defining header format

    Writes:
        - summary.fasta: Combined index of all sequences
        - summary.txt: Statistics and totals
        - summarize_log.txt: Copy of processing log
    """

    # Write combined summary.fasta with custom field formatting
    # Include only final consensus sequences (not .raw pre-merge variants)
    summary_fasta_path = os.path.join(summary_folder, 'summary.fasta')
    with open(summary_fasta_path, 'w') as f:
        # Write final consensus sequences
        for consensus in final_consensus:
            header = format_fasta_header(consensus, fasta_fields)
            f.write(f">{header}\n")
            f.write(f"{consensus.sequence}\n")

    # Write summary statistics
    summary_txt_path = os.path.join(summary_folder, 'summary.txt')
    with open(summary_txt_path, 'w') as f:
        writer = csv.writer(f, delimiter='\t', lineterminator='\n')
        writer.writerow(['Filename', 'Length', 'Reads in Consensus', 'Multiple'])

        unique_samples = set()
        total_ric = 0
        specimen_counters = {}

        for consensus in final_consensus:
            base_name = consensus.sample_name.split('-')[0]

            # Initialize counter for new specimen
            if base_name not in specimen_counters:
                specimen_counters[base_name] = 1
            else:
                specimen_counters[base_name] += 1

            multiple_id = specimen_counters[base_name]
            writer.writerow([consensus.sample_name, len(consensus.sequence), consensus.ric, multiple_id])
            unique_samples.add(base_name)
            total_ric += consensus.ric

        writer.writerow([])
        writer.writerow(['Total Unique Samples', len(unique_samples)])
        writer.writerow(['Total Consensus Sequences', len(final_consensus)])
        writer.writerow(['Total Reads in Consensus Sequences', total_ric])

    # Copy log file to summary directory as summarize_log.txt
    if temp_log_file:
        summarize_log_path = os.path.join(summary_folder, 'summarize_log.txt')
        try:
            # Flush any remaining log entries before copying
            handlers = logging.getLogger().handlers
            if len(handlers) > 1:
                handlers[1].flush()
            shutil.copy2(temp_log_file, summarize_log_path)
            logging.info(f"Created log file: {summarize_log_path}")
        except Exception as e:
            logging.warning(f"Could not copy log file: {e}")
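For reference, summary.txt is tab-separated and ends with the three total rows written above; an illustrative fragment with made-up names and values:

Filename	Length	Reads in Consensus	Multiple
HB123-1.v1	652	42	1
HB123-2.v1	648	17	2

Total Unique Samples	1
Total Consensus Sequences	2
Total Reads in Consensus Sequences	59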