speconsense-0.7.2-py3-none-any.whl

@@ -0,0 +1,723 @@
+ """File input/output operations for speconsense-summarize.
+
+ Provides functions for loading consensus sequences, writing output files,
+ and managing the output directory structure.
+ """
+
+ import os
+ import re
+ import glob
+ import csv
+ import json
+ import shutil
+ import logging
+ import datetime
+ from typing import List, Dict, Tuple, Optional
+ from collections import defaultdict
+
+ from Bio import SeqIO
+
+ from speconsense.types import ConsensusInfo
+
+ from .fields import FastaField, format_fasta_header
+ from .clustering import select_variants
+
+
+ def parse_consensus_header(header: str) -> Tuple[Optional[str], Optional[int], Optional[int],
+                                                  Optional[List[str]], Optional[float], Optional[float]]:
+     """
+     Extract information from a Speconsense consensus FASTA header.
+
+     Parses the sample name together with the ric, size, primers, and
+     read identity (rid, rid_min) fields.
+
+     Returns:
+         Tuple of (sample_name, ric, size, primers, rid, rid_min)
+     """
+     sample_match = re.match(r'>([^ ]+) (.+)', header)
+     if not sample_match:
+         return None, None, None, None, None, None
+
+     sample_name = sample_match.group(1)
+     info_string = sample_match.group(2)
+
+     # Extract RiC value
+     ric_match = re.search(r'ric=(\d+)', info_string)
+     ric = int(ric_match.group(1)) if ric_match else 0
+
+     # Extract size value
+     size_match = re.search(r'size=(\d+)', info_string)
+     size = int(size_match.group(1)) if size_match else 0
+
+     # Extract primers value
+     primers_match = re.search(r'primers=([^,\s]+(?:,[^,\s]+)*)', info_string)
+     primers = primers_match.group(1).split(',') if primers_match else None
+
+     # Extract read identity metrics (percentages in headers, convert to fractions)
+     rid_match = re.search(r'rid=([\d.]+)', info_string)
+     rid = float(rid_match.group(1)) / 100.0 if rid_match else None
+
+     rid_min_match = re.search(r'rid_min=([\d.]+)', info_string)
+     rid_min = float(rid_min_match.group(1)) / 100.0 if rid_min_match else None
+
+     return sample_name, ric, size, primers, rid, rid_min
+
+
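A quick illustration of the parser on a hypothetical header (the sample name and field values are invented for the example):

    header = ">ONT01-c1 ric=42 size=120 primers=ITS1F,ITS4 rid=98.75 rid_min=92.10"
    name, ric, size, primers, rid, rid_min = parse_consensus_header(header)
    print(name, ric, size, primers)  # ONT01-c1 42 120 ['ITS1F', 'ITS4']
    print(rid, rid_min)              # ~0.9875, ~0.921 (header percentages become fractions)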
+ def load_consensus_sequences(
+         source_folder: str,
+         min_ric: int,
+         min_len: int = 0,
+         max_len: int = 0
+ ) -> List[ConsensusInfo]:
+     """Load all consensus sequences from speconsense output files.
+
+     Args:
+         source_folder: Directory containing speconsense output files
+         min_ric: Minimum Reads in Consensus threshold
+         min_len: Minimum sequence length (0 = disabled)
+         max_len: Maximum sequence length (0 = disabled)
+
+     Returns:
+         List of ConsensusInfo objects passing all filters
+     """
+     consensus_list = []
+     filtered_by_ric = 0
+     filtered_by_len = 0
+
+     # Find all consensus FASTA files matching the *-all.fasta naming pattern
+     fasta_pattern = os.path.join(source_folder, "*-all.fasta")
+     fasta_files = sorted(glob.glob(fasta_pattern))
+
+     for fasta_file in fasta_files:
+         logging.debug(f"Processing consensus file: {fasta_file}")
+
+         with open(fasta_file, 'r') as f:
+             for record in SeqIO.parse(f, "fasta"):
+                 sample_name, ric, size, primers, rid, rid_min = \
+                     parse_consensus_header(f">{record.description}")
+
+                 if not sample_name:
+                     continue
+
+                 # RiC filter
+                 if ric < min_ric:
+                     filtered_by_ric += 1
+                     continue
+
+                 # Length filters (applied before merging to avoid chimeric contamination)
+                 seq_len = len(record.seq)
+                 if min_len > 0 and seq_len < min_len:
+                     logging.debug(f"Filtered {sample_name}: length {seq_len} < min_len {min_len}")
+                     filtered_by_len += 1
+                     continue
+                 if max_len > 0 and seq_len > max_len:
+                     logging.debug(f"Filtered {sample_name}: length {seq_len} > max_len {max_len}")
+                     filtered_by_len += 1
+                     continue
+
+                 # Extract cluster ID suffix from sample name (e.g., "sample-c1" -> "-c1"); fall back to full name
+                 cluster_match = re.search(r'-c(\d+)$', sample_name)
+                 cluster_id = cluster_match.group(0) if cluster_match else sample_name
+
+                 consensus_info = ConsensusInfo(
+                     sample_name=sample_name,
+                     cluster_id=cluster_id,
+                     sequence=str(record.seq),
+                     ric=ric,
+                     size=size,
+                     file_path=fasta_file,
+                     snp_count=None,  # No SNP info from original speconsense output
+                     primers=primers,
+                     raw_ric=None,  # Not available in original speconsense output
+                     rid=rid,  # Mean read identity if available
+                     rid_min=rid_min,  # Minimum read identity if available
+                 )
+                 consensus_list.append(consensus_info)
+
+     # Log loading summary
+     filter_parts = [f"Loaded {len(consensus_list)} consensus sequences from {len(fasta_files)} files"]
+     if filtered_by_ric > 0:
+         filter_parts.append(f"filtered {filtered_by_ric} by RiC")
+     if filtered_by_len > 0:
+         filter_parts.append(f"filtered {filtered_by_len} by length")
+     logging.info(", ".join(filter_parts))
+
+     return consensus_list
+
+
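A minimal sketch of how this loader might be called (directory name and thresholds are hypothetical):

    consensus_list = load_consensus_sequences(
        source_folder="run01_output",  # directory containing *-all.fasta files
        min_ric=10,                    # drop consensuses built from fewer than 10 reads
        min_len=400,                   # drop sequences shorter than 400 bp
        max_len=0,                     # 0 disables the upper length bound
    )
    for info in consensus_list:
        print(info.sample_name, info.ric, len(info.sequence))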
+ def load_metadata_from_json(source_folder: str, sample_name: str) -> Optional[Dict]:
+     """Load metadata JSON file for a consensus sequence.
+
+     Args:
+         source_folder: Source directory containing cluster_debug folder
+         sample_name: Sample name (e.g., "sample-c1")
+
+     Returns:
+         Dictionary with metadata, or None if file not found or error
+     """
+     # Construct path to metadata file
+     debug_dir = os.path.join(source_folder, "cluster_debug")
+     metadata_file = os.path.join(debug_dir, f"{sample_name}-metadata.json")
+
+     if not os.path.exists(metadata_file):
+         logging.debug(f"Metadata file not found: {metadata_file}")
+         return None
+
+     try:
+         with open(metadata_file, 'r') as f:
+             metadata = json.load(f)
+         return metadata
+     except Exception as e:
+         logging.warning(f"Failed to load metadata from {metadata_file}: {e}")
+         return None
+
+
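For completeness, a hypothetical call (folder and sample names are illustrative; the keys present depend on what speconsense wrote):

    metadata = load_metadata_from_json("run01_output", "ONT01-c1")
    if metadata is not None:
        for key, value in metadata.items():
            print(key, value)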
+ def create_output_structure(groups: Dict[int, List[ConsensusInfo]],
+                             max_variants: int,
+                             variant_selection: str,
+                             summary_folder: str) -> Tuple[List[ConsensusInfo], Dict]:
+     """
+     Create the final output structure with proper naming.
+     Returns the final consensus list and naming information.
+     """
+     os.makedirs(summary_folder, exist_ok=True)
+     os.makedirs(os.path.join(summary_folder, 'FASTQ Files'), exist_ok=True)
+     os.makedirs(os.path.join(summary_folder, 'variants'), exist_ok=True)
+     os.makedirs(os.path.join(summary_folder, 'variants', 'FASTQ Files'), exist_ok=True)
+
+     final_consensus = []
+     naming_info = {}
+
+     # Sort groups by size of largest member (descending)
+     sorted_groups = sorted(groups.items(),
+                            key=lambda x: max(m.size for m in x[1]),
+                            reverse=True)
+
+     for group_idx, (_, group_members) in enumerate(sorted_groups, 1):
+         # Select variants for this group
+         selected_variants = select_variants(group_members, max_variants, variant_selection, group_number=group_idx)
+
+         # Create naming for this group
+         group_naming = []
+
+         for variant_idx, variant in enumerate(selected_variants):
+             # All variants get a .v suffix (primary is .v1, additional are .v2, .v3, etc.)
+             # Use rsplit to split on the LAST '-c' (specimen names may contain '-c')
+             specimen_base = variant.sample_name.rsplit('-c', 1)[0]
+             new_name = f"{specimen_base}-{group_idx}.v{variant_idx + 1}"
+
+             # Use _replace to preserve all fields while updating sample_name
+             renamed_variant = variant._replace(sample_name=new_name)
+
+             final_consensus.append(renamed_variant)
+             group_naming.append((variant.sample_name, new_name))
+
+         naming_info[group_idx] = group_naming
+
+     return final_consensus, naming_info
+
+
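The renaming scheme is plain string surgery and can be sketched standalone (names invented). Groups are numbered by descending size of their largest member, so the first variant of the biggest group becomes .v1 of group 1:

    sample_name = "ONT01-c3"                        # hypothetical original cluster name
    group_idx, variant_idx = 1, 0                   # first group, first selected variant
    specimen_base = sample_name.rsplit('-c', 1)[0]
    print(f"{specimen_base}-{group_idx}.v{variant_idx + 1}")  # -> ONT01-1.v1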
+ def write_consensus_fastq(consensus: ConsensusInfo,
+                           merge_traceability: Dict[str, List[str]],
+                           naming_info: Dict,
+                           fastq_dir: str,
+                           fastq_lookup: Dict[str, List[str]],
+                           original_consensus_lookup: Dict[str, ConsensusInfo]):
+     """Write a FASTQ file for a consensus by concatenating existing FASTQ files."""
+     # Find the original cluster name(s) by looking through naming_info
+     original_clusters = []
+     for group_naming in naming_info.values():
+         for original_name, final_name in group_naming:
+             if final_name == consensus.sample_name:
+                 # This original cluster contributed to our final consensus
+                 if original_name in merge_traceability:
+                     # This was a merged cluster, get all original contributors
+                     original_clusters.extend(merge_traceability[original_name])
+                 else:
+                     # This was not merged, just add it directly
+                     original_clusters.append(original_name)
+                 break
+
+     if not original_clusters:
+         logging.warning(f"Could not find contributing clusters for {consensus.sample_name}")
+         return
+
+     # Find FASTQ files for these clusters using the lookup table.
+     # Track cluster metadata alongside files: [(cluster_name, ric, [files]), ...]
+     fastq_output_path = os.path.join(fastq_dir, f"{consensus.sample_name}-RiC{consensus.ric}.fastq")
+     cluster_files = []
+
+     for cluster_name in original_clusters:
+         # Derive the specimen name from the cluster name (e.g., "sample-c1" -> "sample")
+         if '-c' in cluster_name:
+             specimen_name = cluster_name.rsplit('-c', 1)[0]
+             debug_files = fastq_lookup.get(specimen_name, [])
+
+             # Get the original consensus info (and its RiC value) for this cluster
+             original_info = original_consensus_lookup.get(cluster_name)
+             if not original_info:
+                 logging.warning(f"Could not find original consensus info for {cluster_name}")
+                 continue
+
+             # Filter files that match this specific cluster with exact RiC value.
+             # Match the full pattern: {specimen}-c{cluster}-RiC{exact_ric}-{stage}.fastq
+             # This prevents matching multiple RiC values for the same cluster
+             cluster_ric_pattern = f"{cluster_name}-RiC{original_info.ric}-"
+             matching_files = [f for f in debug_files if cluster_ric_pattern in f]
+
+             # Validate that matched files exist and log any issues
+             valid_files = []
+             for mf in matching_files:
+                 if not os.path.exists(mf):
+                     logging.warning(f"Matched file does not exist: {mf}")
+                 elif os.path.getsize(mf) == 0:
+                     logging.warning(f"Matched file is empty: {mf}")
+                 else:
+                     valid_files.append(mf)
+
+             if valid_files:
+                 cluster_files.append((cluster_name, original_info.ric, valid_files))
+
+     if not cluster_files:
+         logging.warning(f"No FASTQ files found for {consensus.sample_name} from clusters: {original_clusters}")
+         return
+
+     # Concatenate files with cluster boundary delimiters.
+     # Each cluster gets a synthetic FASTQ record as a delimiter before its reads
+     files_processed = 0
+     try:
+         with open(fastq_output_path, 'w') as outf:
+             for idx, (cluster_name, ric, files) in enumerate(cluster_files, 1):
+                 # Count reads in this cluster's files
+                 cluster_reads = 0
+                 for f in files:
+                     with open(f, 'r') as rf:
+                         cluster_reads += sum(1 for _ in rf) // 4
+
+                 # Write cluster boundary delimiter
+                 outf.write(f"@CLUSTER_BOUNDARY_{idx}:{cluster_name}:RiC={ric}:reads={cluster_reads}\n")
+                 outf.write("NNNNNNNNNN\n")
+                 outf.write("+\n")
+                 outf.write("!!!!!!!!!!\n")
+
+                 # Write cluster reads
+                 for input_file in files:
+                     try:
+                         with open(input_file, 'r') as inf:
+                             shutil.copyfileobj(inf, outf)
+                         files_processed += 1
+                     except Exception as e:
+                         logging.debug(f"Could not concatenate {input_file}: {e}")
+
+         # Check if the output file has content
+         output_size = os.path.getsize(fastq_output_path)
+         total_files = sum(len(files) for _, _, files in cluster_files)
+         if output_size > 0:
+             # Count reads for logging by quickly counting lines and dividing by 4
+             with open(fastq_output_path, 'r') as f:
+                 line_count = sum(1 for line in f)
+             read_count = line_count // 4
+             logging.debug(f"Concatenated {files_processed}/{total_files} files from {len(cluster_files)} clusters ({output_size:,} bytes) with ~{read_count} reads to {fastq_output_path}")
+         else:
+             # Debug: check what files were supposed to be concatenated
+             file_info = []
+             for _, _, files in cluster_files:
+                 for input_file in files:
+                     size = os.path.getsize(input_file) if os.path.exists(input_file) else 0
+                     file_info.append(f"{os.path.basename(input_file)}:{size}B")
+
+             logging.warning(f"No data written for {consensus.sample_name} - input files: {', '.join(file_info)}")
+             # Remove the empty output file
+             try:
+                 os.unlink(fastq_output_path)
+             except OSError:
+                 pass
+
+     except Exception as e:
+         logging.error(f"Failed to write concatenated FASTQ file {fastq_output_path}: {e}")
+
+
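Because each delimiter is itself a well-formed four-line FASTQ record, downstream code can split the concatenated file back into per-cluster reads. A minimal consumer sketch (hypothetical helper; assumes plain four-line FASTQ records with no line wrapping, as written above):

    def split_by_cluster(path):
        clusters, current = {}, None
        with open(path) as fh:
            while True:
                record = [fh.readline() for _ in range(4)]  # one FASTQ record
                if not record[0]:
                    break  # end of file
                if record[0].startswith("@CLUSTER_BOUNDARY_"):
                    # e.g. "@CLUSTER_BOUNDARY_1:sample-c1:RiC=42:reads=40"
                    current = record[0].strip().split(":")[1]
                    clusters[current] = []
                elif current is not None:
                    clusters[current].append("".join(record))
        return clusters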
+ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
+                               merge_traceability: Dict[str, List[str]],
+                               naming_info: Dict,
+                               summary_folder: str,
+                               fastq_dir: str,
+                               fastq_lookup: Dict[str, List[str]],
+                               original_consensus_lookup: Dict[str, ConsensusInfo],
+                               fasta_fields: List[FastaField]
+                               ) -> List[Tuple[ConsensusInfo, str]]:
+     """
+     Write individual FASTA and FASTQ files for a single specimen.
+     Does NOT write summary files (summary.fasta, summary.txt).
+
+     Args:
+         fasta_fields: List of FastaField objects defining header format
+
+     Returns:
+         List of (raw_consensus, original_cluster_name) tuples for later use in summary.fasta
+     """
+     # Generate .raw file consensuses for merged variants
+     raw_file_consensuses = []
+     for consensus in specimen_consensus:
+         # Only create .raw files if this consensus was actually merged
+         if consensus.raw_ric and len(consensus.raw_ric) > 1:
+             # Find the original cluster name from naming_info
+             original_cluster_name = None
+             for group_naming in naming_info.values():
+                 for orig_name, final_name in group_naming:
+                     if final_name == consensus.sample_name:
+                         original_cluster_name = orig_name
+                         break
+                 if original_cluster_name:
+                     break
+
+             # Get contributing clusters from merge_traceability
+             if original_cluster_name and original_cluster_name in merge_traceability:
+                 contributing_clusters = merge_traceability[original_cluster_name]
+
+                 # Sort by size (descending) to match .raw1, .raw2 ordering
+                 contributing_infos = []
+                 for cluster_name in contributing_clusters:
+                     if cluster_name in original_consensus_lookup:
+                         contributing_infos.append(original_consensus_lookup[cluster_name])
+
+                 contributing_infos.sort(key=lambda x: x.size, reverse=True)
+
+                 # Create .raw file entries
+                 for raw_idx, raw_info in enumerate(contributing_infos, 1):
+                     raw_name = f"{consensus.sample_name}.raw{raw_idx}"
+
+                     # Create a new ConsensusInfo with the .raw name but original sequence/metadata
+                     raw_consensus = ConsensusInfo(
+                         sample_name=raw_name,
+                         cluster_id=raw_info.cluster_id,
+                         sequence=raw_info.sequence,
+                         ric=raw_info.ric,
+                         size=raw_info.size,
+                         file_path=raw_info.file_path,
+                         snp_count=None,  # Pre-merge, no SNPs from merging
+                         primers=raw_info.primers,
+                         raw_ric=None,  # Pre-merge, not merged
+                         rid=raw_info.rid,  # Preserve read identity metrics
+                         rid_min=raw_info.rid_min,
+                     )
+                     raw_file_consensuses.append((raw_consensus, raw_info.sample_name))
+
+     # Write individual FASTA files with custom field formatting
+     for consensus in specimen_consensus:
+         output_file = os.path.join(summary_folder, f"{consensus.sample_name}-RiC{consensus.ric}.fasta")
+         with open(output_file, 'w') as f:
+             header = format_fasta_header(consensus, fasta_fields)
+             f.write(f">{header}\n")
+             f.write(f"{consensus.sequence}\n")
+
+     # Write FASTQ files for each final consensus containing all contributing reads
+     for consensus in specimen_consensus:
+         write_consensus_fastq(consensus, merge_traceability, naming_info, fastq_dir, fastq_lookup, original_consensus_lookup)
+
+     # Write .raw files (individual FASTA and FASTQ for pre-merge variants)
+     for raw_consensus, original_cluster_name in raw_file_consensuses:
+         # Write individual FASTA file with custom field formatting
+         output_file = os.path.join(summary_folder, 'variants', f"{raw_consensus.sample_name}-RiC{raw_consensus.ric}.fasta")
+         with open(output_file, 'w') as f:
+             header = format_fasta_header(raw_consensus, fasta_fields)
+             f.write(f">{header}\n")
+             f.write(f"{raw_consensus.sequence}\n")
+
+         # Write the FASTQ file by finding the original cluster's FASTQ.
+         # Derive the specimen name from the original cluster name
+         if '-c' in original_cluster_name:
+             specimen_name = original_cluster_name.rsplit('-c', 1)[0]
+             debug_files = fastq_lookup.get(specimen_name, []) if fastq_lookup else []
+
+             # Filter files that match this specific cluster with exact RiC value.
+             # Use raw_consensus.ric, which came from the original cluster
+             cluster_ric_pattern = f"{original_cluster_name}-RiC{raw_consensus.ric}-"
+             matching_files = [f for f in debug_files if cluster_ric_pattern in f]
+
+             if matching_files:
+                 fastq_output_path = os.path.join(summary_folder, 'variants', 'FASTQ Files', f"{raw_consensus.sample_name}-RiC{raw_consensus.ric}.fastq")
+                 try:
+                     with open(fastq_output_path, 'wb') as outf:
+                         for input_file in matching_files:
+                             if os.path.exists(input_file) and os.path.getsize(input_file) > 0:
+                                 with open(input_file, 'rb') as inf:
+                                     shutil.copyfileobj(inf, outf)
+                     logging.debug(f"Wrote .raw FASTQ: {os.path.basename(fastq_output_path)}")
+                 except Exception as e:
+                     logging.debug(f"Could not write .raw FASTQ for {raw_consensus.sample_name}: {e}")
+
+     return raw_file_consensuses
+
+
+ def build_fastq_lookup_table(source_dir: str = ".") -> Dict[str, List[str]]:
+     """
+     Build a lookup table mapping specimen base names to their cluster FASTQ files.
+     This avoids repeated directory scanning during file copying.
+     """
+     lookup = defaultdict(list)
+
+     # Initialize variables before conditional block
+     debug_files = []
+     selected_stage = None
+
+     # Scan cluster_debug directory once to build lookup table
+     cluster_debug_path = os.path.join(source_dir, "cluster_debug")
+     if os.path.exists(cluster_debug_path):
+         # Define priority order for stage types (first match wins)
+         # This prevents including multiple versions of the same cluster
+         stage_priority = ['sampled', 'reads', 'untrimmed']
+
+         # Try each stage type in priority order until we find files
+         for stage in stage_priority:
+             debug_files = glob.glob(os.path.join(cluster_debug_path, f"*-{stage}.fastq"))
+             if debug_files:
+                 selected_stage = stage
+                 break
+
+         # If no files found with known stage types, try generic pattern
+         if not debug_files:
+             debug_files = glob.glob(os.path.join(cluster_debug_path, "*.fastq"))
+             selected_stage = "unknown"
+
+     # Use regex to robustly parse the filename pattern
+     # Pattern: {specimen}-c{cluster}-RiC{size}-{stage}.fastq
+     # Where stage can be: sampled, reads, untrimmed, or other variants
+     pattern = re.compile(r'^(.+)-c(\d+)-RiC(\d+)-([a-z]+)\.fastq$')
+
+     for fastq_path in debug_files:
+         filename = os.path.basename(fastq_path)
+         match = pattern.match(filename)
+         if match:
+             specimen_name = match.group(1)  # Extract specimen name
+             # cluster_num = match.group(2)  # Available if needed
+             # ric_value = match.group(3)    # Available if needed
+             # stage = match.group(4)        # Stage: sampled, reads, untrimmed, etc.
+             lookup[specimen_name].append(fastq_path)
+         else:
+             logging.warning(f"Skipping file with unexpected name pattern: {filename}")
+
+     if debug_files:
+         logging.debug(f"Built FASTQ lookup table for {len(lookup)} specimens with {sum(len(files) for files in lookup.values())} {selected_stage} files")
+     else:
+         logging.debug("No FASTQ files found in cluster_debug directory")
+     return dict(lookup)
+
+
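The filename regex can be exercised on its own; a quick check against an invented filename:

    import re
    pattern = re.compile(r'^(.+)-c(\d+)-RiC(\d+)-([a-z]+)\.fastq$')
    match = pattern.match("ONT01-c1-RiC42-sampled.fastq")
    print(match.groups())  # ('ONT01', '1', '42', 'sampled')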
+ def write_position_debug_file(
+         sequences_with_pos_outliers: List[Tuple],
+         summary_folder: str,
+         threshold: float
+ ):
+     """Write detailed debug information about high-error positions.
+
+     Creates a separate file with per-position base composition and error details
+     to help validate positional phasing and quality analysis.
+
+     Args:
+         sequences_with_pos_outliers: List of (ConsensusInfo, result_dict) tuples
+         summary_folder: Output directory for the debug file
+         threshold: Error rate threshold used for flagging positions
+     """
+     debug_path = os.path.join(summary_folder, 'position_errors_debug.txt')
+
+     with open(debug_path, 'w') as f:
+         f.write("POSITION ERROR DETAILED DEBUG REPORT\n")
+         f.write("=" * 80 + "\n\n")
+         f.write(f"Threshold: {threshold:.1%} (positions with error rate above this are flagged)\n")
+         f.write(f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+
+         if not sequences_with_pos_outliers:
+             f.write("No sequences with high-error positions found.\n")
+             return
+
+         # Sort by total nucleotide errors descending
+         sorted_seqs = sorted(
+             sequences_with_pos_outliers,
+             key=lambda x: x[1].get('total_nucleotide_errors', 0),
+             reverse=True
+         )
+
+         for cons, result in sorted_seqs:
+             # Handle merged sequences (component_name in result)
+             if 'component_name' in result:
+                 display_name = f"{cons.sample_name} (component: {result['component_name']})"
+                 ric = result.get('component_ric', cons.ric)
+             else:
+                 display_name = cons.sample_name
+                 ric = result.get('ric', cons.ric)
+
+             f.write("=" * 80 + "\n")
+             f.write(f"SEQUENCE: {display_name}\n")
+             f.write(f"RiC: {ric}\n")
+             f.write(f"High-error positions: {result['num_outlier_positions']}\n")
+             f.write(f"Mean error rate at flagged positions: {result['mean_outlier_error_rate']:.1%}\n")
+             f.write(f"Total nucleotide errors: {result['total_nucleotide_errors']}\n")
+             f.write("-" * 80 + "\n\n")
+
+             outlier_details = result.get('outlier_details', [])
+             if not outlier_details:
+                 # Fall back to basic info if detailed stats not available
+                 for pos, rate, count in result.get('outlier_positions', []):
+                     f.write(f"  Position {pos+1}: error_rate={rate:.1%}, error_count={count}\n")
+                 f.write("\n")
+                 continue
+
+             for detail in outlier_details:
+                 cons_pos = detail['consensus_position']
+                 msa_pos = detail.get('msa_position')
+                 # Display as 1-indexed for user-friendliness
+                 cons_pos_display = cons_pos + 1 if cons_pos is not None else "?"
+                 msa_pos_display = msa_pos + 1 if msa_pos is not None else "?"
+
+                 f.write(f"Position {cons_pos_display} (MSA: {msa_pos_display}):\n")
+                 f.write(f"  Consensus base: {detail['consensus_nucleotide']}\n")
+                 f.write(f"  Coverage: {detail['coverage']}\n")
+                 f.write(f"  Error rate: {detail['error_rate']:.1%}\n")
+                 f.write(f"  Error count: {detail['error_count']}\n")
+                 f.write(f"  Substitutions: {detail['sub_count']}, Insertions: {detail['ins_count']}, Deletions: {detail['del_count']}\n")
+
+                 # Format base composition (raw counts from MSA)
+                 base_comp = detail['base_composition']
+                 hp_comp = detail.get('homopolymer_composition', {})
+
+                 if base_comp:
+                     total = sum(base_comp.values())
+                     comp_str = ", ".join(
+                         f"{base}:{count}({count/total*100:.0f}%)"
+                         for base, count in sorted(base_comp.items(), key=lambda x: -x[1])
+                         if count > 0
+                     )
+                     f.write(f"  Raw base composition: {comp_str}\n")
+
+                 # Format homopolymer composition if present
+                 if hp_comp and any(v > 0 for v in hp_comp.values()):
+                     hp_str = ", ".join(
+                         f"{base}:{count}"
+                         for base, count in sorted(hp_comp.items(), key=lambda x: -x[1])
+                         if count > 0
+                     )
+                     f.write(f"  Homopolymer length variants: {hp_str}\n")
+
+                     # Calculate and show effective composition (raw - HP adjustments)
+                     # HP variants are normalized away in error calculation
+                     if base_comp:
+                         effective_comp = {}
+                         for base in base_comp:
+                             raw = base_comp.get(base, 0)
+                             hp_adj = hp_comp.get(base, 0)
+                             effective = raw - hp_adj
+                             if effective > 0:
+                                 effective_comp[base] = effective
+
+                         if effective_comp:
+                             eff_total = sum(effective_comp.values())
+                             eff_str = ", ".join(
+                                 f"{base}:{count}({count/eff_total*100:.0f}%)"
+                                 for base, count in sorted(effective_comp.items(), key=lambda x: -x[1])
+                                 if count > 0
+                             )
+                             f.write(f"  Effective composition (HP-normalized): {eff_str}\n")
+
+                 f.write("\n")
+
+             # Show context: consensus sequence around flagged positions
+             consensus_seq = result.get('consensus_seq', '')
+             if consensus_seq and outlier_details:
+                 f.write("Consensus sequence context (flagged positions marked with *):\n")
+                 # Mark positions in the sequence
+                 marked_positions = set()
+                 for detail in outlier_details:
+                     if detail['consensus_position'] is not None:
+                         marked_positions.add(detail['consensus_position'])
+
+                 # Show sequence in chunks of 60 with position markers
+                 chunk_size = 60
+                 for chunk_start in range(0, len(consensus_seq), chunk_size):
+                     chunk_end = min(chunk_start + chunk_size, len(consensus_seq))
+                     chunk = consensus_seq[chunk_start:chunk_end]
+
+                     # Position line
+                     f.write(f" {chunk_start+1:>5} ")
+                     f.write(chunk)
+                     f.write(f" {chunk_end}\n")
+
+                     # Marker line (padded to align under the sequence column)
+                     f.write(" " * 7)
+                     for i in range(chunk_start, chunk_end):
+                         if i in marked_positions:
+                             f.write("*")
+                         else:
+                             f.write(" ")
+                     f.write("\n")
+
+                 f.write("\n")
+
+     logging.info(f"Position error debug file written to: {debug_path}")
+
+
+ def write_output_files(final_consensus: List[ConsensusInfo],
+                        all_raw_consensuses: List[Tuple[ConsensusInfo, str]],
+                        summary_folder: str,
+                        temp_log_file: str,
+                        fasta_fields: List[FastaField]):
+     """
+     Write summary files only. Individual data files are already written per-specimen.
+
+     Args:
+         fasta_fields: List of FastaField objects defining header format
+
+     Writes:
+         - summary.fasta: Combined index of all sequences
+         - summary.txt: Statistics and totals
+         - summarize_log.txt: Copy of processing log
+     """
+
+     # Write combined summary.fasta with custom field formatting.
+     # Include only final consensus sequences (not .raw pre-merge variants)
+     summary_fasta_path = os.path.join(summary_folder, 'summary.fasta')
+     with open(summary_fasta_path, 'w') as f:
+         # Write final consensus sequences
+         for consensus in final_consensus:
+             header = format_fasta_header(consensus, fasta_fields)
+             f.write(f">{header}\n")
+             f.write(f"{consensus.sequence}\n")
+
+     # Write summary statistics
+     summary_txt_path = os.path.join(summary_folder, 'summary.txt')
+     with open(summary_txt_path, 'w') as f:
+         writer = csv.writer(f, delimiter='\t', lineterminator='\n')
+         writer.writerow(['Filename', 'Length', 'Reads in Consensus', 'Multiple'])
+
+         unique_samples = set()
+         total_ric = 0
+         specimen_counters = {}
+
+         for consensus in final_consensus:
+             base_name = consensus.sample_name.split('-')[0]
+
+             # Initialize or advance the per-specimen counter
+             if base_name not in specimen_counters:
+                 specimen_counters[base_name] = 1
+             else:
+                 specimen_counters[base_name] += 1
+
+             multiple_id = specimen_counters[base_name]
+             writer.writerow([consensus.sample_name, len(consensus.sequence), consensus.ric, multiple_id])
+             unique_samples.add(base_name)
+             total_ric += consensus.ric
+
+         writer.writerow([])
+         writer.writerow(['Total Unique Samples', len(unique_samples)])
+         writer.writerow(['Total Consensus Sequences', len(final_consensus)])
+         writer.writerow(['Total Reads in Consensus Sequences', total_ric])
+
+     # Copy log file to summary directory as summarize_log.txt
+     if temp_log_file:
+         summarize_log_path = os.path.join(summary_folder, 'summarize_log.txt')
+         try:
+             # Flush any remaining log entries before copying
+             # (the second root handler is assumed to be the temp-file handler)
+             root_handlers = logging.getLogger().handlers
+             if len(root_handlers) > 1:
+                 root_handlers[1].flush()
+             shutil.copy2(temp_log_file, summarize_log_path)
+             logging.info(f"Created log file: {summarize_log_path}")
+         except Exception as e:
+             logging.warning(f"Could not copy log file: {e}")
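For orientation, summary.txt is plain tab-separated text; a hypothetical downstream reader (the path is invented):

    import csv
    with open("summary/summary.txt") as f:
        for row in csv.reader(f, delimiter="\t"):
            print(row)
    # The first row is the header ['Filename', 'Length', 'Reads in Consensus', 'Multiple'],
    # followed by one row per sequence, a blank row, and three totals rows.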