speconsense 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
speconsense/synth.py ADDED
@@ -0,0 +1,292 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Speconsense-synth: Synthetic read generator for testing consensus algorithms.
4
+
5
+ Generates simulated reads from input sequences with controlled error rates
6
+ for testing clustering and consensus generation behavior.
7
+ """
8
+
9
+ import argparse
10
+ import sys
11
+ import random
12
+ import logging
13
+ from typing import List, Tuple, Dict
14
+ from Bio import SeqIO
15
+ from Bio.Seq import Seq
16
+ from Bio.SeqRecord import SeqRecord
17
+
18
+
19
def parse_ratios(ratio_str: str, num_sequences: int) -> List[float]:
    """Parse a comma-separated ratio string into normalized abundances.

    Args:
        ratio_str: Comma-separated ratios (e.g., "50,30,20"). An empty
            string or None selects an equal split across all sequences.
        num_sequences: Number of input sequences the ratios must cover.

    Returns:
        List of ratios normalized to sum to 1.0, one per sequence.

    Raises:
        ValueError: If an entry cannot be parsed as a number, the number of
            ratios differs from num_sequences, any ratio is negative or
            non-finite (NaN/inf), or all ratios are zero.
    """
    import math

    if not ratio_str:
        # Equal distribution by default
        return [1.0 / num_sequences] * num_sequences

    ratios = [float(r.strip()) for r in ratio_str.split(',')]

    if len(ratios) != num_sequences:
        raise ValueError(f"Number of ratios ({len(ratios)}) must match number of sequences ({num_sequences})")

    # float() happily parses "nan" and "inf"; NaN slips past the sign and
    # zero-total checks below and would poison every normalized ratio.
    if not all(math.isfinite(r) for r in ratios):
        raise ValueError("Ratios must be finite numbers")

    if any(r < 0 for r in ratios):
        raise ValueError("Ratios must be non-negative")

    total = sum(ratios)
    if total == 0:
        raise ValueError("At least one ratio must be positive")

    # Normalize to sum to 1.0
    return [r / total for r in ratios]
47
+
48
+
49
def error_rate_to_phred(error_rate: float) -> int:
    """Translate a per-base error probability into a Phred quality score.

    Args:
        error_rate: Probability of error (0-1)

    Returns:
        -10 * log10(error_rate), rounded to the nearest integer and clamped
        to the range 0-40. Rates <= 0 map to 40 and rates >= 1 map to 0.
    """
    import math

    highest, lowest = 40, 0

    # The logarithm is only evaluated for rates strictly between 0 and 1.
    if error_rate >= 1:
        return lowest
    if error_rate <= 0:
        return highest  # Cap at Q40

    score = int(round(-10 * math.log10(error_rate)))
    if score > highest:
        return highest
    if score < lowest:
        return lowest
    return score
66
+
67
+
68
def introduce_errors(sequence: str, error_rate: float, rng: random.Random) -> str:
    """Return a copy of ``sequence`` with random errors applied.

    Every position is mutated independently with probability error_rate.
    A mutated position becomes an insertion, a deletion or a substitution,
    each chosen with equal likelihood.

    Args:
        sequence: Original sequence
        error_rate: Probability of error at each position; values <= 0
            return the sequence unchanged
        rng: Random number generator for reproducibility

    Returns:
        Mutated sequence
    """
    if error_rate <= 0:
        return sequence

    nucleotides = ['A', 'C', 'G', 'T']
    error_kinds = ['insertion', 'deletion', 'substitution']
    mutated = []

    for original_base in sequence:
        # One draw per position decides whether it is mutated at all.
        if not (rng.random() < error_rate):
            mutated.append(original_base)
            continue

        kind = rng.choice(error_kinds)
        if kind == 'substitution':
            # Pick any base other than the (uppercased) original.
            others = [n for n in nucleotides if n != original_base.upper()]
            mutated.append(rng.choice(others))
        elif kind == 'insertion':
            # Keep the original base and append one random base after it.
            mutated.append(original_base)
            mutated.append(rng.choice(nucleotides))
        # A deletion emits nothing for this position.

    return ''.join(mutated)
108
+
109
+
110
def normalize_sequence(seq_str: str, seq_id: str) -> str:
    """Normalize a sequence by removing whitespace and converting to uppercase.

    Non-ACGT characters are not rejected: they are reported through
    logging warnings and kept in the returned sequence.

    Args:
        seq_str: Input sequence string
        seq_id: Sequence ID used in the warning messages

    Returns:
        Normalized sequence (uppercase, no whitespace)
    """
    # Remove all whitespace and convert to uppercase
    normalized = ''.join(seq_str.split()).upper()

    # Check for non-ACGT bases
    invalid_bases = set(normalized) - set('ACGT')

    if invalid_bases:
        logging.warning(f"Sequence '{seq_id}' contains non-ACGT bases: {sorted(invalid_bases)}")
        logging.warning("These bases will be treated as-is but may cause unexpected behavior")

    return normalized
135
+
136
+
137
def _allocate_read_counts(num_reads: int, ratios: List[float]) -> List[int]:
    """Split num_reads into one integer read count per ratio."""
    counts = []
    assigned = 0
    last_index = len(ratios) - 1
    for i, ratio in enumerate(ratios):
        remaining = num_reads - assigned
        if i == last_index:
            # Last sequence gets remaining reads to ensure exact count
            count = remaining
        else:
            # Rounding each share independently can overshoot the total
            # (e.g. five equal ratios with num_reads=3 round to 1 each), so
            # never hand out more than what is still unassigned.
            count = min(int(round(num_reads * ratio)), remaining)
        counts.append(count)
        assigned += count
    return counts


def generate_reads(sequences: List[SeqRecord],
                   num_reads: int,
                   error_rate: float,
                   ratios: List[float],
                   seed: int = None) -> List[SeqRecord]:
    """Generate synthetic reads from input sequences.

    Each read is a copy of its normalized source sequence passed through
    introduce_errors, carrying a uniform Phred quality derived from
    error_rate. Read IDs have the form ``read_<n>_from_<source id>``, with
    ``n`` assigned in generation order before the final shuffle.

    Args:
        sequences: Input sequences to generate reads from
        num_reads: Total number of reads to generate
        error_rate: Per-base error probability
        ratios: Relative abundance of each sequence, paired with
            ``sequences`` by position; the last sequence absorbs any
            rounding remainder
        seed: Random seed for reproducibility

    Returns:
        List of synthetic reads as SeqRecord objects, in shuffled order
    """
    rng = random.Random(seed)
    reads = []

    # Calculate reads per sequence based on ratios
    reads_per_seq = _allocate_read_counts(num_reads, ratios)

    # Every base of every read gets the same quality score
    quality = error_rate_to_phred(error_rate)

    for seq_record, seq_reads in zip(sequences, reads_per_seq):
        # Normalize once per source sequence, not once per read
        sequence_str = normalize_sequence(str(seq_record.seq), seq_record.id)

        for _ in range(seq_reads):
            # Introduce errors
            mutated_seq = introduce_errors(sequence_str, error_rate, rng)

            # Create read ID with provenance (unique ID + source)
            read_id = f"read_{len(reads) + 1}_from_{seq_record.id}"

            # Create SeqRecord with quality scores
            read_record = SeqRecord(
                Seq(mutated_seq),
                id=read_id,
                description=f"source={seq_record.id} error_rate={error_rate:.3f}",
                letter_annotations={'phred_quality': [quality] * len(mutated_seq)}
            )

            reads.append(read_record)

    # Shuffle reads to mix sequences
    rng.shuffle(reads)

    return reads
197
+
198
+
199
def main():
    """Main entry point for speconsense-synth.

    Parses the command line, validates the error rate and read count,
    loads the reference sequences from the input FASTA file, generates the
    synthetic reads and writes them to the output file in FASTQ format.
    Invalid arguments or an unreadable input end the program through
    ``parser.error``; a failure while writing the output is logged and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Generate synthetic reads from reference sequences with controlled error rates',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate 1000 reads with 10% error rate
speconsense-synth reference.fasta -n 1000 -e 0.1 -o synthetic_reads.fastq

# Generate reads from multiple sequences with specific ratios
speconsense-synth variants.fasta -n 5000 -e 0.15 --ratios 70,30 -o mixed_reads.fastq

# Set seed for reproducible results
speconsense-synth reference.fasta -n 1000 -e 0.1 --seed 42 -o synthetic_reads.fastq
"""
    )

    parser.add_argument('input', help='Input FASTA file with reference sequence(s)')
    parser.add_argument('-n', '--num-reads', type=int, default=1000,
                        help='Number of reads to generate (default: 1000)')
    parser.add_argument('-e', '--error-rate', type=float, default=0.1,
                        help='Per-base error rate (default: 0.1)')
    parser.add_argument('-o', '--output', default='synthetic_reads.fastq',
                        help='Output FASTQ file (default: synthetic_reads.fastq)')
    parser.add_argument('--ratios', type=str,
                        help='Comma-separated ratios for multiple sequences (e.g., 70,30)')
    parser.add_argument('--seed', type=int,
                        help='Random seed for reproducibility')
    parser.add_argument('--log-level', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                        help='Logging level (default: INFO)')

    args = parser.parse_args()

    # Set up logging
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Validate error rate
    if not 0 <= args.error_rate <= 1:
        parser.error("Error rate must be between 0 and 1")

    # A negative count would otherwise silently produce an empty output file
    if args.num_reads < 0:
        parser.error("Number of reads must be non-negative")

    # Load input sequences
    try:
        sequences = list(SeqIO.parse(args.input, 'fasta'))
        if not sequences:
            # parser.error exits via SystemExit, which the handler below
            # does not intercept (it is not an Exception subclass)
            parser.error(f"No sequences found in {args.input}")
        logging.info(f"Loaded {len(sequences)} sequence(s) from {args.input}")
    except Exception as e:
        parser.error(f"Failed to read input file: {e}")

    # Parse ratios
    try:
        ratios = parse_ratios(args.ratios, len(sequences))
        if len(sequences) > 1:
            ratio_str = ', '.join(f"{s.id}:{r:.1%}" for s, r in zip(sequences, ratios))
            logging.info(f"Sequence ratios: {ratio_str}")
    except ValueError as e:
        parser.error(str(e))

    # Generate synthetic reads
    logging.info(f"Generating {args.num_reads} reads with {args.error_rate:.1%} error rate")
    if args.seed is not None:
        logging.info(f"Using random seed: {args.seed}")

    reads = generate_reads(
        sequences=sequences,
        num_reads=args.num_reads,
        error_rate=args.error_rate,
        ratios=ratios,
        seed=args.seed
    )

    # Write output; only the write itself is guarded so that a problem in
    # the statistics below is not misreported as a write failure
    try:
        with open(args.output, 'w') as f:
            SeqIO.write(reads, f, 'fastq')
    except Exception as e:
        logging.error(f"Failed to write output: {e}")
        sys.exit(1)

    logging.info(f"Wrote {len(reads)} reads to {args.output}")

    # Report statistics
    total_bases = sum(len(r.seq) for r in reads)
    avg_length = total_bases / len(reads) if reads else 0
    logging.info(f"Total bases: {total_bases:,}, Average read length: {avg_length:.1f}")
289
+
290
+
291
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
speconsense/types.py ADDED
@@ -0,0 +1,38 @@
1
+ """
2
+ Shared type definitions for speconsense.
3
+
4
+ This module contains data classes used across multiple modules,
5
+ extracted to avoid circular imports.
6
+ """
7
+
8
+ from typing import List, Optional, NamedTuple
9
+
10
+
11
class ConsensusInfo(NamedTuple):
    """Record describing one consensus sequence from speconsense output.

    The first six fields are required; every later field defaults to None.

    Optional fields:
        snp_count: Number of SNPs from IUPAC consensus generation.
        primers: List of detected primer names.
        raw_ric: RiC values of .raw source variants.
        raw_len: Lengths of merged source sequences.
        rid: Mean read identity (internal consistency metric).
        rid_min: Minimum read identity (worst-case read).
        merge_indel_count: Number of indels consumed by merging
            (for cumulative tracking).
    """

    # Required fields
    sample_name: str
    cluster_id: str
    sequence: str
    ric: int
    size: int
    file_path: str

    # Optional fields (see the class docstring)
    snp_count: Optional[int] = None
    primers: Optional[List[str]] = None
    raw_ric: Optional[List[int]] = None
    raw_len: Optional[List[int]] = None
    rid: Optional[float] = None
    rid_min: Optional[float] = None
    merge_indel_count: Optional[int] = None
26
+
27
+
28
class OverlapMergeInfo(NamedTuple):
    """Record of a single overlap merge event, kept for quality reporting.

    Fields:
        specimen: Specimen name.
        iteration: Merge iteration (1 = first pass, 2+ = iterative).
        input_clusters: Cluster IDs involved in the merge.
        input_lengths: Original sequence lengths.
        input_rics: RiC values of the input sequences.
        overlap_bp: Overlap region size in bp.
        prefix_bp: Extension before the overlap.
        suffix_bp: Extension after the overlap.
        output_length: Final merged sequence length.
    """

    specimen: str
    iteration: int
    input_clusters: List[str]
    input_lengths: List[int]
    input_rics: List[int]
    overlap_bp: int
    prefix_bp: int
    suffix_bp: int
    output_length: int