speconsense 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/synth.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Speconsense-synth: Synthetic read generator for testing consensus algorithms.
|
|
4
|
+
|
|
5
|
+
Generates simulated reads from input sequences with controlled error rates
|
|
6
|
+
for testing clustering and consensus generation behavior.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
import logging
import math
import random
import sys
from typing import Dict, List, Optional, Tuple

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_ratios(ratio_str: str, num_sequences: int) -> List[float]:
    """Parse a comma-separated ratio string into normalized proportions.

    Args:
        ratio_str: Comma-separated ratios (e.g., "50,30,20"); empty or None
            means an equal split across all sequences.
        num_sequences: Number of input sequences the ratios must cover.

    Returns:
        List of normalized ratios summing to 1.0.

    Raises:
        ValueError: If the ratio count does not match the sequence count,
            any ratio is negative, or every ratio is zero.
    """
    if not ratio_str:
        # No ratios supplied: split evenly across every sequence.
        even_share = 1.0 / num_sequences
        return [even_share] * num_sequences

    parsed = [float(piece.strip()) for piece in ratio_str.split(',')]

    if len(parsed) != num_sequences:
        raise ValueError(f"Number of ratios ({len(parsed)}) must match number of sequences ({num_sequences})")

    for value in parsed:
        if value < 0:
            raise ValueError("Ratios must be non-negative")

    weight_sum = sum(parsed)
    if weight_sum == 0:
        raise ValueError("At least one ratio must be positive")

    # Normalize so the proportions sum to exactly 1.0.
    return [value / weight_sum for value in parsed]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def error_rate_to_phred(error_rate: float) -> int:
    """Convert an error probability to a Phred quality score.

    Uses the standard Phred relation Q = -10 * log10(p), rounded to the
    nearest integer and clamped to [0, 40].

    Args:
        error_rate: Probability of error (0-1)

    Returns:
        Phred quality score (0-40, capped)
    """
    if error_rate <= 0:
        # log10 is undefined at 0; treat error-free reads as the Q40 cap.
        return 40
    if error_rate >= 1:
        return 0

    phred = -10 * math.log10(error_rate)
    return min(40, max(0, int(round(phred))))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def introduce_errors(sequence: str, error_rate: float, rng: random.Random) -> str:
    """Apply random mutations to a sequence at the given per-base rate.

    Each position independently suffers an error with probability
    ``error_rate``; insertion, deletion, and substitution are equally
    likely error types.

    Args:
        sequence: Original sequence
        error_rate: Probability of error at each position
        rng: Random number generator for reproducibility

    Returns:
        Mutated sequence
    """
    if error_rate <= 0:
        return sequence

    output = []
    nucleotides = ['A', 'C', 'G', 'T']

    for original_base in sequence:
        if rng.random() >= error_rate:
            # No error at this position; keep the base unchanged.
            output.append(original_base)
            continue

        # An error occurs here; pick its type uniformly at random.
        kind = rng.choice(['insertion', 'deletion', 'substitution'])

        if kind == 'deletion':
            # Drop the base entirely.
            continue

        if kind == 'insertion':
            # Keep the base and append one random extra nucleotide.
            output.append(original_base)
            output.append(rng.choice(nucleotides))
        else:
            # Substitute with any base other than the current one.
            substitutes = [b for b in nucleotides if b != original_base.upper()]
            output.append(rng.choice(substitutes))

    return ''.join(output)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def normalize_sequence(seq_str: str, seq_id: str) -> str:
    """Strip all whitespace from a sequence and convert it to uppercase.

    Characters outside A/C/G/T are kept as-is; a warning is logged so the
    user knows the input was not pure DNA.

    Args:
        seq_str: Input sequence string
        seq_id: Sequence ID for error messages

    Returns:
        Normalized sequence (uppercase, no whitespace)
    """
    # Collapse any whitespace (spaces, tabs, newlines) and force uppercase.
    cleaned = ''.join(seq_str.split()).upper()

    # Flag anything outside the canonical DNA alphabet, but keep it.
    unexpected = set(cleaned) - set('ACGT')
    if unexpected:
        logging.warning(f"Sequence '{seq_id}' contains non-ACGT bases: {sorted(unexpected)}")
        logging.warning(f"These bases will be treated as-is but may cause unexpected behavior")

    return cleaned
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def generate_reads(sequences: List[SeqRecord],
                   num_reads: int,
                   error_rate: float,
                   ratios: List[float],
                   seed: Optional[int] = None) -> List[SeqRecord]:
    """Generate synthetic reads from input sequences.

    Args:
        sequences: Input sequences to generate reads from
        num_reads: Total number of reads to generate
        error_rate: Per-base error probability
        ratios: Relative abundance of each sequence (normalized, sums to 1.0)
        seed: Random seed for reproducibility

    Returns:
        List of synthetic reads as SeqRecord objects
    """
    rng = random.Random(seed)
    reads = []

    # Allocate read counts via clamped cumulative targets so every count is
    # non-negative and the total is exactly num_reads. (Rounding each ratio
    # independently can over-allocate and leave the final sequence with a
    # negative remainder, e.g. ratios 0.5/0.5/0.0 with 3 reads -> 2, 2, -1.)
    reads_per_seq = []
    allocated = 0
    cumulative_ratio = 0.0
    for i, ratio in enumerate(ratios):
        if i == len(ratios) - 1:
            # Last sequence absorbs the remainder to hit the exact total.
            reads_per_seq.append(num_reads - allocated)
        else:
            cumulative_ratio += ratio
            target = min(num_reads, int(round(num_reads * cumulative_ratio)))
            reads_per_seq.append(target - allocated)
            allocated = target

    # All reads share one flat quality derived from the error rate.
    quality = error_rate_to_phred(error_rate)

    for seq_record, seq_reads in zip(sequences, reads_per_seq):
        # Normalize sequence (strip whitespace, uppercase) before mutating.
        sequence_str = normalize_sequence(str(seq_record.seq), seq_record.id)

        for _ in range(seq_reads):
            # Introduce errors
            mutated_seq = introduce_errors(sequence_str, error_rate, rng)

            # Create read ID with provenance (unique ID + source)
            read_id = f"read_{len(reads) + 1}_from_{seq_record.id}"

            # Create SeqRecord with a per-base Phred quality track
            read_record = SeqRecord(
                Seq(mutated_seq),
                id=read_id,
                description=f"source={seq_record.id} error_rate={error_rate:.3f}",
                letter_annotations={'phred_quality': [quality] * len(mutated_seq)}
            )

            reads.append(read_record)

    # Shuffle reads to mix sequences
    rng.shuffle(reads)

    return reads
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def main():
    """Main entry point for speconsense-synth.

    Parses CLI arguments, loads reference sequences, generates synthetic
    reads, and writes them to a FASTQ file. Exits via ``parser.error`` on
    invalid arguments/input and with status 1 on write failure.
    """
    parser = argparse.ArgumentParser(
        description='Generate synthetic reads from reference sequences with controlled error rates',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate 1000 reads with 10% error rate
  speconsense-synth reference.fasta -n 1000 -e 0.1 -o synthetic_reads.fastq

  # Generate reads from multiple sequences with specific ratios
  speconsense-synth variants.fasta -n 5000 -e 0.15 --ratios 70,30 -o mixed_reads.fastq

  # Set seed for reproducible results
  speconsense-synth reference.fasta -n 1000 -e 0.1 --seed 42 -o synthetic_reads.fastq
"""
    )

    parser.add_argument('input', help='Input FASTA file with reference sequence(s)')
    parser.add_argument('-n', '--num-reads', type=int, default=1000,
                        help='Number of reads to generate (default: 1000)')
    parser.add_argument('-e', '--error-rate', type=float, default=0.1,
                        help='Per-base error rate (default: 0.1)')
    parser.add_argument('-o', '--output', default='synthetic_reads.fastq',
                        help='Output FASTQ file (default: synthetic_reads.fastq)')
    parser.add_argument('--ratios', type=str,
                        help='Comma-separated ratios for multiple sequences (e.g., 70,30)')
    parser.add_argument('--seed', type=int,
                        help='Random seed for reproducibility')
    parser.add_argument('--log-level', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                        help='Logging level (default: INFO)')

    args = parser.parse_args()

    # Set up logging
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Validate numeric arguments up front
    if not 0 <= args.error_rate <= 1:
        parser.error("Error rate must be between 0 and 1")
    if args.num_reads < 1:
        # Guard against -n 0 or negative values, which would otherwise
        # silently produce an empty or inconsistent output file.
        parser.error("Number of reads must be at least 1")

    # Load input sequences
    try:
        sequences = list(SeqIO.parse(args.input, 'fasta'))
        if not sequences:
            parser.error(f"No sequences found in {args.input}")
        logging.info(f"Loaded {len(sequences)} sequence(s) from {args.input}")
    except Exception as e:
        # parser.error raises SystemExit, which is NOT caught here.
        parser.error(f"Failed to read input file: {e}")

    # Parse ratios
    try:
        ratios = parse_ratios(args.ratios, len(sequences))
        if len(sequences) > 1:
            ratio_str = ', '.join(f"{s.id}:{r:.1%}" for s, r in zip(sequences, ratios))
            logging.info(f"Sequence ratios: {ratio_str}")
    except ValueError as e:
        parser.error(str(e))

    # Generate synthetic reads
    logging.info(f"Generating {args.num_reads} reads with {args.error_rate:.1%} error rate")
    if args.seed is not None:
        logging.info(f"Using random seed: {args.seed}")

    reads = generate_reads(
        sequences=sequences,
        num_reads=args.num_reads,
        error_rate=args.error_rate,
        ratios=ratios,
        seed=args.seed
    )

    # Write output
    try:
        with open(args.output, 'w') as f:
            SeqIO.write(reads, f, 'fastq')
        logging.info(f"Wrote {len(reads)} reads to {args.output}")

        # Report statistics
        total_bases = sum(len(r.seq) for r in reads)
        avg_length = total_bases / len(reads) if reads else 0
        logging.info(f"Total bases: {total_bases:,}, Average read length: {avg_length:.1f}")

    except Exception as e:
        logging.error(f"Failed to write output: {e}")
        sys.exit(1)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
# Allow running this module directly as a script, in addition to the
# `speconsense-synth` console entry point.
if __name__ == '__main__':
    main()
|
speconsense/types.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared type definitions for speconsense.
|
|
3
|
+
|
|
4
|
+
This module contains data classes used across multiple modules,
|
|
5
|
+
extracted to avoid circular imports.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional, NamedTuple
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ConsensusInfo(NamedTuple):
    """Information about a consensus sequence from speconsense output."""
    sample_name: str  # Sample/specimen the consensus belongs to
    cluster_id: str  # Identifier of the cluster that produced this consensus
    sequence: str  # Consensus sequence as a plain string
    ric: int  # RiC value for this consensus (see raw_ric for merged sources)
    size: int  # Cluster size — presumably total reads in the cluster; confirm against writer
    file_path: str  # Path of the file this consensus was read from
    snp_count: Optional[int] = None  # Number of SNPs from IUPAC consensus generation
    primers: Optional[List[str]] = None  # List of detected primer names
    raw_ric: Optional[List[int]] = None  # RiC values of .raw source variants
    raw_len: Optional[List[int]] = None  # Lengths of merged source sequences
    rid: Optional[float] = None  # Mean read identity (internal consistency metric)
    rid_min: Optional[float] = None  # Minimum read identity (worst-case read)
    merge_indel_count: Optional[int] = None  # Number of indels consumed by merging (for cumulative tracking)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class OverlapMergeInfo(NamedTuple):
    """Information about a single overlap merge event for quality reporting.

    Captures the inputs, overlap geometry, and resulting length of one
    merge so quality reports can trace how a merged sequence was built.
    """
    specimen: str  # Specimen name
    iteration: int  # Merge iteration (1 = first pass, 2+ = iterative)
    input_clusters: List[str]  # Cluster IDs involved in merge
    input_lengths: List[int]  # Original sequence lengths
    input_rics: List[int]  # RiC values of input sequences
    overlap_bp: int  # Overlap region size in bp
    prefix_bp: int  # Extension before overlap
    suffix_bp: int  # Extension after overlap
    output_length: int  # Final merged sequence length
|