speconsense 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""FASTA field customization support for speconsense-summarize.
|
|
2
|
+
|
|
3
|
+
Allows users to control which metadata fields appear in FASTA headers.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
from speconsense.types import ConsensusInfo
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FastaField:
    """Common interface for one optional entry of a FASTA header.

    Each concrete field overrides :meth:`format_value` to render its own
    ``key=value`` text for a given consensus.
    """

    def __init__(self, name: str, description: str):
        """Remember the field's name and its human-readable description."""
        self.description = description
        # The name is also the code under which the field is registered
        self.name = name

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Render this field for ``consensus``.

        Subclasses return the formatted text, or None when the field does
        not apply. The base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SizeField(FastaField):
    """``size`` header field (described as total reads across merged variants)."""

    def __init__(self):
        super().__init__('size', 'Total reads across merged variants')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``size=<consensus.size>``; this field is never skipped."""
        total_reads = consensus.size
        return f"size={total_reads}"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class RicField(FastaField):
    """``ric`` header field (described as reads in consensus)."""

    def __init__(self):
        super().__init__('ric', 'Reads in consensus')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``ric=<consensus.ric>``; this field is never skipped."""
        reads_in_consensus = consensus.ric
        return f"ric={reads_in_consensus}"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LengthField(FastaField):
    """``length`` header field: the length of the consensus sequence."""

    def __init__(self):
        super().__init__('length', 'Sequence length in bases')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``length=<len(consensus.sequence)>``; never skipped."""
        n_bases = len(consensus.sequence)
        return f"length={n_bases}"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class RawRicField(FastaField):
    """``rawric`` header field built from ``consensus.raw_ric``."""

    def __init__(self):
        super().__init__('rawric', 'RiC values of .raw source variants')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return the raw RiC values, largest first, joined with ``+``.

        Yields None when ``consensus.raw_ric`` is missing or empty.
        """
        raw_values = consensus.raw_ric
        if not raw_values or not len(raw_values) > 0:
            return None

        descending = sorted(raw_values, reverse=True)
        joined = '+'.join(map(str, descending))
        return f"rawric={joined}"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class RawLenField(FastaField):
    """``rawlen`` header field built from ``consensus.raw_len``."""

    def __init__(self):
        super().__init__('rawlen', 'Lengths of merged source sequences')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return the source lengths, largest first, joined with ``+``.

        Yields None when ``consensus.raw_len`` is missing or empty.
        """
        raw_lengths = consensus.raw_len
        if not raw_lengths or not len(raw_lengths) > 0:
            return None

        descending = sorted(raw_lengths, reverse=True)
        joined = '+'.join(map(str, descending))
        return f"rawlen={joined}"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class SnpField(FastaField):
    """``snp`` header field built from ``consensus.snp_count``."""

    def __init__(self):
        super().__init__('snp', 'Number of IUPAC ambiguity positions from merging')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``snp=<count>`` only for a positive count, otherwise None."""
        count = consensus.snp_count
        # Unset or non-positive counts are left out of the header
        if count is None or not count > 0:
            return None
        return f"snp={count}"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class AmbigField(FastaField):
    """``ambig`` header field: number of non-ACGT characters in the sequence."""

    def __init__(self):
        super().__init__('ambig', 'Count of IUPAC ambiguity codes in consensus')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``ambig=<count>`` when any character is not A/C/G/T.

        The comparison is case-insensitive; None is returned for a count
        of zero.
        """
        ambig_count = 0
        for base in consensus.sequence:
            # Anything other than the four plain bases is tallied
            if base.upper() not in 'ACGT':
                ambig_count += 1

        return f"ambig={ambig_count}" if ambig_count > 0 else None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class PrimersField(FastaField):
    """``primers`` header field built from ``consensus.primers``."""

    def __init__(self):
        super().__init__('primers', 'Detected primer names')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return the primer names joined with commas, or None if there are none."""
        primer_names = consensus.primers
        if not primer_names:
            return None
        joined = ','.join(primer_names)
        return f"primers={joined}"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class RidField(FastaField):
    """``rid`` header field built from ``consensus.rid``."""

    def __init__(self):
        super().__init__('rid', 'Mean read identity (percentage)')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``rid=`` followed by ``consensus.rid * 100`` to one decimal.

        Yields None when ``consensus.rid`` is None.
        """
        identity = consensus.rid
        if identity is None:
            return None
        # Stored value is scaled by 100 for display (presumably a 0-1 fraction)
        percent = identity*100
        return f"rid={percent:.1f}"
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class RidMinField(FastaField):
    """``rid_min`` header field built from ``consensus.rid_min``."""

    def __init__(self):
        super().__init__('rid_min', 'Minimum read identity (percentage)')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``rid_min=`` followed by ``consensus.rid_min * 100`` to one decimal.

        Yields None when ``consensus.rid_min`` is None.
        """
        lowest_identity = consensus.rid_min
        if lowest_identity is None:
            return None
        # Stored value is scaled by 100 for display (presumably a 0-1 fraction)
        percent = lowest_identity*100
        return f"rid_min={percent:.1f}"
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class GroupField(FastaField):
    """``group`` header field parsed out of ``consensus.sample_name``."""

    def __init__(self):
        super().__init__('group', 'Variant group number')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``group=<digits>`` taken from the end of the sample name.

        The name must end in ``-<digits>.v<digits>``, optionally followed by
        ``.raw<digits>`` (e.g. "...-1.v1" or "...-2.v1.raw1"); otherwise
        None is returned.
        """
        hit = re.search(r'-(\d+)\.v\d+(?:\.raw\d+)?$', consensus.sample_name)
        if hit is None:
            return None
        group_number = hit.group(1)
        return f"group={group_number}"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class VariantField(FastaField):
    """``variant`` header field parsed out of ``consensus.sample_name``."""

    def __init__(self):
        super().__init__('variant', 'Variant identifier within group')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``variant=v<digits>`` taken from the end of the sample name.

        The name must end in ``.v<digits>``, optionally followed by
        ``.raw<digits>`` (e.g. "...-1.v1" -> "v1", "...-1.v1.raw1" -> "v1");
        otherwise None is returned.
        """
        hit = re.search(r'\.(v\d+)(?:\.raw\d+)?$', consensus.sample_name)
        if hit is None:
            return None
        variant_id = hit.group(1)
        return f"variant={variant_id}"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Field registry: every field is stored under its own ``name``, so the
# lookup key and the field code are always the same string.
# Insertion order: size, ric, length, rawric, rawlen, snp, ambig, rid,
# rid_min, primers, group, variant.
FASTA_FIELDS = {
    field.name: field
    for field in (
        SizeField(),
        RicField(),
        LengthField(),
        RawRicField(),
        RawLenField(),
        SnpField(),
        AmbigField(),
        RidField(),
        RidMinField(),
        PrimersField(),
        GroupField(),
        VariantField(),
    )
}
|
|
161
|
+
|
|
162
|
+
# Named presets: each maps to the ordered list of field names it expands to.
FASTA_FIELD_PRESETS = {
    'default': [
        'size', 'ric', 'rawric', 'rawlen', 'snp', 'ambig', 'primers',
    ],
    'minimal': [
        'size', 'ric',
    ],
    'qc': [
        'size', 'ric', 'length', 'rid', 'ambig',
    ],
    'full': [
        'size', 'ric', 'length', 'rawric', 'rawlen', 'snp', 'ambig', 'rid', 'primers',
    ],
    # Header consists of the sample name only
    'id-only': [],
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def validate_field_registry():
    """Check that every field named by a preset is a key of FASTA_FIELDS.

    Raises:
        ValueError: For the first preset entry that is not in the registry.
    """
    for preset_name in FASTA_FIELD_PRESETS:
        for field_name in FASTA_FIELD_PRESETS[preset_name]:
            if field_name in FASTA_FIELDS:
                continue
            raise ValueError(f"Preset '{preset_name}' references unknown field '{field_name}'")


# Run the consistency check once, when the module is imported
validate_field_registry()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def parse_fasta_fields(spec: str) -> List[FastaField]:
    """
    Parse --fasta-fields specification into list of field objects.
    Supports preset composition with union semantics.

    Matching is case-insensitive and whitespace around entries is ignored.
    Empty entries (for example from a trailing or doubled comma) are skipped.
    A spec that names nothing at all selects the "default" preset.

    Args:
        spec: Comma-separated list of preset names and/or field names
              Examples:
                - "default" (single preset)
                - "minimal,qc" (preset union)
                - "size,ric,primers" (field list)
                - "minimal,rid" (preset + fields)
                - "size,ric," (trailing comma is tolerated)

    Returns:
        List of FastaField objects in specified order, duplicates removed

    Raises:
        ValueError: If spec contains unknown preset or field names
    """
    # Normalise each entry and drop the empty ones that stray commas produce
    stripped = (part.strip() for part in spec.lower().split(','))
    items = [item for item in stripped if item]
    if not items:
        # Nothing was named: behave as if "default" had been requested
        items = ["default"]

    # Expand presets in place; a name is looked up as a preset first and
    # only then as an individual field
    requested_names = []
    for item in items:
        if item in FASTA_FIELD_PRESETS:
            requested_names.extend(FASTA_FIELD_PRESETS[item])
        elif item in FASTA_FIELDS:
            requested_names.append(item)
        else:
            # Unknown item - provide helpful error
            available_fields = ', '.join(sorted(FASTA_FIELDS.keys()))
            available_presets = ', '.join(sorted(FASTA_FIELD_PRESETS.keys()))
            raise ValueError(
                f"Unknown preset or field name: '{item}'\n"
                f"  Available presets: {available_presets}\n"
                f"  Available fields: {available_fields}"
            )

    # dict.fromkeys keeps the first occurrence of each name, in order
    unique_names = dict.fromkeys(requested_names)

    return [FASTA_FIELDS[name] for name in unique_names]
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def format_fasta_header(consensus: ConsensusInfo, fields: List[FastaField]) -> str:
    """
    Build the FASTA header text for one consensus.

    The header starts with ``consensus.sample_name``; each field's formatted
    value follows, separated by single spaces.

    Args:
        consensus: Consensus information
        fields: Fields to render, in output order

    Returns:
        Formatted header line (without leading '>')
    """
    rendered = (field.format_value(consensus) for field in fields)
    # A field that returns None does not apply to this consensus and is omitted
    applicable = (value for value in rendered if value is not None)
    return ' '.join([consensus.sample_name, *applicable])
|