speconsense 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ """FASTA field customization support for speconsense-summarize.
2
+
3
+ Allows users to control which metadata fields appear in FASTA headers.
4
+ """
5
+
6
+ import re
7
+ from typing import List, Optional
8
+
9
+ from speconsense.types import ConsensusInfo
10
+
11
+
12
class FastaField:
    """Common interface for one optional entry of a FASTA header.

    Each subclass supplies a fixed name/description pair and overrides
    ``format_value`` to render its entry for a consensus record.
    """

    def __init__(self, name: str, description: str):
        """Record the field's name and its human-readable description."""
        # The name is also the code used to select the field.
        self.name, self.description = name, description

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Render this field for ``consensus``.

        Subclasses return the formatted text, or None when the field is not
        applicable. The base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError
22
+
23
+
24
class SizeField(FastaField):
    """Header entry ``size=<value>`` built from ``consensus.size``."""

    def __init__(self):
        super().__init__('size', 'Total reads across merged variants')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``size=<consensus.size>``; never returns None."""
        total = consensus.size
        return f"size={total}"
30
+
31
+
32
class RicField(FastaField):
    """Header entry ``ric=<value>`` built from ``consensus.ric``."""

    def __init__(self):
        super().__init__('ric', 'Reads in consensus')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``ric=<consensus.ric>``; never returns None."""
        reads = consensus.ric
        return f"ric={reads}"
38
+
39
+
40
class LengthField(FastaField):
    """Header entry ``length=<n>`` where n is ``len(consensus.sequence)``."""

    def __init__(self):
        super().__init__('length', 'Sequence length in bases')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``length=<n>``; never returns None."""
        n_bases = len(consensus.sequence)
        return f"length={n_bases}"
46
+
47
+
48
class RawRicField(FastaField):
    """Header entry ``rawric=<a>+<b>+...`` built from ``consensus.raw_ric``."""

    def __init__(self):
        super().__init__('rawric', 'RiC values of .raw source variants')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return the raw RiC values joined by '+', largest first.

        Returns None when ``consensus.raw_ric`` is None or empty.
        """
        # Truthiness already covers both None and an empty collection, so
        # no separate length check is needed.
        if not consensus.raw_ric:
            return None
        joined = '+'.join(map(str, sorted(consensus.raw_ric, reverse=True)))
        return f"rawric={joined}"
57
+
58
+
59
class RawLenField(FastaField):
    """Header entry ``rawlen=<a>+<b>+...`` built from ``consensus.raw_len``."""

    def __init__(self):
        super().__init__('rawlen', 'Lengths of merged source sequences')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return the source lengths joined by '+', largest first.

        Returns None when ``consensus.raw_len`` is None or empty.
        """
        # Truthiness already covers both None and an empty collection, so
        # no separate length check is needed.
        if not consensus.raw_len:
            return None
        joined = '+'.join(map(str, sorted(consensus.raw_len, reverse=True)))
        return f"rawlen={joined}"
68
+
69
+
70
class SnpField(FastaField):
    """Header entry ``snp=<n>`` built from ``consensus.snp_count``."""

    def __init__(self):
        super().__init__('snp', 'Number of IUPAC ambiguity positions from merging')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``snp=<n>`` for a positive count, otherwise None."""
        count = consensus.snp_count
        # A missing or non-positive count means there is nothing to report.
        return f"snp={count}" if count is not None and count > 0 else None
78
+
79
+
80
class AmbigField(FastaField):
    """Header entry ``ambig=<n>`` counting non-ACGT characters in the sequence."""

    def __init__(self):
        super().__init__('ambig', 'Count of IUPAC ambiguity codes in consensus')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``ambig=<n>`` when n > 0, otherwise None.

        n is the number of characters in ``consensus.sequence`` whose
        upper-cased form is not one of A, C, G or T.
        """
        ambig_count = 0
        for base in consensus.sequence:
            # Comparison is case-insensitive: lower-case acgt are not counted.
            if base.upper() not in 'ACGT':
                ambig_count += 1
        return f"ambig={ambig_count}" if ambig_count > 0 else None
90
+
91
+
92
class PrimersField(FastaField):
    """Header entry ``primers=<a>,<b>,...`` built from ``consensus.primers``."""

    def __init__(self):
        super().__init__('primers', 'Detected primer names')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return the primer names joined by commas, or None if there are none."""
        if not consensus.primers:
            return None
        names = ','.join(consensus.primers)
        return f"primers={names}"
100
+
101
+
102
class RidField(FastaField):
    """Header entry ``rid=<pct>`` built from ``consensus.rid``."""

    def __init__(self):
        super().__init__('rid', 'Mean read identity (percentage)')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``rid=<pct>`` with one decimal place, or None if rid is None.

        The stored value is multiplied by 100 before formatting.
        """
        if consensus.rid is None:
            return None
        percent = consensus.rid * 100
        return f"rid={percent:.1f}"
110
+
111
+
112
class RidMinField(FastaField):
    """Header entry ``rid_min=<pct>`` built from ``consensus.rid_min``."""

    def __init__(self):
        super().__init__('rid_min', 'Minimum read identity (percentage)')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``rid_min=<pct>`` with one decimal place, or None if unset.

        The stored value is multiplied by 100 before formatting.
        """
        if consensus.rid_min is None:
            return None
        percent = consensus.rid_min * 100
        return f"rid_min={percent:.1f}"
120
+
121
+
122
class GroupField(FastaField):
    """Header entry ``group=<n>`` parsed from the end of ``sample_name``."""

    def __init__(self):
        super().__init__('group', 'Variant group number')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``group=<n>`` or None when the name has no group suffix.

        The number is taken from a trailing ``-<n>.v<m>`` suffix, optionally
        followed by ``.raw<k>`` (e.g. "...-1.v1" or "...-2.v1.raw1").
        """
        found = re.search(r'-(\d+)\.v\d+(?:\.raw\d+)?$', consensus.sample_name)
        return None if found is None else f"group={found.group(1)}"
132
+
133
+
134
class VariantField(FastaField):
    """Header entry ``variant=v<m>`` parsed from the end of ``sample_name``."""

    def __init__(self):
        super().__init__('variant', 'Variant identifier within group')

    def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
        """Return ``variant=v<m>`` or None when the name has no variant suffix.

        The identifier is taken from a trailing ``.v<m>`` suffix, optionally
        followed by ``.raw<k>`` (e.g. "...-1.v1" -> "v1",
        "...-1.v1.raw1" -> "v1").
        """
        found = re.search(r'\.(v\d+)(?:\.raw\d+)?$', consensus.sample_name)
        return None if found is None else f"variant={found.group(1)}"
144
+
145
+
146
# Field registry. Every field is stored under its own ``name`` attribute
# (size, ric, length, rawric, rawlen, snp, ambig, rid, rid_min, primers,
# group, variant), so codes and names cannot drift apart.
FASTA_FIELDS = {
    field.name: field
    for field in (
        SizeField(),
        RicField(),
        LengthField(),
        RawRicField(),
        RawLenField(),
        SnpField(),
        AmbigField(),
        RidField(),
        RidMinField(),
        PrimersField(),
        GroupField(),
        VariantField(),
    )
}
161
+
162
# Named presets: each maps to an ordered list of field names.
FASTA_FIELD_PRESETS = {
    'default': [
        'size', 'ric', 'rawric', 'rawlen', 'snp', 'ambig', 'primers',
    ],
    'minimal': [
        'size', 'ric',
    ],
    'qc': [
        'size', 'ric', 'length', 'rid', 'ambig',
    ],
    'full': [
        'size', 'ric', 'length', 'rawric', 'rawlen', 'snp', 'ambig', 'rid',
        'primers',
    ],
    # No fields at all.
    'id-only': [],
}
170
+
171
+
172
def validate_field_registry():
    """Check that every field named by a preset is present in FASTA_FIELDS.

    Raises:
        ValueError: For the first preset entry (in preset order, then field
            order) that has no matching key in FASTA_FIELDS.
    """
    unknown_entries = (
        (preset_name, field_name)
        for preset_name, field_names in FASTA_FIELD_PRESETS.items()
        for field_name in field_names
        if field_name not in FASTA_FIELDS
    )
    first_unknown = next(unknown_entries, None)
    if first_unknown is not None:
        preset_name, field_name = first_unknown
        raise ValueError(f"Preset '{preset_name}' references unknown field '{field_name}'")


# Run the check once at import so a bad preset fails immediately.
validate_field_registry()
182
+
183
+
184
def parse_fasta_fields(spec: Optional[str]) -> List[FastaField]:
    """
    Parse a --fasta-fields specification into a list of field objects.

    Presets and individual field names may be mixed; the result is the
    ordered union of everything named, keeping the first occurrence of
    each field.

    Args:
        spec: Comma-separated list of preset names and/or field names.
            Matching is case-insensitive and whitespace around entries is
            ignored. Empty entries (e.g. from a trailing or doubled comma)
            are skipped. None, an empty string, or a spec consisting only
            of empty entries selects the "default" preset.
            Examples:
            - "default" (single preset)
            - "minimal,qc" (preset union)
            - "size,ric,primers" (field list)
            - "minimal,rid" (preset + fields)

    Returns:
        List of FastaField objects in specified order, duplicates removed

    Raises:
        ValueError: If spec contains unknown preset or field names
    """
    normalized = (spec or "").strip().lower()

    # str.split(',') yields '' for stray commas ("size,ric," or "size,,ric").
    # Drop those so they are not reported as an unknown field named ''.
    items = [
        token
        for token in (part.strip() for part in normalized.split(','))
        if token
    ]
    if not items:
        # Nothing was requested: fall back to the "default" preset.
        items = ["default"]

    # Expand presets and collect field names, preserving first-seen order.
    all_field_names = []
    seen = set()

    for item in items:
        # Presets are checked before individual field names.
        if item in FASTA_FIELD_PRESETS:
            candidates = FASTA_FIELD_PRESETS[item]
        elif item in FASTA_FIELDS:
            candidates = [item]
        else:
            # Unknown item - list what is available to help the user.
            available_fields = ', '.join(sorted(FASTA_FIELDS.keys()))
            available_presets = ', '.join(sorted(FASTA_FIELD_PRESETS.keys()))
            raise ValueError(
                f"Unknown preset or field name: '{item}'\n"
                f"  Available presets: {available_presets}\n"
                f"  Available fields: {available_fields}"
            )

        for field_name in candidates:
            if field_name not in seen:
                seen.add(field_name)
                all_field_names.append(field_name)

    # Convert field names to field objects.
    return [FASTA_FIELDS[name] for name in all_field_names]
242
+
243
+
244
def format_fasta_header(consensus: ConsensusInfo, fields: List[FastaField]) -> str:
    """
    Build a FASTA header line from a consensus and a list of fields.

    Args:
        consensus: Consensus information; its ``sample_name`` comes first
        fields: Fields to render, in output order

    Returns:
        Sample name followed by each applicable field value, separated by
        single spaces (without leading '>')
    """
    rendered = (field.format_value(consensus) for field in fields)
    # A field that returns None is not applicable and is left out.
    applicable = (value for value in rendered if value is not None)
    return ' '.join([consensus.sample_name, *applicable])