debase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/__init__.py +18 -0
- debase/__main__.py +9 -0
- debase/_version.py +3 -0
- debase/build_db.py +190 -0
- debase/cleanup_sequence.py +905 -0
- debase/enzyme_lineage_extractor.py +2169 -0
- debase/lineage_format.py +808 -0
- debase/reaction_info_extractor.py +2331 -0
- debase/substrate_scope_extractor.py +2039 -0
- debase/wrapper.py +303 -0
- debase-0.1.0.dist-info/METADATA +299 -0
- debase-0.1.0.dist-info/RECORD +17 -0
- debase-0.1.0.dist-info/WHEEL +5 -0
- debase-0.1.0.dist-info/entry_points.txt +2 -0
- debase-0.1.0.dist-info/licenses/LICENSE +21 -0
- debase-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,905 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
cleanup_sequence.py - Enhanced protein sequence generator from mutations
|
4
|
+
|
5
|
+
This module takes the output from enzyme_lineage_extractor and generates complete
|
6
|
+
protein sequences by applying mutations throughout the lineage tree.
|
7
|
+
|
8
|
+
Usage:
|
9
|
+
    python -m debase.cleanup_sequence input.csv output.csv
|
10
|
+
"""
|
11
|
+
|
12
|
+
import argparse
|
13
|
+
import logging
|
14
|
+
import re
|
15
|
+
import sys
|
16
|
+
from dataclasses import dataclass, field
|
17
|
+
from pathlib import Path
|
18
|
+
from typing import Dict, List, Optional, Set, Tuple, Union
|
19
|
+
|
20
|
+
import pandas as pd
|
21
|
+
|
22
|
+
|
23
|
+
# === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
|
24
|
+
|
25
|
+
# Characters accepted by SequenceManipulator.validate_sequence: twenty
# single-letter residue codes plus "*".
VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons

# Configure module logger
# Handlers and levels are left to the application (see setup_logging).
log = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
# === 2. DATA MODELS === ------------------------------------------------------
|
32
|
+
|
33
|
+
@dataclass
class Mutation:
    """A single-residue substitution such as ``A123G``.

    ``original`` is the residue being replaced, ``position`` is where it sits
    in the sequence (this class does not fix the indexing base), and
    ``replacement`` is the residue that takes its place.
    """

    original: str
    position: int
    replacement: str

    def __str__(self) -> str:
        # Compact <original><position><replacement> notation.
        return "{0.original}{0.position}{0.replacement}".format(self)
|
42
|
+
|
43
|
+
|
44
|
+
@dataclass
class ComplexMutation:
    """A region replacement, optionally followed by an extension and a stop.

    The residues from ``start_pos`` to ``end_pos`` are replaced by
    ``replacement_seq``; ``extension_seq`` is extra sequence to append, and
    ``has_stop`` records whether the notation ended with ``[STOP]``.
    """

    replacement_seq: str
    start_pos: int
    end_pos: int
    extension_seq: str = ""
    has_stop: bool = False

    def __str__(self) -> str:
        # Assemble the textual form piece by piece, skipping absent parts.
        pieces = [f"{self.replacement_seq}({self.start_pos}-{self.end_pos})"]
        if self.extension_seq:
            pieces.append(self.extension_seq)
        if self.has_stop:
            pieces.append("[STOP]")
        return "".join(pieces)
|
60
|
+
|
61
|
+
|
62
|
+
@dataclass
class Variant:
    """One enzyme variant together with its lineage and sequence data."""

    enzyme_id: str
    parent_enzyme_id: Optional[str]
    mutations: str
    protein_sequence: Optional[str] = None
    generation: Optional[int] = None
    flag: str = ""

    @property
    def has_sequence(self) -> bool:
        """True when ``protein_sequence`` holds something other than whitespace."""
        sequence = self.protein_sequence
        if not sequence:
            return False
        return bool(sequence.strip())

    @property
    def has_complex_mutations(self) -> bool:
        """True when ``flag`` contains the ``complex_mutation`` marker."""
        marker = "complex_mutation"
        return marker in self.flag
|
79
|
+
|
80
|
+
|
81
|
+
@dataclass
class SequenceGenerationResult:
    """Outcome of one attempt to obtain a variant's protein sequence."""

    # The resulting protein sequence.
    sequence: str
    # How the sequence was obtained:
    # "from_parent", "from_child", "from_ancestor", "from_descendant"
    method: str
    # enzyme_id of the variant whose sequence was used as the starting point.
    source_id: str
    # Confidence score; 1.0 unless the caller lowers it.
    confidence: float = 1.0
    # Free-text remarks about how the result was produced.
    notes: str = ""
|
89
|
+
|
90
|
+
|
91
|
+
# === 3. MUTATION PARSING === -------------------------------------------------
|
92
|
+
|
93
|
+
class MutationParser:
    """Parse mutation strings into structured mutation objects.

    A mutation string is a comma-separated list of tokens.  A token such as
    ``A123G`` (one letter or ``*``, digits, one letter or ``*``) is a point
    mutation.  Every other token counts as "complex"; the only complex form
    parsed further is ``SEQ(start-end)EXT`` with an optional ``[STOP]``.
    """

    POINT_MUTATION_PATTERN = re.compile(r"^([A-Za-z\*])([0-9]+)([A-Za-z\*])$")
    COMPLEX_C_TERMINAL_PATTERN = re.compile(r'([A-Z]+)\((\d+)-(\d+)\)([A-Z]*)\[STOP\]')
    COMPLEX_C_TERMINAL_NO_STOP = re.compile(r'([A-Z]+)\((\d+)-(\d+)\)([A-Z]+)')

    @staticmethod
    def _split_tokens(mutation_str: Optional[str]) -> List[str]:
        """Split on commas, strip each token, and drop empty tokens.

        Anything that is not a string (``None``, NaN, ...) yields ``[]``.
        """
        if not isinstance(mutation_str, str):
            return []
        return [tok.strip() for tok in mutation_str.split(",") if tok.strip()]

    @classmethod
    def parse_mutations(cls, mutation_str: str) -> List["Mutation"]:
        """Parse the point mutations contained in ``mutation_str``.

        Tokens that do not match ``POINT_MUTATION_PATTERN`` are skipped.
        Residue letters are upper-cased in the returned objects.
        """
        mutations = []
        for token in cls._split_tokens(mutation_str):
            match = cls.POINT_MUTATION_PATTERN.match(token)
            if match is None:
                continue
            original, position, replacement = match.groups()
            try:
                mutations.append(Mutation(
                    original=original.upper(),
                    position=int(position),
                    replacement=replacement.upper(),
                ))
            except ValueError as e:
                # Best effort: skip a token whose position cannot be converted.
                log.warning(f"Failed to parse mutation '{token}': {e}")
        return mutations

    @classmethod
    def parse_complex_c_terminal(cls, mutation_str: str) -> Optional["ComplexMutation"]:
        """Parse the first ``SEQ(start-end)EXT[STOP]``-style mutation found.

        The ``[STOP]`` form is searched for first, across the whole string;
        the form without ``[STOP]`` requires a non-empty extension.  Returns
        ``None`` when neither form occurs or the input is not a string.
        """
        if not isinstance(mutation_str, str):
            return None

        candidates = (
            (cls.COMPLEX_C_TERMINAL_PATTERN, True),
            (cls.COMPLEX_C_TERMINAL_NO_STOP, False),
        )
        for pattern, has_stop in candidates:
            match = pattern.search(mutation_str)
            if match:
                replacement_seq, start, end, extension = match.groups()
                return ComplexMutation(
                    replacement_seq=replacement_seq,
                    start_pos=int(start),
                    end_pos=int(end),
                    extension_seq=extension,
                    has_stop=has_stop,
                )
        return None

    @classmethod
    def detect_complex_mutations(cls, mutation_str: str) -> List[str]:
        """Return the tokens of ``mutation_str`` that are not point mutations.

        A token is standard exactly when ``parse_mutations`` would accept it.
        Matching the pattern directly, instead of comparing against the
        normalised ``str(Mutation)``, keeps ``a10g`` and ``A010G`` from being
        reported as complex.
        """
        return [
            token
            for token in cls._split_tokens(mutation_str)
            if cls.POINT_MUTATION_PATTERN.match(token) is None
        ]
|
163
|
+
|
164
|
+
|
165
|
+
# === 4. SEQUENCE MANIPULATION === --------------------------------------------
|
166
|
+
|
167
|
+
class SequenceManipulator:
    """Apply mutations to, or reverse them on, protein sequences.

    Mutation positions may be 0- or 1-based.  The base is inferred per call
    by counting how many mutations agree with the sequence under each base.
    Ties, including "nothing matches", resolve to 1-based, matching the
    documented default.
    """

    @staticmethod
    def validate_sequence(seq: str) -> bool:
        """Return True if every character of ``seq`` is in VALID_AMINO_ACIDS."""
        return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())

    @staticmethod
    def _count_matches(seq, mutations, offset: int, residue_attr: str) -> int:
        """Count mutations whose ``residue_attr`` letter is at ``position - offset``."""
        hits = 0
        for mut in mutations:
            idx = mut.position - offset
            if 0 <= idx < len(seq) and seq[idx].upper() == getattr(mut, residue_attr).upper():
                hits += 1
        return hits

    @staticmethod
    def _pick_offset(seq, mutations, residue_attr: str) -> int:
        """Return 0 only when 0-based indexing explains strictly more mutations."""
        zero_matches = SequenceManipulator._count_matches(seq, mutations, 0, residue_attr)
        one_matches = SequenceManipulator._count_matches(seq, mutations, 1, residue_attr)
        return 0 if zero_matches > one_matches else 1

    @staticmethod
    def _substitute(seq: List[str], mut, offset: int, expected: str, new: str) -> bool:
        """Swap ``expected`` for ``new`` at the mutation's position, in place.

        The preferred indexing base is tried first, then the other one.
        Returns False when neither position holds ``expected``.
        """
        for base in (offset, 1 - offset):
            idx = mut.position - base
            if 0 <= idx < len(seq) and seq[idx].upper() == expected.upper():
                seq[idx] = new
                return True
        return False

    @staticmethod
    def determine_indexing(parent_seq: str, mutations: List["Mutation"]) -> int:
        """Return the indexing base (0 or 1) the mutations appear to use.

        Defaults to 1 when there is nothing to compare, and also on ties.
        """
        if not mutations or not parent_seq:
            return 1  # Default to 1-based
        return SequenceManipulator._pick_offset(parent_seq, mutations, "original")

    @classmethod
    def apply_mutations(cls, parent_seq: str, mutation_str: str) -> str:
        """Return ``parent_seq`` with the mutations in ``mutation_str`` applied.

        Point mutations are applied first, then at most one complex
        C-terminal mutation.  Mutations that do not fit the sequence are
        logged and skipped.  An empty ``parent_seq`` yields ``""``.
        """
        if not parent_seq:
            return ""

        seq = list(parent_seq)

        # Apply point mutations
        mutations = MutationParser.parse_mutations(mutation_str)
        if mutations:
            idx_offset = cls.determine_indexing(parent_seq, mutations)
            for mut in mutations:
                if not cls._substitute(seq, mut, idx_offset, mut.original, mut.replacement):
                    log.warning(
                        f"Mutation {mut} does not match parent sequence at "
                        f"position {mut.position} (tried both 0- and 1-based indexing)"
                    )

        # Apply complex C-terminal mutations
        complex_mut = MutationParser.parse_complex_c_terminal(mutation_str)
        if complex_mut:
            log.info(f"Applying complex C-terminal mutation: {complex_mut}")

            # Convert to 0-indexed
            start_idx = complex_mut.start_pos - 1
            end_idx = complex_mut.end_pos - 1

            if 0 <= start_idx <= end_idx < len(seq):
                # Replace the specified region
                seq[start_idx:end_idx + 1] = list(complex_mut.replacement_seq)

                # A stop drops everything after the replaced region
                if complex_mut.has_stop:
                    seq = seq[:start_idx + len(complex_mut.replacement_seq)]

                # Add extension if present
                if complex_mut.extension_seq:
                    seq.extend(complex_mut.extension_seq)
            else:
                log.warning(
                    f"Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
                    f"{complex_mut.end_pos} for sequence of length {len(seq)}"
                )

        return "".join(seq)

    @classmethod
    def reverse_mutations(cls, child_seq: str, mutation_str: str) -> str:
        """Return the parent sequence obtained by undoing point mutations.

        Only point mutations are reversed; a complex C-terminal mutation is
        left in place and a warning is logged.  With no point mutations the
        child sequence is returned unchanged.
        """
        if not child_seq:
            return ""

        mutations = MutationParser.parse_mutations(mutation_str)

        if MutationParser.parse_complex_c_terminal(mutation_str) is not None:
            log.warning(
                "Complex C-terminal mutation cannot be reversed; "
                "only point mutations are undone"
            )

        if not mutations:
            return child_seq

        seq = list(child_seq)

        # In the child, the *replacement* residue sits at each mutated position.
        idx_offset = cls._pick_offset(child_seq, mutations, "replacement")

        # Reverse mutations (change replacement -> original)
        for mut in mutations:
            if not cls._substitute(seq, mut, idx_offset, mut.replacement, mut.original):
                log.warning(
                    f"Cannot reverse mutation {mut}: replacement amino acid "
                    f"not found at expected position"
                )

        return "".join(seq)
|
294
|
+
|
295
|
+
|
296
|
+
# === 5. LINEAGE NAVIGATION === -----------------------------------------------
|
297
|
+
|
298
|
+
class LineageNavigator:
    """Navigate parent/child relationships between enzyme variants.

    Relationships come from the ``enzyme_id`` and ``parent_enzyme_id``
    columns of ``df``.  Every traversal tracks visited ids, so a cyclic or
    self-referencing lineage cannot cause an endless loop.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self._build_relationships()

    def _build_relationships(self) -> None:
        """Build the ``parent_to_children`` and ``child_to_parent`` maps.

        Rows whose parent is empty, ``None`` or NaN add no relationship.  If
        an id appears in several rows, the last row decides its parent.
        """
        self.parent_to_children: Dict[str, List[str]] = {}
        self.child_to_parent: Dict[str, str] = {}

        child_ids = self.df["enzyme_id"]
        if "parent_enzyme_id" in self.df.columns:
            parent_ids = self.df["parent_enzyme_id"]
        else:
            parent_ids = [None] * len(child_ids)

        for child_id, parent_id in zip(child_ids, parent_ids):
            # NaN is truthy, so test for it explicitly before the emptiness check.
            if pd.isna(parent_id) or not parent_id:
                continue
            self.child_to_parent[child_id] = parent_id
            self.parent_to_children.setdefault(parent_id, []).append(child_id)

    def get_ancestors(self, variant_id: str) -> List[str]:
        """Return the ancestors of ``variant_id``, immediate parent first.

        The walk stops at a variant with no recorded parent, or as soon as
        an id repeats.
        """
        ancestors = []
        seen = {variant_id}
        current_id = self.child_to_parent.get(variant_id)

        while current_id and current_id not in seen:
            seen.add(current_id)
            ancestors.append(current_id)
            current_id = self.child_to_parent.get(current_id)

        return ancestors

    def get_descendants(self, variant_id: str) -> List[str]:
        """Return all descendants of ``variant_id`` in breadth-first order."""
        descendants = []
        visited = {variant_id}
        frontier = [variant_id]

        # Expand one generation at a time; avoids O(n) pops from a list head.
        while frontier:
            next_frontier = []
            for current_id in frontier:
                for child in self.parent_to_children.get(current_id, []):
                    if child not in visited:
                        visited.add(child)
                        descendants.append(child)
                        next_frontier.append(child)
            frontier = next_frontier

        return descendants

    def _path_down(self, from_id: str, to_id: str) -> Optional[List[str]]:
        """Return the path from ``from_id`` down to a descendant, or None."""
        came_from = {from_id: None}
        frontier = [from_id]

        while frontier and to_id not in came_from:
            next_frontier = []
            for node in frontier:
                for child in self.parent_to_children.get(node, []):
                    if child not in came_from:
                        came_from[child] = node
                        next_frontier.append(child)
            frontier = next_frontier

        if to_id == from_id or to_id not in came_from:
            return None

        # Follow the recorded predecessors back to the start, then flip.
        path = [to_id]
        while path[-1] != from_id:
            path.append(came_from[path[-1]])
        path.reverse()
        return path

    def _path_up(self, from_id: str, to_id: str) -> Optional[List[str]]:
        """Return the path from ``from_id`` up to an ancestor, or None."""
        path = [from_id]
        for ancestor in self.get_ancestors(from_id):
            path.append(ancestor)
            if ancestor == to_id:
                return path
        return None

    def find_path(self, from_id: str, to_id: str) -> Optional[List[str]]:
        """Find the lineage path between two variants, if one exists.

        The returned list starts with ``from_id`` and ends with ``to_id``.
        Descendants are searched first, then ancestors.  Returns ``None``
        when the two are not on one line of descent or are the same id.
        """
        forward = self._path_down(from_id, to_id)
        if forward is not None:
            return forward
        return self._path_up(from_id, to_id)
|
386
|
+
|
387
|
+
|
388
|
+
# === 6. SEQUENCE GENERATOR === -----------------------------------------------
|
389
|
+
|
390
|
+
class SequenceGenerator:
    """Derive missing protein sequences from known ones along the lineage.

    A "ground truth" is any row of ``df`` whose ``protein_sequence`` is
    non-blank.  Sequences are produced by applying a variant's mutations to
    an ancestor's sequence ("up") or by reversing a child's mutations on the
    child's sequence ("down").
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.navigator = LineageNavigator(df)
        self.manipulator = SequenceManipulator()
        self._update_ground_truths()

    def _row(self, enzyme_id: str) -> pd.Series:
        """Return the first row whose ``enzyme_id`` matches.

        Raises ``IndexError`` when no such row exists.
        """
        return self.df[self.df["enzyme_id"] == enzyme_id].iloc[0]

    def _update_ground_truths(self) -> None:
        """Recompute ``ground_truth_ids`` from the current frame contents."""
        sequences = self.df["protein_sequence"]
        has_sequence = sequences.notna() & (sequences.str.strip() != "")
        self.ground_truth_ids = set(self.df.loc[has_sequence, "enzyme_id"])

    def find_best_ground_truth(
        self,
        variant_id: str,
        has_complex_mutation: bool
    ) -> Tuple[str, str]:
        """
        Find the best ground truth sequence to use for generation.

        Order of preference: the direct parent (point mutations only), a
        direct child, then for complex mutations the first descendant or the
        parent, then the nearest ancestor, then the first descendant.

        Returns:
            (ground_truth_id, direction) where direction is 'up' or 'down';
            ("", "") when no ground truth is reachable.
        """
        parent_id = self._row(variant_id).get("parent_enzyme_id")
        known = self.ground_truth_ids
        parent_known = parent_id in known

        # Check direct parent
        if parent_known and not has_complex_mutation:
            return parent_id, "up"

        # Check direct children.  Reaching this point means the mutation is
        # complex or the parent has no sequence, so a sequenced child wins.
        direct_children = self.navigator.parent_to_children.get(variant_id, [])
        child_gts = [c for c in direct_children if c in known]
        if child_gts:
            return child_gts[0], "down"

        desc_gts = [d for d in self.navigator.get_descendants(variant_id) if d in known]
        anc_gts = [a for a in self.navigator.get_ancestors(variant_id) if a in known]

        # Prioritize based on mutation type
        if has_complex_mutation and desc_gts:
            return desc_gts[0], "down"
        if has_complex_mutation and parent_known:
            return parent_id, "up"

        # Return closest ground truth
        if anc_gts:
            return anc_gts[0], "up"
        if desc_gts:
            return desc_gts[0], "down"

        return "", ""

    def generate_from_parent(
        self,
        variant_id: str,
        parent_id: str
    ) -> Optional["SequenceGenerationResult"]:
        """Generate a sequence by applying the variant's mutations to ``parent_id``.

        Returns ``None`` when the parent has no sequence or the variant has
        no mutations.
        """
        parent_seq = self._row(parent_id).get("protein_sequence", "")
        if not parent_seq:
            return None

        mutations = self._row(variant_id).get("mutations", "")
        if not mutations:
            return None

        return SequenceGenerationResult(
            sequence=self.manipulator.apply_mutations(parent_seq, mutations),
            method="from_parent",
            source_id=parent_id,
            confidence=1.0
        )

    def _generate_from_ancestor(
        self,
        variant_id: str,
        ancestor_id: str
    ) -> Optional["SequenceGenerationResult"]:
        """Generate a sequence from a non-direct ancestor.

        Mutations are applied generation by generation, starting just below
        ``ancestor_id`` and ending with the variant's own, so intermediate
        changes are not skipped.  Intermediates with no mutations leave the
        sequence unchanged.  Returns ``None`` when the ancestor is not on the
        variant's parent chain, has no sequence, or the variant has no
        mutations.
        """
        # Collect the chain variant -> ... -> child of ancestor_id.
        chain = [variant_id]
        for ancestor in self.navigator.get_ancestors(variant_id):
            if ancestor == ancestor_id:
                break
            chain.append(ancestor)
        else:
            return None

        sequence = self._row(ancestor_id).get("protein_sequence", "")
        if not sequence or not self._row(variant_id).get("mutations", ""):
            return None

        # Walk down from the oldest intermediate to the variant itself.
        for step_id in reversed(chain):
            step_mutations = self._row(step_id).get("mutations", "")
            if step_mutations:
                sequence = self.manipulator.apply_mutations(sequence, step_mutations)

        return SequenceGenerationResult(
            sequence=sequence,
            method="from_ancestor",
            source_id=ancestor_id,
            confidence=0.7,
            notes="Generated from non-direct ancestor"
        )

    def generate_from_child(
        self,
        variant_id: str,
        child_id: str
    ) -> Optional["SequenceGenerationResult"]:
        """Generate a sequence by reversing ``child_id``'s mutations on its sequence.

        Returns ``None`` when the child lacks a sequence or mutations.
        """
        child_row = self._row(child_id)
        child_seq = child_row.get("protein_sequence", "")
        child_mutations = child_row.get("mutations", "")

        if not child_seq or not child_mutations:
            return None

        return SequenceGenerationResult(
            sequence=self.manipulator.reverse_mutations(child_seq, child_mutations),
            method="from_child",
            source_id=child_id,
            confidence=0.9
        )

    def generate_sequence(self, variant_id: str) -> Optional["SequenceGenerationResult"]:
        """Generate a sequence for a variant using the best available method.

        Returns the existing sequence (method ``"existing"``) if present,
        otherwise a generated one, or ``None`` when nothing can be derived.
        """
        variant_row = self._row(variant_id)

        # Check if already has sequence
        if variant_row.get("protein_sequence", "").strip():
            return SequenceGenerationResult(
                sequence=variant_row["protein_sequence"],
                method="existing",
                source_id=variant_id,
                confidence=1.0
            )

        parent_id = variant_row.get("parent_enzyme_id")
        mutations = variant_row.get("mutations", "")

        # Check for complex mutations
        complex_muts = MutationParser.detect_complex_mutations(mutations) if mutations else []
        has_complex = bool(complex_muts)

        # Find best ground truth
        gt_id, direction = self.find_best_ground_truth(variant_id, has_complex)

        if not gt_id:
            log.warning(f"No suitable ground truth found for {variant_id}")
            return None

        log.info(f"Using {gt_id} as ground truth ({direction} direction) for {variant_id}")

        if direction == "up":
            # With no recorded mutations there is nothing to apply.  The
            # child-reversal branch below must not run here: it would undo
            # the parent's own mutations and yield the grandparent's sequence.
            if not parent_id or not mutations:
                return None
            if gt_id == parent_id:
                return self.generate_from_parent(variant_id, parent_id)
            # Non-direct ancestor - less reliable
            return self._generate_from_ancestor(variant_id, gt_id)

        # direction == "down"
        direct_children = self.navigator.parent_to_children.get(variant_id, [])
        if gt_id in direct_children:
            return self.generate_from_child(variant_id, gt_id)

        # Try to find path through direct child
        path = self.navigator.find_path(variant_id, gt_id)
        if path and len(path) > 1:
            direct_child = path[1]
            result = self.generate_from_child(variant_id, direct_child)
            if result:
                result.confidence = 0.8
                result.notes = f"Generated via path through {direct_child}"
            return result

        return None
|
566
|
+
|
567
|
+
|
568
|
+
# === 7. MAIN PROCESSOR === ---------------------------------------------------
|
569
|
+
|
570
|
+
class SequenceProcessor:
    """Run the complete workflow: load, flag, generate sequences, and save.

    Flags are stored as space-separated tokens in the ``flag`` column and
    are always appended, never overwritten, so earlier flags survive.
    """

    def __init__(self, input_csv: Path, output_csv: Path):
        self.input_csv = input_csv
        self.output_csv = output_csv
        self.df = None          # set by load_data()
        self.generator = None   # set by load_data()

    # -- helpers -----------------------------------------------------------

    def _index_of(self, variant_id: str):
        """Return the index label of the first row with this ``enzyme_id``."""
        return self.df[self.df["enzyme_id"] == variant_id].index[0]

    def _has_sequence(self, idx) -> bool:
        """True when the row's ``protein_sequence`` is a non-blank string."""
        value = self.df.at[idx, "protein_sequence"]
        return isinstance(value, str) and bool(value.strip())

    def _missing_ids(self) -> List[str]:
        """Return the ``enzyme_id`` of every row with a blank sequence."""
        blank = self.df["protein_sequence"].str.strip() == ""
        return self.df.loc[blank, "enzyme_id"].tolist()

    def _add_flag(self, idx, flag: str) -> None:
        """Append ``flag`` to the row's flag tokens unless already present."""
        current = self.df.at[idx, "flag"]
        tokens = current.split() if isinstance(current, str) else []
        if flag not in tokens:
            tokens.append(flag)
        self.df.at[idx, "flag"] = " ".join(tokens)

    def _order_by_generation(
        self,
        variant_ids: List[str],
        latest_first: bool = False
    ) -> List[str]:
        """Return ``variant_ids`` ordered by the numeric ``generation`` column.

        The column is converted with ``pd.to_numeric`` because the CSV is
        read with ``keep_default_na=False``, which can leave it as strings
        ("10" would sort before "2").  Ids with a blank or non-numeric
        generation go last.  Without the column the input order is kept.
        """
        if "generation" not in self.df.columns:
            return list(variant_ids)

        numeric = pd.to_numeric(self.df["generation"], errors="coerce")
        generation_of = {}
        for enzyme_id, generation in zip(self.df["enzyme_id"], numeric):
            generation_of.setdefault(enzyme_id, generation)  # first row wins

        def sort_key(variant_id):
            generation = generation_of.get(variant_id)
            if generation is None or pd.isna(generation):
                return (1, 0.0)
            return (0, -generation if latest_first else generation)

        return sorted(variant_ids, key=sort_key)

    def _fill_missing(self, variant_ids: List[str], flag_for) -> int:
        """Generate sequences for ``variant_ids`` and return how many were filled.

        ``flag_for(result)`` supplies the flag to record for each success.
        Ground truths are refreshed after every success so later variants
        can build on the new sequence.
        """
        filled = 0
        for variant_id in variant_ids:
            idx = self._index_of(variant_id)
            if self._has_sequence(idx):
                continue

            result = self.generator.generate_sequence(variant_id)
            if not result:
                continue

            self.df.at[idx, "protein_sequence"] = result.sequence
            self._add_flag(idx, flag_for(result))
            filled += 1
            self.generator._update_ground_truths()
        return filled

    # -- pipeline steps ----------------------------------------------------

    def load_data(self) -> None:
        """Load the input CSV, normalize its columns, and build the generator."""
        # keep_default_na=False keeps empty cells as "" instead of NaN.
        self.df = pd.read_csv(self.input_csv, keep_default_na=False)

        # Detect and handle column format automatically
        self._normalize_columns()

        empty_count = int((self.df["protein_sequence"].str.strip() == "").sum())
        log.info(
            f"Loaded {len(self.df)} rows, "
            f"{empty_count} empty sequences"
        )

        # Ensure required columns exist
        if "flag" not in self.df.columns:
            self.df["flag"] = ""

        # Initialize generator
        self.generator = SequenceGenerator(self.df)

    def _normalize_columns(self) -> None:
        """Rename enzyme_lineage_extractor columns and verify required ones.

        Raises:
            ValueError: if a required column is missing after renaming.
        """
        # Check if this is enzyme_lineage_extractor format
        if "variant_id" in self.df.columns:
            log.info("Detected enzyme_lineage_extractor format, converting columns...")

            column_mapping = {
                "variant_id": "enzyme_id",
                "parent_id": "parent_enzyme_id",
                "aa_seq": "protein_sequence"
            }
            self.df = self.df.rename(columns=column_mapping)

            # Convert mutation format from semicolon to comma-separated
            if "mutations" in self.df.columns:
                self.df["mutations"] = self.df["mutations"].str.replace(
                    ";", ",", regex=False
                )

            log.info("Column conversion complete")

        # Verify required columns exist
        required_columns = ["enzyme_id", "parent_enzyme_id", "mutations", "protein_sequence"]
        missing_columns = [col for col in required_columns if col not in self.df.columns]

        if missing_columns:
            raise ValueError(
                f"Missing required columns: {missing_columns}. "
                f"Found columns: {list(self.df.columns)}"
            )

    def flag_complex_mutations(self) -> None:
        """Flag variants with complex mutations."""
        complex_count = 0

        for idx in list(self.df.index):
            mutations = self.df.at[idx, "mutations"]
            if not mutations:
                continue

            complex_muts = MutationParser.detect_complex_mutations(mutations)
            if complex_muts:
                self._add_flag(idx, "complex_mutation")
                complex_count += 1
                log.info(
                    f"Variant {self.df.at[idx, 'enzyme_id']} has complex mutations: {complex_muts}"
                )

        log.info(f"Flagged {complex_count} variants with complex mutations")

    def process_simple_mutations(self) -> None:
        """Process variants with simple point mutations.

        Only results generated directly from the parent are accepted here;
        everything else is left for the later passes.
        """
        processed = 0

        for idx in list(self.df.index):
            # Skip if already has sequence or has complex mutations
            if self._has_sequence(idx) or "complex_mutation" in str(self.df.at[idx, "flag"]):
                continue

            variant_id = self.df.at[idx, "enzyme_id"]
            result = self.generator.generate_sequence(variant_id)
            if not result or result.method != "from_parent":
                continue

            self.df.at[idx, "protein_sequence"] = result.sequence

            # Check for unexpected length changes
            parent_seq = self.df.loc[
                self.df["enzyme_id"] == result.source_id, "protein_sequence"
            ].iloc[0]

            if len(result.sequence) != len(parent_seq):
                self._add_flag(idx, "unexpected_length_change")
                log.warning(
                    f"Unexpected length change for {variant_id} "
                    f"with standard mutations"
                )

            processed += 1

        log.info(f"Processed {processed} variants with simple mutations")

    def process_complex_mutations(self) -> None:
        """Process variants with complex mutations."""
        flagged = self.df["flag"].str.contains("complex_mutation", na=False)
        complex_variants = self.df.loc[flagged, "enzyme_id"].tolist()

        log.info(f"Processing {len(complex_variants)} variants with complex mutations")

        processed = 0
        for variant_id in complex_variants:
            idx = self._index_of(variant_id)
            if self._has_sequence(idx):
                continue

            result = self.generator.generate_sequence(variant_id)
            if not result:
                continue

            self.df.at[idx, "protein_sequence"] = result.sequence

            # Check length changes against the direct parent, when it has a sequence
            parent_id = self.df.at[idx, "parent_enzyme_id"]
            parent_rows = self.df[self.df["enzyme_id"] == parent_id]

            if not parent_rows.empty and parent_rows.iloc[0]["protein_sequence"]:
                parent_seq = parent_rows.iloc[0]["protein_sequence"]
                if len(result.sequence) != len(parent_seq):
                    self._add_flag(idx, "length_change")
                    log.info(
                        f"Length change for {variant_id}: "
                        f"{len(parent_seq)} -> {len(result.sequence)}"
                    )

            processed += 1

        log.info(f"Processed {processed} complex mutation variants")

    def process_remaining(self) -> None:
        """Process any remaining variants, earliest generation first."""
        # Update ground truths with newly generated sequences
        self.generator._update_ground_truths()

        remaining = self._missing_ids()
        if not remaining:
            return

        log.info(f"Processing {len(remaining)} remaining variants")

        def method_flag(result) -> str:
            # Record how the sequence was obtained, plus reduced confidence.
            flag = f"generated_{result.method}"
            if result.confidence < 1.0:
                flag += f"_conf{result.confidence:.1f}"
            return flag

        processed = self._fill_missing(
            self._order_by_generation(remaining), method_flag
        )

        log.info(f"Processed {processed} remaining variants")

    def backward_pass(self) -> None:
        """Work backward from terminal variants to fill remaining gaps."""
        missing = self._missing_ids()
        if not missing:
            return

        log.info(
            f"Backward pass: attempting to fill {len(missing)} remaining sequences"
        )

        # Find terminal variants (no children) with sequences; reported only.
        all_parents = set(self.df["parent_enzyme_id"].dropna())
        terminal_variants = [
            v for v in self.generator.ground_truth_ids
            if v not in all_parents
        ]

        log.info(f"Found {len(terminal_variants)} terminal variants with sequences")

        # Latest generation first, so gaps closest to the terminals fill first.
        processed = self._fill_missing(
            self._order_by_generation(missing, latest_first=True),
            lambda result: "backward_from_terminal",
        )

        log.info(f"Backward pass: filled {processed} sequences")

    def save_results(self) -> None:
        """Log final statistics and write the frame to ``output_csv``."""
        empty_final = int((self.df["protein_sequence"].str.strip() == "").sum())
        length_changes = int(self.df["flag"].str.contains("length_change", na=False).sum())
        complex_mutations = int(self.df["flag"].str.contains("complex_mutation", na=False).sum())

        log.info(
            f"Final results: {len(self.df)} rows, {empty_final} empty, "
            f"{complex_mutations} complex mutations, {length_changes} length changes"
        )

        # Save to CSV
        self.df.to_csv(self.output_csv, index=False)
        log.info(f"Saved results to {self.output_csv}")

    def run(self) -> None:
        """Run the complete processing pipeline."""
        log.info("Starting sequence generation pipeline")

        self.load_data()
        self.flag_complex_mutations()

        # Process in order: cheapest and most reliable methods first.
        self.process_simple_mutations()
        self.process_complex_mutations()
        self.process_remaining()
        self.backward_pass()

        self.save_results()

        log.info("Pipeline completed")
|
849
|
+
|
850
|
+
|
851
|
+
# === 8. CLI INTERFACE === ----------------------------------------------------
|
852
|
+
|
853
|
+
def setup_logging(verbose: int = 0) -> None:
    """Configure root logging from a verbosity count.

    ``verbose`` of 2 or more selects DEBUG, exactly 1 selects INFO, and
    anything else selects WARNING.
    """
    level = (
        logging.DEBUG if verbose >= 2
        else logging.INFO if verbose == 1
        else logging.WARNING
    )

    log_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(level=level, format=log_format, datefmt=date_format)
|
867
|
+
|
868
|
+
|
869
|
+
def main(argv: Optional[List[str]] = None) -> None:
    """Parse the command line and run the sequence generation pipeline.

    ``argv`` is the argument list to parse; ``None`` lets argparse read
    ``sys.argv``.
    """
    cli = argparse.ArgumentParser(
        prog="cleanup_sequence_structured",
        description="Generate protein sequences from mutation data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Both positionals are file paths; only the name and help text differ.
    positionals = (
        ("input_csv", "Input CSV file with enzyme lineage data"),
        ("output_csv", "Output CSV file with generated sequences"),
    )
    for dest, description in positionals:
        cli.add_argument(dest, type=Path, help=description)

    cli.add_argument(
        "-v", "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (use -vv for debug output)",
    )

    options = cli.parse_args(argv)

    setup_logging(options.verbose)

    # Input format detection happens inside the processor.
    SequenceProcessor(options.input_csv, options.output_csv).run()
|
902
|
+
|
903
|
+
|
904
|
+
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|