debase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,905 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ cleanup_sequence_structured.py - Enhanced protein sequence generator from mutations
4
+
5
+ This module takes the output from enzyme_lineage_extractor and generates complete
6
+ protein sequences by applying mutations throughout the lineage tree.
7
+
8
+ Usage:
9
+ python cleanup_sequence_structured.py input.csv output.csv
10
+ """
11
+
12
+ import argparse
13
+ import logging
14
+ import re
15
+ import sys
16
+ from dataclasses import dataclass, field
17
+ from pathlib import Path
18
+ from typing import Dict, List, Optional, Set, Tuple, Union
19
+
20
+ import pandas as pd
21
+
22
+
23
+ # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
24
+
25
# One-letter residue codes accepted in a protein sequence; "*" marks a stop codon.
VALID_AMINO_ACIDS = {*"ACDEFGHIKLMNPQRSTVWY*"}

# Logger shared by every class and function in this module.
log = logging.getLogger(__name__)
29
+
30
+
31
+ # === 2. DATA MODELS === ------------------------------------------------------
32
+
33
@dataclass
class Mutation:
    """One residue substitution at a numbered position.

    ``str()`` renders it as original residue, position, replacement residue
    with no separators (for example ``A12G``).
    """

    original: str
    position: int
    replacement: str

    def __str__(self) -> str:
        return "{}{}{}".format(self.original, self.position, self.replacement)
42
+
43
+
44
@dataclass
class ComplexMutation:
    """A region replacement with optional appended residues and stop marker.

    ``str()`` renders ``<replacement_seq>(<start_pos>-<end_pos>)`` followed by
    ``extension_seq`` when it is non-empty and ``[STOP]`` when ``has_stop``
    is set.
    """

    replacement_seq: str
    start_pos: int
    end_pos: int
    extension_seq: str = ""
    has_stop: bool = False

    def __str__(self) -> str:
        pieces = [
            "{}({}-{})".format(self.replacement_seq, self.start_pos, self.end_pos)
        ]
        if self.extension_seq:
            pieces.append(self.extension_seq)
        if self.has_stop:
            pieces.append("[STOP]")
        return "".join(pieces)
60
+
61
+
62
@dataclass
class Variant:
    """A lineage entry: identifiers, mutation string and optional sequence."""

    enzyme_id: str
    parent_enzyme_id: Optional[str]
    mutations: str
    protein_sequence: Optional[str] = None
    generation: Optional[int] = None
    flag: str = ""

    @property
    def has_sequence(self) -> bool:
        """True when ``protein_sequence`` holds non-whitespace text."""
        sequence = self.protein_sequence
        if not sequence:
            return False
        return bool(sequence.strip())

    @property
    def has_complex_mutations(self) -> bool:
        """True when ``flag`` contains the text ``complex_mutation``."""
        marker = "complex_mutation"
        return marker in self.flag
79
+
80
+
81
@dataclass
class SequenceGenerationResult:
    """Outcome of one attempt to obtain a sequence for a variant."""

    # The resulting protein sequence.
    sequence: str
    # Tag naming how the sequence was obtained; the generator in this file
    # emits "from_parent", "from_child" and "existing".
    method: str
    # Identifier of the variant whose sequence served as the starting point.
    source_id: str
    # Reliability score; 1.0 unless the generator lowers it.
    confidence: float = 1.0
    # Free-text remark about how the result was produced.
    notes: str = ""
89
+
90
+
91
+ # === 3. MUTATION PARSING === -------------------------------------------------
92
+
93
class MutationParser:
    """Parse comma-separated mutation strings.

    Two token shapes are recognised:

    * point mutations such as ``A12G`` (one letter or ``*``, digits, one
      letter or ``*``; letters may be lower case), and
    * C-terminal region replacements such as ``LKM(10-12)AA[STOP]``.
    """

    POINT_MUTATION_PATTERN = re.compile(r"^([A-Za-z\*])([0-9]+)([A-Za-z\*])$")
    COMPLEX_C_TERMINAL_PATTERN = re.compile(r'([A-Z]+)\((\d+)-(\d+)\)([A-Z]*)\[STOP\]')
    COMPLEX_C_TERMINAL_NO_STOP = re.compile(r'([A-Z]+)\((\d+)-(\d+)\)([A-Z]+)')

    @staticmethod
    def _split_tokens(mutation_str: str) -> List[str]:
        """Return the stripped, non-empty comma-separated tokens.

        Anything that is not a non-empty string (``None``, NaN, ...) yields
        an empty list instead of raising.
        """
        if not isinstance(mutation_str, str):
            return []
        return [tok.strip() for tok in mutation_str.split(',') if tok.strip()]

    @classmethod
    def parse_mutations(cls, mutation_str: str) -> List["Mutation"]:
        """Parse the point mutations contained in ``mutation_str``.

        Args:
            mutation_str: Comma-separated mutation tokens.

        Returns:
            One ``Mutation`` per token matching ``POINT_MUTATION_PATTERN``,
            in input order, with residues upper-cased. Tokens of any other
            shape are skipped.
        """
        mutations = []
        for token in cls._split_tokens(mutation_str):
            match = cls.POINT_MUTATION_PATTERN.match(token)
            if match is None:
                continue
            orig, pos_str, new = match.groups()
            try:
                position = int(pos_str)
            except ValueError as e:
                # Only reachable for absurdly long digit runs; keep going.
                log.warning(f"Failed to parse mutation '{token}': {e}")
                continue
            mutations.append(Mutation(
                original=orig.upper(),
                position=position,
                replacement=new.upper()
            ))
        return mutations

    @classmethod
    def parse_complex_c_terminal(cls, mutation_str: str) -> Optional["ComplexMutation"]:
        """Find the first C-terminal replacement in ``mutation_str``.

        The ``[STOP]`` form is tried before the form without it.

        Returns:
            A ``ComplexMutation`` for the first match, or ``None`` when the
            string contains neither form (or is not a string).
        """
        if not isinstance(mutation_str, str):
            return None

        candidates = (
            (cls.COMPLEX_C_TERMINAL_PATTERN, True),
            (cls.COMPLEX_C_TERMINAL_NO_STOP, False),
        )
        for pattern, has_stop in candidates:
            match = pattern.search(mutation_str)
            if match:
                return ComplexMutation(
                    replacement_seq=match.group(1),
                    start_pos=int(match.group(2)),
                    end_pos=int(match.group(3)),
                    extension_seq=match.group(4),
                    has_stop=has_stop
                )
        return None

    @classmethod
    def detect_complex_mutations(cls, mutation_str: str) -> List[str]:
        """Return the tokens that are not plain point mutations.

        A token counts as standard exactly when ``parse_mutations`` would
        accept it, i.e. when it matches ``POINT_MUTATION_PATTERN``. Matching
        the pattern directly (rather than comparing against the normalised
        ``str(Mutation)``) keeps lower-case or zero-padded point mutations
        such as ``a12g`` / ``A012G`` from being reported as complex.
        """
        return [
            token for token in cls._split_tokens(mutation_str)
            if cls.POINT_MUTATION_PATTERN.match(token) is None
        ]
163
+
164
+
165
+ # === 4. SEQUENCE MANIPULATION === --------------------------------------------
166
+
167
class SequenceManipulator:
    """Apply mutations to a sequence, or undo point mutations on one."""

    @staticmethod
    def validate_sequence(seq: str) -> bool:
        """Return True when every character (case-insensitive) is in VALID_AMINO_ACIDS."""
        return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())

    @staticmethod
    def _infer_offset(seq: str, mutations: List["Mutation"], attr: str) -> int:
        """Guess whether mutation positions are 0- or 1-based for ``seq``.

        For each scheme, counts the mutations whose ``attr`` residue
        (``"original"`` or ``"replacement"``) is found at the implied index.

        Returns:
            0 only when 0-based indexing matches strictly more mutations;
            otherwise 1. Ties (including no matches at all) fall back to the
            1-based default instead of silently shifting by one residue.
        """
        def matches(offset: int) -> int:
            return sum(
                1 for m in mutations
                if 0 <= m.position - offset < len(seq)
                and seq[m.position - offset].upper() == getattr(m, attr).upper()
            )

        return 0 if matches(0) > matches(1) else 1

    @staticmethod
    def _substitute(seq: List[str], position: int, offset: int,
                    expected: str, new: str) -> bool:
        """Replace ``expected`` with ``new`` at ``position`` in ``seq`` (in place).

        The index implied by ``offset`` is tried first, then the index implied
        by the other indexing scheme.

        Returns:
            True if a substitution was made, False if neither index holds
            ``expected``.
        """
        for candidate_offset in (offset, 1 - offset):
            idx = position - candidate_offset
            if 0 <= idx < len(seq) and seq[idx].upper() == expected.upper():
                seq[idx] = new
                return True
        return False

    @staticmethod
    def determine_indexing(parent_seq: str, mutations: List["Mutation"]) -> int:
        """Determine whether mutations use 0-based or 1-based indexing.

        Args:
            parent_seq: Sequence the mutations refer to.
            mutations: Parsed point mutations.

        Returns:
            The offset to subtract from ``Mutation.position``: 0 when 0-based
            positions match strictly more original residues than 1-based
            ones, else 1 (also for empty input and for ties).
        """
        if not mutations or not parent_seq:
            return 1  # Default to 1-based
        return SequenceManipulator._infer_offset(parent_seq, mutations, "original")

    @classmethod
    def apply_mutations(cls, parent_seq: str, mutation_str: str) -> str:
        """Apply the mutations in ``mutation_str`` to ``parent_seq``.

        Point mutations are applied first, then at most one C-terminal
        replacement (positions taken as 1-based). Mutations that do not fit
        the sequence are logged and skipped.

        Returns:
            The mutated sequence, or ``""`` when ``parent_seq`` is empty.
        """
        if not parent_seq:
            return ""

        seq = list(parent_seq)

        # Apply point mutations
        mutations = MutationParser.parse_mutations(mutation_str)
        if mutations:
            idx_offset = cls.determine_indexing(parent_seq, mutations)
            for mut in mutations:
                applied = cls._substitute(
                    seq, mut.position, idx_offset, mut.original, mut.replacement
                )
                if not applied:
                    log.warning(
                        f"Mutation {mut} does not match parent sequence at "
                        f"position {mut.position} (tried both 0- and 1-based indexing)"
                    )

        # Apply complex C-terminal mutations
        complex_mut = MutationParser.parse_complex_c_terminal(mutation_str)
        if complex_mut:
            log.info(f"Applying complex C-terminal mutation: {complex_mut}")

            # Convert to 0-indexed
            start_idx = complex_mut.start_pos - 1
            end_idx = complex_mut.end_pos - 1

            if 0 <= start_idx <= end_idx < len(seq):
                # Replace the specified region
                seq[start_idx:end_idx + 1] = list(complex_mut.replacement_seq)

                # A stop drops everything after the replacement; the
                # extension (written before [STOP]) is then re-appended.
                if complex_mut.has_stop:
                    seq = seq[:start_idx + len(complex_mut.replacement_seq)]

                if complex_mut.extension_seq:
                    seq.extend(list(complex_mut.extension_seq))
            else:
                log.warning(
                    f"Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
                    f"{complex_mut.end_pos} for sequence of length {len(seq)}"
                )

        return "".join(seq)

    @classmethod
    def reverse_mutations(cls, child_seq: str, mutation_str: str) -> str:
        """Undo the point mutations in ``mutation_str`` on ``child_seq``.

        Only point mutations are reversed; C-terminal replacements are not.

        Returns:
            The reconstructed parent sequence; ``child_seq`` unchanged when
            no point mutation is present; ``""`` when ``child_seq`` is empty.
        """
        if not child_seq:
            return ""

        mutations = MutationParser.parse_mutations(mutation_str)
        if not mutations:
            return child_seq

        seq = list(child_seq)

        # The child carries the *replacement* residues, so indexing is
        # inferred from those.
        idx_offset = cls._infer_offset(child_seq, mutations, "replacement")

        # Reverse mutations (change replacement -> original)
        for mut in mutations:
            reverted = cls._substitute(
                seq, mut.position, idx_offset, mut.replacement, mut.original
            )
            if not reverted:
                log.warning(
                    f"Cannot reverse mutation {mut}: replacement amino acid "
                    f"not found at expected position"
                )

        return "".join(seq)
294
+
295
+
296
+ # === 5. LINEAGE NAVIGATION === -----------------------------------------------
297
+
298
class LineageNavigator:
    """Parent/child lookups over a lineage table.

    The table must have an ``enzyme_id`` column; parent links are read from
    ``parent_enzyme_id`` when that column exists. All traversals tolerate
    cyclic or self-referencing parent links.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self._build_relationships()

    def _build_relationships(self) -> None:
        """Build ``parent_to_children`` and ``child_to_parent`` from ``self.df``.

        Rows whose parent is empty, ``None`` or NaN are treated as roots.
        If an ``enzyme_id`` occurs more than once, the last row's parent wins
        in ``child_to_parent``.
        """
        self.parent_to_children: Dict[str, List[str]] = {}
        self.child_to_parent: Dict[str, str] = {}

        if "parent_enzyme_id" not in self.df.columns:
            return

        for child_id, parent_id in zip(self.df["enzyme_id"], self.df["parent_enzyme_id"]):
            if not parent_id or pd.isna(parent_id):
                continue
            self.child_to_parent[child_id] = parent_id
            self.parent_to_children.setdefault(parent_id, []).append(child_id)

    def get_ancestors(self, variant_id: str) -> List[str]:
        """Get all ancestors of a variant in order (immediate parent first).

        The walk stops at a root or as soon as a link would revisit a variant
        already seen, so cyclic data cannot loop forever.
        """
        ancestors = []
        seen = {variant_id}
        current_id = self.child_to_parent.get(variant_id)

        while current_id and current_id not in seen:
            seen.add(current_id)
            ancestors.append(current_id)
            current_id = self.child_to_parent.get(current_id)

        return ancestors

    def get_descendants(self, variant_id: str) -> List[str]:
        """Get all descendants of a variant (breadth-first order)."""
        from collections import deque

        descendants = []
        visited = {variant_id}
        queue = deque([variant_id])

        while queue:
            current_id = queue.popleft()
            for child in self.parent_to_children.get(current_id, []):
                if child not in visited:
                    visited.add(child)
                    descendants.append(child)
                    queue.append(child)

        return descendants

    def _path_down(self, from_id: str, to_id: str) -> Optional[List[str]]:
        """Breadth-first search for a parent-to-child path ``from_id`` -> ``to_id``.

        Returns the path including both endpoints, or ``None`` when ``to_id``
        is not a descendant of ``from_id``.
        """
        from collections import deque

        came_from = {from_id: None}  # node -> node it was reached from
        queue = deque([from_id])

        while queue:
            current = queue.popleft()
            for child in self.parent_to_children.get(current, []):
                if child in came_from:
                    continue
                came_from[child] = current
                if child == to_id:
                    path = [child]
                    while came_from[path[-1]] is not None:
                        path.append(came_from[path[-1]])
                    path.reverse()
                    return path
                queue.append(child)

        return None

    def find_path(self, from_id: str, to_id: str) -> Optional[List[str]]:
        """Find path between two variants if one exists.

        Returns:
            ``[from_id, ..., to_id]`` going down the tree when ``to_id`` is a
            descendant of ``from_id``, or going up when it is an ancestor;
            ``None`` when the two are not on one line of descent.
        """
        forward = self._path_down(from_id, to_id)
        if forward is not None:
            return forward

        ancestors = self.get_ancestors(from_id)
        if to_id in ancestors:
            return [from_id] + ancestors[:ancestors.index(to_id) + 1]

        return None
386
+
387
+
388
+ # === 6. SEQUENCE GENERATOR === -----------------------------------------------
389
+
390
class SequenceGenerator:
    """Derive missing protein sequences from relatives with known sequences.

    A "ground truth" is any row of ``df`` whose ``protein_sequence`` is
    non-blank. Sequences are derived either by applying mutations downward
    from an ancestor or by reversing a direct child's mutations.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.navigator = LineageNavigator(df)
        self.manipulator = SequenceManipulator()
        self._update_ground_truths()

    def _update_ground_truths(self) -> None:
        """Recompute ``ground_truth_ids`` from the current contents of ``df``."""
        sequences = self.df["protein_sequence"]
        has_sequence = sequences.notna() & (sequences.str.strip() != "")
        self.ground_truth_ids = set(self.df.loc[has_sequence, "enzyme_id"])

    def _get_row(self, enzyme_id: str) -> Optional[pd.Series]:
        """Return the first row for ``enzyme_id``, or ``None`` if there is none."""
        rows = self.df[self.df["enzyme_id"] == enzyme_id]
        return None if rows.empty else rows.iloc[0]

    @staticmethod
    def _text(row: Optional[pd.Series], column: str) -> str:
        """Return ``row[column]`` when it is a string, else ``""``."""
        if row is None:
            return ""
        value = row.get(column, "")
        return value if isinstance(value, str) else ""

    def find_best_ground_truth(
        self,
        variant_id: str,
        has_complex_mutation: bool
    ) -> Tuple[str, str]:
        """
        Find the best ground truth sequence to use for generation.

        Preference order: direct parent (unless the variant has complex
        mutations), a direct child, then for complex variants any descendant
        followed by the direct parent, then the nearest ancestor, then the
        nearest descendant.

        Returns:
            (ground_truth_id, direction) where direction is 'up' or 'down';
            ("", "") when the variant is unknown or nothing suitable exists.
        """
        variant_row = self._get_row(variant_id)
        if variant_row is None:
            return "", ""

        known = self.ground_truth_ids
        parent_id = variant_row.get("parent_enzyme_id")
        parent_known = parent_id in known

        # Check direct parent
        if parent_known and not has_complex_mutation:
            return parent_id, "up"

        # Check direct children (reached only for complex variants or when
        # the parent has no sequence, so any hit is taken).
        for child in self.navigator.parent_to_children.get(variant_id, []):
            if child in known:
                return child, "down"

        desc_gts = [d for d in self.navigator.get_descendants(variant_id) if d in known]
        anc_gts = [a for a in self.navigator.get_ancestors(variant_id) if a in known]

        # Prioritize based on mutation type
        if has_complex_mutation and desc_gts:
            return desc_gts[0], "down"
        if has_complex_mutation and parent_known:
            return parent_id, "up"

        # Return closest ground truth
        if anc_gts:
            return anc_gts[0], "up"
        if desc_gts:
            return desc_gts[0], "down"

        return "", ""

    def generate_from_parent(
        self,
        variant_id: str,
        parent_id: str
    ) -> Optional["SequenceGenerationResult"]:
        """Generate sequence by applying the variant's mutations to ``parent_id``.

        Returns ``None`` when either row is missing, the parent has no
        sequence, or the variant lists no mutations.
        """
        parent_seq = self._text(self._get_row(parent_id), "protein_sequence")
        if not parent_seq:
            return None

        mutations = self._text(self._get_row(variant_id), "mutations")
        if not mutations:
            return None

        return SequenceGenerationResult(
            sequence=self.manipulator.apply_mutations(parent_seq, mutations),
            method="from_parent",
            source_id=parent_id,
            confidence=1.0
        )

    def _generate_from_ancestor(
        self,
        variant_id: str,
        ancestor_id: str
    ) -> Optional["SequenceGenerationResult"]:
        """Generate a sequence from a non-direct ancestor.

        Starting from the ancestor's sequence, the mutations of every variant
        on the line of descent are applied in order (ancestor side first,
        ending with ``variant_id``), so intermediate mutations are not lost.
        Intermediates with no listed mutations contribute nothing, which is
        why the result keeps a reduced confidence.
        """
        lineage = self.navigator.get_ancestors(variant_id)
        if ancestor_id not in lineage:
            return None

        sequence = self._text(self._get_row(ancestor_id), "protein_sequence")
        if not sequence:
            return None

        # lineage is nearest-first; reverse the part below the ancestor.
        between = lineage[:lineage.index(ancestor_id)]
        for node_id in list(reversed(between)) + [variant_id]:
            node_mutations = self._text(self._get_row(node_id), "mutations")
            if node_mutations:
                sequence = self.manipulator.apply_mutations(sequence, node_mutations)

        return SequenceGenerationResult(
            sequence=sequence,
            method="from_parent",
            source_id=ancestor_id,
            confidence=0.7,
            notes="Generated from non-direct ancestor"
        )

    def generate_from_child(
        self,
        variant_id: str,
        child_id: str
    ) -> Optional["SequenceGenerationResult"]:
        """Generate sequence by reversing ``child_id``'s mutations on its sequence.

        Returns ``None`` when the child row is missing or lacks a sequence or
        mutations.
        """
        child_row = self._get_row(child_id)
        child_seq = self._text(child_row, "protein_sequence")
        child_mutations = self._text(child_row, "mutations")

        if not child_seq or not child_mutations:
            return None

        return SequenceGenerationResult(
            sequence=self.manipulator.reverse_mutations(child_seq, child_mutations),
            method="from_child",
            source_id=child_id,
            confidence=0.9
        )

    def generate_sequence(self, variant_id: str) -> Optional["SequenceGenerationResult"]:
        """Generate sequence for a variant using the best available method.

        Returns:
            A result with method ``"existing"`` when the variant already has
            a sequence, otherwise a generated result, or ``None`` when no
            sequence can be derived.
        """
        variant_row = self._get_row(variant_id)
        if variant_row is None:
            log.warning(f"No suitable ground truth found for {variant_id}")
            return None

        # Check if already has sequence
        if self._text(variant_row, "protein_sequence").strip():
            return SequenceGenerationResult(
                sequence=variant_row["protein_sequence"],
                method="existing",
                source_id=variant_id,
                confidence=1.0
            )

        parent_id = variant_row.get("parent_enzyme_id")
        mutations = self._text(variant_row, "mutations")

        # Check for complex mutations
        has_complex = bool(
            MutationParser.detect_complex_mutations(mutations)
        ) if mutations else False

        gt_id, direction = self.find_best_ground_truth(variant_id, has_complex)
        if not gt_id:
            log.warning(f"No suitable ground truth found for {variant_id}")
            return None

        log.info(f"Using {gt_id} as ground truth ({direction} direction) for {variant_id}")

        if direction == "up":
            if not mutations:
                # Without mutations there is nothing to apply; never fall
                # through to the child logic, which would treat the ancestor
                # as a child and reverse *its* mutations.
                log.warning(
                    f"Cannot derive {variant_id} from ancestor {gt_id}: "
                    f"no mutations listed"
                )
                return None
            if gt_id == parent_id:
                return self.generate_from_parent(variant_id, parent_id)
            return self._generate_from_ancestor(variant_id, gt_id)

        # direction == "down"
        direct_children = self.navigator.parent_to_children.get(variant_id, [])
        if gt_id in direct_children:
            return self.generate_from_child(variant_id, gt_id)

        # Try to find path through direct child
        path = self.navigator.find_path(variant_id, gt_id)
        if path and len(path) > 1 and path[1] in direct_children:
            direct_child = path[1]
            result = self.generate_from_child(variant_id, direct_child)
            if result:
                result.confidence = 0.8
                result.notes = f"Generated via path through {direct_child}"
            return result

        return None
566
+
567
+
568
+ # === 7. MAIN PROCESSOR === ---------------------------------------------------
569
+
570
class SequenceProcessor:
    """Load a lineage CSV, fill in missing sequences, and write the result.

    ``run()`` executes the passes in order: flag complex mutations, simple
    variants, complex variants, remaining variants, backward pass, save.
    """

    def __init__(self, input_csv: Path, output_csv: Path):
        self.input_csv = input_csv
        self.output_csv = output_csv
        self.df = None
        self.generator = None

    # ---- helpers ---------------------------------------------------------

    def _missing_sequence_ids(self) -> List[str]:
        """Return the enzyme ids whose ``protein_sequence`` is blank."""
        blank = self.df["protein_sequence"].str.strip() == ""
        return self.df.loc[blank, "enzyme_id"].tolist()

    def _row_index(self, variant_id: str):
        """Return the index label of the first row for ``variant_id``."""
        return self.df[self.df["enzyme_id"] == variant_id].index[0]

    def _append_flag(self, idx, flag: str) -> None:
        """Add ``flag`` to the space-separated ``flag`` cell at ``idx``.

        Existing flags are preserved, the flag is not duplicated, and no
        leading/trailing blanks are left behind.
        """
        current = self.df.at[idx, "flag"]
        if not isinstance(current, str):
            current = ""
        if flag in current.split():
            return
        self.df.at[idx, "flag"] = f"{current} {flag}".strip()

    def _sort_by_generation(self, variant_ids: List[str], reverse: bool = False) -> List[str]:
        """Return ``variant_ids`` ordered by their numeric ``generation``.

        Generations are compared as numbers even when the column was read as
        text (so "10" sorts after "2"). Ids with a missing or non-numeric
        generation are placed last in either direction. Without a
        ``generation`` column the input order is returned unchanged.
        """
        if "generation" not in self.df.columns:
            return list(variant_ids)

        numeric = pd.to_numeric(self.df["generation"], errors="coerce")
        first_generation = {}
        for enzyme_id, generation in zip(self.df["enzyme_id"], numeric):
            first_generation.setdefault(enzyme_id, generation)

        fallback = float("-inf") if reverse else float("inf")

        def sort_key(variant_id: str) -> float:
            generation = first_generation.get(variant_id)
            if generation is None or pd.isna(generation):
                return fallback
            return float(generation)

        return sorted(variant_ids, key=sort_key, reverse=reverse)

    # ---- pipeline steps --------------------------------------------------

    def load_data(self) -> None:
        """Load and prepare the input data.

        Reads the CSV with empty cells kept as empty strings, normalises the
        column names, guarantees a ``flag`` column and builds the generator.

        Raises:
            ValueError: if required columns are missing after normalisation.
        """
        self.df = pd.read_csv(self.input_csv, keep_default_na=False)

        # Detect and handle column format automatically
        self._normalize_columns()

        log.info(
            f"Loaded {len(self.df)} rows, "
            f"{sum(self.df['protein_sequence'].str.strip() == '')} empty sequences"
        )

        # Ensure required columns exist
        if "flag" not in self.df.columns:
            self.df["flag"] = ""

        # The generator shares this DataFrame object, so in-place cell
        # updates made below are visible to it.
        self.generator = SequenceGenerator(self.df)

    def _normalize_columns(self) -> None:
        """Automatically detect and normalize column names from different formats.

        Raises:
            ValueError: if ``enzyme_id``, ``parent_enzyme_id``, ``mutations``
                or ``protein_sequence`` is absent afterwards.
        """
        # Check if this is enzyme_lineage_extractor format
        if "variant_id" in self.df.columns:
            log.info("Detected enzyme_lineage_extractor format, converting columns...")

            column_mapping = {
                "variant_id": "enzyme_id",
                "parent_id": "parent_enzyme_id",
                "aa_seq": "protein_sequence"
            }
            self.df = self.df.rename(columns=column_mapping)

            # Convert mutation format from semicolon to comma-separated
            # (literal replacement, independent of the pandas regex default).
            if "mutations" in self.df.columns:
                self.df["mutations"] = self.df["mutations"].str.replace(";", ",", regex=False)

            log.info("Column conversion complete")

        # Verify required columns exist
        required_columns = ["enzyme_id", "parent_enzyme_id", "mutations", "protein_sequence"]
        missing_columns = [col for col in required_columns if col not in self.df.columns]

        if missing_columns:
            raise ValueError(
                f"Missing required columns: {missing_columns}. "
                f"Found columns: {list(self.df.columns)}"
            )

    def flag_complex_mutations(self) -> None:
        """Flag variants with complex mutations (adds ``complex_mutation``)."""
        complex_count = 0

        for idx, row in self.df.iterrows():
            mutations = row.get("mutations", "")
            if not mutations:
                continue
            complex_muts = MutationParser.detect_complex_mutations(mutations)
            if complex_muts:
                self._append_flag(idx, "complex_mutation")
                complex_count += 1
                log.info(
                    f"Variant {row['enzyme_id']} has complex mutations: {complex_muts}"
                )

        log.info(f"Flagged {complex_count} variants with complex mutations")

    def process_simple_mutations(self) -> None:
        """Process variants with simple point mutations.

        Only results produced by applying mutations downward
        (``method == "from_parent"``) are accepted in this pass.
        """
        processed = 0

        for idx, row in self.df.iterrows():
            # Skip if already has sequence or has complex mutations
            if (row.get("protein_sequence", "").strip() or
                    "complex_mutation" in str(row.get("flag", ""))):
                continue

            variant_id = row["enzyme_id"]
            result = self.generator.generate_sequence(variant_id)

            if not (result and result.method == "from_parent"):
                continue

            self.df.at[idx, "protein_sequence"] = result.sequence

            # Check for unexpected length changes
            source_seq = self.df[
                self.df["enzyme_id"] == result.source_id
            ]["protein_sequence"].iloc[0]

            if len(result.sequence) != len(source_seq):
                self._append_flag(idx, "unexpected_length_change")
                log.warning(
                    f"Unexpected length change for {variant_id} "
                    f"with standard mutations"
                )

            processed += 1

            # Make the new sequence available as a ground truth so later
            # rows can build on their direct parent instead of a more
            # distant ancestor.
            self.generator._update_ground_truths()

        log.info(f"Processed {processed} variants with simple mutations")

    def process_complex_mutations(self) -> None:
        """Process variants with complex mutations."""
        complex_variants = self.df[
            self.df["flag"].str.contains("complex_mutation", na=False)
        ]["enzyme_id"].tolist()

        log.info(f"Processing {len(complex_variants)} variants with complex mutations")

        processed = 0
        for variant_id in complex_variants:
            idx = self._row_index(variant_id)

            if self.df.at[idx, "protein_sequence"]:
                continue

            result = self.generator.generate_sequence(variant_id)
            if not result:
                continue

            self.df.at[idx, "protein_sequence"] = result.sequence

            # Check length changes against the direct parent, if it has a sequence
            parent_id = self.df.at[idx, "parent_enzyme_id"]
            parent_row = self.df[self.df["enzyme_id"] == parent_id]

            if not parent_row.empty and parent_row.iloc[0]["protein_sequence"]:
                parent_seq = parent_row.iloc[0]["protein_sequence"]
                if len(result.sequence) != len(parent_seq):
                    self._append_flag(idx, "complex_mutation")
                    self._append_flag(idx, "length_change")
                    log.info(
                        f"Length change for {variant_id}: "
                        f"{len(parent_seq)} -> {len(result.sequence)}"
                    )

            processed += 1
            self.generator._update_ground_truths()

        log.info(f"Processed {processed} complex mutation variants")

    def process_remaining(self) -> None:
        """Process any remaining variants, earliest generation first."""
        # Update ground truths with newly generated sequences
        self.generator._update_ground_truths()

        remaining = self._missing_sequence_ids()
        if not remaining:
            return

        log.info(f"Processing {len(remaining)} remaining variants")

        processed = 0
        for variant_id in self._sort_by_generation(remaining):
            idx = self._row_index(variant_id)

            if self.df.at[idx, "protein_sequence"]:
                continue

            result = self.generator.generate_sequence(variant_id)
            if not result:
                continue

            self.df.at[idx, "protein_sequence"] = result.sequence

            # Add generation method to flag
            method_flag = f"generated_{result.method}"
            if result.confidence < 1.0:
                method_flag += f"_conf{result.confidence:.1f}"
            self._append_flag(idx, method_flag)

            processed += 1

            # Update ground truths for next iterations
            self.generator._update_ground_truths()

        log.info(f"Processed {processed} remaining variants")

    def backward_pass(self) -> None:
        """Retry variants still lacking a sequence, latest generation first."""
        missing = self._missing_sequence_ids()
        if not missing:
            return

        log.info(
            f"Backward pass: attempting to fill {len(missing)} remaining sequences"
        )

        # Terminal variants (never listed as a parent) with sequences are
        # only reported; generation itself goes through the generator.
        all_parents = set(self.df["parent_enzyme_id"].dropna())
        terminal_variants = [
            v for v in self.generator.ground_truth_ids
            if v not in all_parents
        ]
        log.info(f"Found {len(terminal_variants)} terminal variants with sequences")

        processed = 0
        for variant_id in self._sort_by_generation(missing, reverse=True):
            idx = self._row_index(variant_id)

            if self.df.at[idx, "protein_sequence"]:
                continue

            result = self.generator.generate_sequence(variant_id)
            if not result:
                continue

            self.df.at[idx, "protein_sequence"] = result.sequence
            self._append_flag(idx, "backward_from_terminal")
            processed += 1

            # Update ground truths
            self.generator._update_ground_truths()

        log.info(f"Backward pass: filled {processed} sequences")

    def save_results(self) -> None:
        """Log summary statistics and write the table to ``output_csv``."""
        empty_final = sum(self.df["protein_sequence"].str.strip() == "")
        length_changes = sum(self.df["flag"].str.contains("length_change", na=False))
        complex_mutations = sum(self.df["flag"].str.contains("complex_mutation", na=False))

        log.info(
            f"Final results: {len(self.df)} rows, {empty_final} empty, "
            f"{complex_mutations} complex mutations, {length_changes} length changes"
        )

        self.df.to_csv(self.output_csv, index=False)
        log.info(f"Saved results to {self.output_csv}")

    def run(self) -> None:
        """Run the complete processing pipeline."""
        log.info("Starting sequence generation pipeline")

        self.load_data()
        self.flag_complex_mutations()

        # Process in order
        self.process_simple_mutations()
        self.process_complex_mutations()
        self.process_remaining()
        self.backward_pass()

        self.save_results()

        log.info("Pipeline completed")
849
+
850
+
851
+ # === 8. CLI INTERFACE === ----------------------------------------------------
852
+
853
def setup_logging(verbose: int = 0) -> None:
    """Initialise root logging for the given verbosity.

    Args:
        verbose: 1 selects INFO, 2 or more selects DEBUG, anything else
            selects WARNING.
    """
    chosen_level = logging.WARNING
    if verbose == 1:
        chosen_level = logging.INFO
    if verbose >= 2:
        chosen_level = logging.DEBUG

    log_config = {
        "level": chosen_level,
        "format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        "datefmt": "%Y-%m-%d %H:%M:%S",
    }
    logging.basicConfig(**log_config)
867
+
868
+
869
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Parses ``input_csv``, ``output_csv`` and the repeatable ``-v/--verbose``
    switch from ``argv`` (argparse falls back to the process arguments when
    it is ``None``), configures logging and runs the processing pipeline.
    """
    cli = argparse.ArgumentParser(
        prog="cleanup_sequence_structured",
        description="Generate protein sequences from mutation data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        "input_csv", type=Path, help="Input CSV file with enzyme lineage data"
    )
    cli.add_argument(
        "output_csv", type=Path, help="Output CSV file with generated sequences"
    )
    cli.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (use -vv for debug output)",
    )
    options = cli.parse_args(argv)

    # Logging must be configured before the pipeline starts emitting records.
    setup_logging(options.verbose)

    # Input format detection happens inside the processor.
    SequenceProcessor(options.input_csv, options.output_csv).run()


if __name__ == "__main__":
    main()