factorforge-cds 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. factorforge/__init__.py +19 -0
  2. factorforge/__main__.py +8 -0
  3. factorforge/cli/__init__.py +5 -0
  4. factorforge/cli/legacy_cli.py +157 -0
  5. factorforge/cli/main.py +305 -0
  6. factorforge/core/interfaces/__init__.py +7 -0
  7. factorforge/core/interfaces/exporter.py +13 -0
  8. factorforge/core/interfaces/optimizer.py +85 -0
  9. factorforge/core/interfaces/validator.py +9 -0
  10. factorforge/database.py +150 -0
  11. factorforge/engines/__init__.py +60 -0
  12. factorforge/engines/ml/__init__.py +0 -0
  13. factorforge/engines/ml/plant_optimizer.py +325 -0
  14. factorforge/engines/registry.py +141 -0
  15. factorforge/engines/v1_archived/__init__.py +15 -0
  16. factorforge/engines/v2/__init__.py +13 -0
  17. factorforge/engines/v2/codon_table_builder.py +107 -0
  18. factorforge/engines/v2/construct_builder.py +403 -0
  19. factorforge/engines/v2/exporter.py +455 -0
  20. factorforge/engines/v2/optimizer.py +190 -0
  21. factorforge/engines/v2/pipeline.py +275 -0
  22. factorforge/engines/v2/rules/__init__.py +3 -0
  23. factorforge/engines/v2/rules/domesticator.py +403 -0
  24. factorforge/engines/v2/rules/reverse_translator.py +765 -0
  25. factorforge/engines/v2/rules/rule_engine.py +867 -0
  26. factorforge/engines/v2/scoring.py +232 -0
  27. factorforge/engines/v2/utils.py +231 -0
  28. factorforge/engines/v2/validator.py +383 -0
  29. factorforge/engines/v3/__init__.py +12 -0
  30. factorforge/engines/v3/explain.py +119 -0
  31. factorforge/engines/v3/inference/__init__.py +6 -0
  32. factorforge/engines/v3/inference/constrained_decoder.py +80 -0
  33. factorforge/engines/v3/inference/v2_adapter.py +72 -0
  34. factorforge/engines/v3/metrics.py +145 -0
  35. factorforge/engines/v3/modeling_bart_decoder.py +127 -0
  36. factorforge/engines/v3/pipeline.py +192 -0
  37. factorforge/engines/v3/synonym_mask.py +61 -0
  38. factorforge/engines/v3/tokenizer.py +192 -0
  39. factorforge/ml/__init__.py +33 -0
  40. factorforge/ml/feasibility.py +199 -0
  41. factorforge/ml/metrics.py +295 -0
  42. factorforge/utils/__init__.py +31 -0
  43. factorforge/utils/construct_id.py +8 -0
  44. factorforge/utils/exceptions.py +32 -0
  45. factorforge/utils/sequence_validator.py +189 -0
  46. factorforge/utils/validation.py +104 -0
  47. factorforge_cds-3.0.0.dist-info/METADATA +475 -0
  48. factorforge_cds-3.0.0.dist-info/RECORD +52 -0
  49. factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
  50. factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
  51. factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
  52. factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,765 @@
1
+ """
2
+ Reverse Translator for FactorForge v2
3
+ Reverse-translate amino acid sequences to N. benthamiana-optimized codons (P0-2)
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from bisect import bisect_left
9
+ import json
10
+ import logging
11
+ import math
12
+ import random
13
+ import secrets
14
+ from enum import Enum
15
+ from pathlib import Path
16
+ from typing import Any, cast
17
+
18
+ from factorforge.engines.v2.scoring import calculate_composite_score
19
+ from factorforge.engines.v2.utils import (
20
+ build_aa_to_codons_map,
21
+ calculate_gc,
22
+ get_data_path,
23
+ load_golden_set,
24
+ )
25
+ from factorforge.utils.exceptions import EmptyCandidateError
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class OptimizationProfile(Enum):
    """Optimization profile selectable when reverse-translating a protein."""

    BALANCED = "balanced"
    HIGH_CAI = "high_cai"
    GC_TARGET = "gc_target"
    ASSEMBLY_FRIENDLY = "assembly_friendly"
    RAMP = "ramp"
    VIRAL_DELIVERY = "viral_delivery"  # Optimized for TRV viral delivery (Li et al. 2026)
39
+
40
+
41
+ class ReverseTranslator:
42
+ """
43
+ Reverse-translate amino acid sequences to DNA
44
+
45
+ Documents 4 of the supported optimization profiles (RAMP and VIRAL_DELIVERY are also accepted):
46
+ 1. Balanced: CAI priority, GC balance
47
+ 2. High-CAI: use only preferred codons
48
+ 3. GC-Target: steer GC% toward a target value (default 42.5%)
49
+ 4. Assembly-Friendly: avoid BsaI/BpiI/BsmBI recognition sites
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ codon_table_path: str | Path | None = None,
55
+ golden_set_path: str | Path | None = None,
56
+ ) -> None:
57
+ """
58
+ Args:
59
+ codon_table_path: Path to codon table JSON file.
60
+ golden_set_path: Path to golden set JSON for CAI reference weights.
61
+ If None, attempts to load default golden set.
62
+ """
63
+ if codon_table_path is None:
64
+ # Use centralized data path management
65
+ data_dir = get_data_path()
66
+ codon_table_path = data_dir / "nbenthamiana_codons.json"
67
+
68
+ self.codon_table: dict[str, Any] = self._load_codon_table(codon_table_path)
69
+ self.aa_to_codons: dict[str, list[tuple[str, float]]] = self._build_aa_to_codons_map()
70
+
71
+ # Load golden set for CAI reference weights
72
+ if golden_set_path is not None:
73
+ self.golden_set_table: dict[str, Any] = self._load_codon_table(golden_set_path)
74
+ else:
75
+ try:
76
+ self.golden_set_table = load_golden_set()
77
+ except (FileNotFoundError, json.JSONDecodeError):
78
+ self.golden_set_table = self.codon_table
79
+
80
+ # Pre-compute relative adaptiveness weights from golden set (Sharp & Li 1987)
81
+ self.golden_ref_weights: dict[str, float] = self._build_ref_weights(
82
+ self.golden_set_table
83
+ )
84
+
85
+ # Pre-compute max frequency per amino acid for CAI fallback path
86
+ # Avoids repeated max() inside calculate_cai() hot loop
87
+ self._aa_max_freq: dict[str, float] = {
88
+ aa: max(f for _, f in codons)
89
+ for aa, codons in self.aa_to_codons.items()
90
+ }
91
+ self._aa_primary_codon: dict[str, str] = {}
92
+ self._aa_weighted_codons: dict[str, tuple[str, ...]] = {}
93
+ self._aa_weighted_cumprob: dict[str, tuple[float, ...]] = {}
94
+ for aa, codons in self.aa_to_codons.items():
95
+ if not codons:
96
+ continue
97
+ self._aa_primary_codon[aa] = codons[0][0]
98
+
99
+ codon_names = tuple(c for c, _ in codons)
100
+ raw_weights = [float(w) for _, w in codons]
101
+ total = sum(raw_weights)
102
+ if total <= 0.0:
103
+ # Defensive fallback: uniform sampling if malformed frequencies are loaded.
104
+ n = len(codon_names)
105
+ cumprob = tuple((i + 1) / n for i in range(n))
106
+ else:
107
+ running = 0.0
108
+ cumprob_list: list[float] = []
109
+ for w in raw_weights:
110
+ running += w / total
111
+ cumprob_list.append(running)
112
+ # Guard against tiny floating drift.
113
+ cumprob_list[-1] = 1.0
114
+ cumprob = tuple(cumprob_list)
115
+ self._aa_weighted_codons[aa] = codon_names
116
+ self._aa_weighted_cumprob[aa] = cumprob
117
+
118
+ # Restriction sites (for Assembly-Friendly profile)
119
+ # Each enzyme maps to a list of recognition sequences (forward + reverse complement)
120
+ self.restriction_sites: dict[str, list[str]] = {
121
+ "BsaI": ["GGTCTC", "GAGACC"],
122
+ "BpiI": ["GAAGAC", "GTCTTC"],
123
+ "BsmBI": ["CGTCTC", "GAGACG"],
124
+ }
125
+
126
+ def _load_codon_table(self, path: str | Path) -> dict[str, Any]:
127
+ """Load codon table"""
128
+ with open(path, "r", encoding="utf-8") as f:
129
+ return cast(dict[str, Any], json.load(f))
130
+
131
def _build_aa_to_codons_map(self) -> dict[str, list[tuple[str, float]]]:
    """
    Group the working codon table by amino acid, most frequent codon first.

    Codons returned by build_aa_to_codons_map() that have no (or an empty)
    entry in the table's "codons" section are skipped, and amino acids left
    without any codon are omitted from the result.

    Returns:
        {"A": [("GCC", 0.40), ("GCT", 0.26), ...], ...}
    """
    grouped: dict[str, list[tuple[str, float]]] = {}

    for residue, names in build_aa_to_codons_map(self.codon_table).items():
        ranked: list[tuple[str, float]] = []
        for name in names:
            record = self.codon_table["codons"].get(name)
            if record:
                ranked.append((name, float(record["frequency"])))

        if not ranked:
            continue

        # Descending frequency; the stable sort keeps ties in their incoming order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        grouped[residue] = ranked

    return grouped
155
+
156
+ @staticmethod
157
+ def _build_ref_weights(ref_table: dict[str, Any]) -> dict[str, float]:
158
+ """Build relative adaptiveness weights from a reference codon table.
159
+
160
+ Groups codons by amino acid and computes w_i = f_i / f_max per amino acid,
161
+ following Sharp & Li (1987).
162
+
163
+ Args:
164
+ ref_table: Codon table dict with "codons" section.
165
+
166
+ Returns:
167
+ Mapping of codon → relative adaptiveness weight (0-1).
168
+ """
169
+ codons_section = ref_table.get("codons", {})
170
+
171
+ # Group frequencies by amino acid
172
+ aa_codons: dict[str, list[tuple[str, float]]] = {}
173
+ for codon, info in codons_section.items():
174
+ aa = info["aa"]
175
+ freq = info.get("frequency", 0.0)
176
+ aa_codons.setdefault(aa, []).append((codon, freq))
177
+
178
+ # Compute relative adaptiveness
179
+ weights: dict[str, float] = {}
180
+ for aa, codon_freqs in aa_codons.items():
181
+ if aa == "*": # Skip stop codons
182
+ continue
183
+ max_freq = max(f for _, f in codon_freqs)
184
+ for codon, freq in codon_freqs:
185
+ weights[codon] = freq / max_freq if max_freq > 0 else 0.0
186
+
187
+ return weights
188
+
189
def calculate_cai(self, dna_sequence: str) -> float:
    """
    Calculate the Codon Adaptation Index (CAI) of a coding sequence.

    Each codon contributes the pre-computed golden-set relative adaptiveness
    weight (Sharp & Li 1987). When that weight is missing or not positive,
    the codon's frequency in the working table divided by the cached maximum
    frequency of its amino acid is used instead. Stop codons and codons
    without a positive weight from either source are ignored.

    Args:
        dna_sequence: DNA sequence (length must be divisible by 3);
            upper/lower case is accepted.

    Returns:
        Geometric mean of the contributing weights, rounded to 3 decimals.
        0.0 when the length is not a multiple of 3 or no codon contributes.
    """
    if len(dna_sequence) % 3:
        return 0.0

    golden = self.golden_ref_weights
    working = self.codon_table.get("codons", {})

    # Accumulate log-weights in a single pass; no intermediate list is built.
    log_total = 0.0
    scored = 0

    for start in range(0, len(dna_sequence), 3):
        codon = dna_sequence[start : start + 3].upper()

        weight = golden.get(codon)
        if weight is None or not weight > 0:
            # Fallback: working table, normalised by the cached per-residue maximum.
            if codon not in working:
                continue
            entry = working[codon]
            residue = entry["aa"]
            if residue == "*":
                continue
            ceiling = self._aa_max_freq.get(residue, 0.0)
            if not ceiling > 0:
                continue
            weight = entry["frequency"] / ceiling
            if not weight > 0:
                continue

        log_total += math.log(weight)
        scored += 1

    if scored == 0:
        return 0.0

    # exp(mean(log w)) is the geometric mean of the weights.
    return round(math.exp(log_total / scored), 3)
271
+
272
def calculate_gc_content(self, dna_sequence: str) -> float:
    """
    Return the GC content of a sequence, rounded to two decimals.

    The raw value comes from the shared calculate_gc() helper.

    Args:
        dna_sequence: DNA sequence

    Returns:
        GC% (0.0 ~ 100.0)

    Examples:
        >>> translator = ReverseTranslator()
        >>> translator.calculate_gc_content("ATGC")
        50.0
    """
    gc_percent = calculate_gc(dna_sequence)
    return round(gc_percent, 2)
291
+
292
def calculate_local_gc(self, dna_sequence: str, window_size: int = 50) -> list[float]:
    """
    Calculate local GC content over a sliding window (step of 1 bp).

    Args:
        dna_sequence: DNA sequence
        window_size: Window size (bp); must be at least 1.

    Returns:
        One GC% value per window start, i.e. ``len(dna_sequence) - window_size + 1``
        values. An empty list when the sequence is shorter than the window.

    Raises:
        ValueError: ``window_size`` is smaller than 1. A zero or negative
            window would otherwise produce empty or wrongly sized slices.

    Examples:
        >>> translator = ReverseTranslator()
        >>> translator.calculate_local_gc("ATGCATGC", window_size=4)
        [50.0, 50.0, 50.0, 50.0, 50.0]
    """
    if window_size < 1:
        raise ValueError("window_size must be >= 1")

    # range() of a non-positive count is empty, which covers short sequences.
    window_count = len(dna_sequence) - window_size + 1
    return [
        self.calculate_gc_content(dna_sequence[start : start + window_size])
        for start in range(window_count)
    ]
319
+
320
def reverse_translate(
    self,
    protein_seq: str,
    profile: OptimizationProfile = OptimizationProfile.BALANCED,
    **kwargs: Any,
) -> str:
    """
    Reverse-translate an amino acid sequence to DNA with the chosen profile.

    The protein is upper-cased and spaces are removed before translation.
    VIRAL_DELIVERY currently uses the same strategy as BALANCED.

    Args:
        protein_seq: Amino acid sequence
        profile: Optimization profile
        **kwargs: Profile-specific parameters, forwarded to the strategy.
            ``kozak`` (default False) is consumed here: when truthy the
            result is passed through _apply_kozak_optimization().

    Returns:
        Optimized DNA sequence

    Raises:
        ValueError: Unknown profile or invalid amino acids.

    Examples:
        >>> translator = ReverseTranslator()
        >>> translator.reverse_translate("MA", profile=OptimizationProfile.HIGH_CAI)
        'ATGGCC'
    """
    protein_seq = protein_seq.upper().replace(" ", "")
    kozak = kwargs.pop("kozak", False)

    # Profile -> name of the strategy method; resolved lazily on match.
    dispatch = (
        (OptimizationProfile.BALANCED, "_balanced_translate"),
        (OptimizationProfile.HIGH_CAI, "_high_cai_translate"),
        (OptimizationProfile.GC_TARGET, "_gc_target_translate"),
        (OptimizationProfile.ASSEMBLY_FRIENDLY, "_assembly_friendly_translate"),
        (OptimizationProfile.RAMP, "_ramp_translate"),
        (OptimizationProfile.VIRAL_DELIVERY, "_balanced_translate"),
    )

    for member, method_name in dispatch:
        if profile == member:
            result = getattr(self, method_name)(protein_seq, **kwargs)
            break
    else:
        raise ValueError(f"Unknown profile: {profile}")

    if kozak:
        result = self._apply_kozak_optimization(result, protein_seq)

    return result
367
+
368
+ def _balanced_translate(self, protein_seq: str, **kwargs: Any) -> str:
369
+ """
370
+ Balanced profile: CAI first, GC balanced
371
+
372
+ - Preferred codon ratio: 70%
373
+ - Target GC: 45-55%
374
+ """
375
+ target_gc_min = kwargs.get("target_gc_min", 45)
376
+ target_gc_max = kwargs.get("target_gc_max", 55)
377
+ preferred_ratio = kwargs.get("preferred_ratio", 0.7)
378
+ max_attempts = kwargs.get("max_gc_attempts", 10)
379
+ if max_attempts < 1:
380
+ raise ValueError("max_gc_attempts must be >= 1")
381
+
382
+ best_result: str | None = None
383
+ best_gc_diff = float("inf")
384
+ last_result = ""
385
+
386
+ # Try multiple times to find GC within target range
387
+ for _attempt in range(max_attempts):
388
+ dna_seq: list[str] = []
389
+
390
+ for aa in protein_seq:
391
+ if aa not in self._aa_primary_codon:
392
+ raise ValueError(f"Invalid amino acid: {aa}")
393
+
394
+ # 70% preferred codon, 30% secondary codon
395
+ if random.random() < preferred_ratio:
396
+ # Preferred codon
397
+ codon = self._aa_primary_codon[aa]
398
+ else:
399
+ # Secondary codon (weighted by frequency)
400
+ codon = self._sample_weighted_codon(aa)
401
+
402
+ dna_seq.append(codon)
403
+
404
+ result = "".join(dna_seq)
405
+ last_result = result
406
+ gc = self.calculate_gc_content(result)
407
+
408
+ # Return immediately if within target GC
409
+ if target_gc_min <= gc <= target_gc_max:
410
+ return result
411
+
412
+ # Track best result
413
+ target_gc_mid = (target_gc_min + target_gc_max) / 2
414
+ gc_diff = abs(gc - target_gc_mid)
415
+ if gc_diff < best_gc_diff:
416
+ best_gc_diff = gc_diff
417
+ best_result = result
418
+
419
+ # Return closest result if target range not found
420
+ return best_result if best_result is not None else last_result
421
+
422
+ def _high_cai_translate(self, protein_seq: str, **kwargs: Any) -> str:
423
+ """
424
+ High-CAI profile: use only preferred codons
425
+
426
+ - CAI > 0.85 guaranteed
427
+ - No GC constraints
428
+ """
429
+ dna_seq: list[str] = []
430
+
431
+ for aa in protein_seq:
432
+ if aa not in self.aa_to_codons:
433
+ raise ValueError(f"Invalid amino acid: {aa}")
434
+
435
+ # Pick codon with highest golden set relative adaptiveness weight (CAI-optimal)
436
+ codons = self.aa_to_codons[aa]
437
+ preferred_codon = max(codons, key=lambda c: self.golden_ref_weights.get(c[0], 0.0))[0]
438
+ dna_seq.append(preferred_codon)
439
+
440
+ return "".join(dna_seq)
441
+
442
+ def _gc_target_translate(self, protein_seq: str, **kwargs: Any) -> str:
443
+ """
444
+ GC-Target profile: enforce GC% 42.5% ±2% (N. benthamiana optimal)
445
+
446
+ - GC constraint first
447
+ - CAI may be sacrificed
448
+ - Balance local window GC (50 bp)
449
+ """
450
+ target_gc = kwargs.get("target_gc", 42.5)
451
+ tolerance = kwargs.get("tolerance", 2.0)
452
+
453
+ dna_seq: list[str] = []
454
+
455
+ for i, aa in enumerate(protein_seq):
456
+ if aa not in self.aa_to_codons:
457
+ raise ValueError(f"Invalid amino acid: {aa}")
458
+
459
+ codons = self.aa_to_codons[aa]
460
+
461
+ # Current GC so far
462
+ current_seq = "".join(dna_seq)
463
+ current_gc = self.calculate_gc_content(current_seq) if current_seq else target_gc
464
+
465
+ # Choose codon that brings GC closer to target
466
+ best_codon: str | None = None
467
+ best_diff = float("inf")
468
+
469
+ for codon, _ in codons:
470
+ test_seq = current_seq + codon
471
+ test_gc = self.calculate_gc_content(test_seq)
472
+ diff = abs(test_gc - target_gc)
473
+
474
+ if diff < best_diff:
475
+ best_diff = diff
476
+ best_codon = codon
477
+
478
+ dna_seq.append(cast(str, best_codon))
479
+
480
+ return "".join(dna_seq)
481
+
482
+ def _assembly_friendly_translate(self, protein_seq: str, **kwargs: Any) -> str:
483
+ """
484
+ Assembly-Friendly profile: avoid BsaI/BpiI
485
+
486
+ - Golden Gate compatible
487
+ - CAI trade-offs allowed
488
+ """
489
+ max_attempts = kwargs.get("max_attempts", 10)
490
+ if max_attempts < 1:
491
+ raise ValueError("max_attempts must be >= 1")
492
+ last_seq = ""
493
+
494
+ for attempt in range(max_attempts):
495
+ # Start with Balanced strategy
496
+ dna_seq = self._balanced_translate(protein_seq, preferred_ratio=0.6)
497
+ last_seq = dna_seq
498
+
499
+ # Check restriction sites (forward + reverse complement)
500
+ has_restriction_site = False
501
+ for site_name, site_seqs in self.restriction_sites.items():
502
+ for site_seq in site_seqs:
503
+ if site_seq in dna_seq:
504
+ has_restriction_site = True
505
+ break
506
+ if has_restriction_site:
507
+ break
508
+
509
+ if not has_restriction_site:
510
+ return dna_seq
511
+
512
+ # Return with warning if attempts are exhausted
513
+ logger.warning(
514
+ f"Could not remove all restriction sites after {max_attempts} attempts. "
515
+ "The returned sequence may contain restriction sites."
516
+ )
517
+ return last_seq
518
+
519
+ def _ramp_translate(self, protein_seq: str, **kwargs: Any) -> str:
520
+ """
521
+ RAMP profile: balanced translation + N-terminal ramp.
522
+
523
+ Uses balanced translation as a base, then applies a codon deoptimization
524
+ ramp to the first N codons to promote co-translational folding.
525
+
526
+ Args:
527
+ protein_seq: Amino acid sequence.
528
+ **kwargs: ramp_codons (int): Number of N-terminal codons to ramp. Default 50.
529
+ """
530
+ ramp_codons = kwargs.get("ramp_codons", 50)
531
+ dna_seq = self._balanced_translate(protein_seq, **kwargs)
532
+ return self._apply_nterminal_ramp(dna_seq, protein_seq, ramp_codons=ramp_codons)
533
+
534
+ def _apply_nterminal_ramp(
535
+ self, dna_seq: str, protein_seq: str, ramp_codons: int = 50
536
+ ) -> str:
537
+ """
538
+ Apply N-terminal codon ramp for co-translational folding.
539
+
540
+ Replaces the first `ramp_codons` codons with lower-frequency synonymous
541
+ codons (bottom 50% by frequency) to slow the ribosome at the N-terminus.
542
+ Single-codon amino acids (Met, Trp) are left unchanged.
543
+
544
+ Args:
545
+ dna_seq: Full-length DNA sequence.
546
+ protein_seq: Original protein sequence (same length as dna_seq/3).
547
+ ramp_codons: Number of N-terminal codons to deoptimize.
548
+
549
+ Returns:
550
+ DNA sequence with N-terminal ramp applied.
551
+ """
552
+ codons = [dna_seq[i : i + 3] for i in range(0, len(dna_seq), 3)]
553
+ n_ramp = min(ramp_codons, len(codons), len(protein_seq))
554
+
555
+ for idx in range(n_ramp):
556
+ aa = protein_seq[idx]
557
+ if aa not in self.aa_to_codons:
558
+ continue
559
+
560
+ all_codons = self.aa_to_codons[aa]
561
+ # Skip single-codon amino acids (M, W)
562
+ if len(all_codons) <= 1:
563
+ continue
564
+
565
+ # Select from bottom 50% by frequency (deoptimized)
566
+ midpoint = max(1, len(all_codons) // 2)
567
+ low_freq_codons = all_codons[midpoint:]
568
+
569
+ if not low_freq_codons:
570
+ continue
571
+
572
+ # Weighted random from low-frequency codons
573
+ weights = [freq for _, freq in low_freq_codons]
574
+ chosen = random.choices(
575
+ [c for c, _ in low_freq_codons], weights=weights, k=1
576
+ )[0]
577
+ codons[idx] = chosen
578
+
579
+ return "".join(codons)
580
+
581
+ def _apply_kozak_optimization(self, dna_seq: str, protein_seq: str) -> str:
582
+ """
583
+ Optimize Kozak context at the 5' end of CDS.
584
+
585
+ Plant (N. benthamiana) optimal Kozak context: AACAATG**GC**...
586
+ The 2nd codon (position 4-6, encoding protein_seq[1]) should ideally
587
+ start with G (good) or GC (best) to match the plant Kozak consensus.
588
+
589
+ Only performs synonymous codon substitution -- the amino acid sequence
590
+ is preserved. If no synonymous codon starting with G exists, the
591
+ sequence is returned unchanged.
592
+
593
+ Args:
594
+ dna_seq: Full-length DNA sequence (must start with ATG).
595
+ protein_seq: Original protein sequence.
596
+
597
+ Returns:
598
+ DNA sequence with Kozak-optimized 2nd codon, or original if
599
+ optimization is not possible.
600
+ """
601
+ # Need at least 2 codons (ATG + codon2)
602
+ if len(protein_seq) < 2 or len(dna_seq) < 6:
603
+ return dna_seq
604
+
605
+ aa2 = protein_seq[1]
606
+ if aa2 not in self.aa_to_codons:
607
+ return dna_seq
608
+
609
+ current_codon2 = dna_seq[3:6]
610
+ codons_for_aa2 = self.aa_to_codons[aa2]
611
+
612
+ # Already optimal: starts with G
613
+ if current_codon2[0] == "G":
614
+ return dna_seq
615
+
616
+ # Score candidates: prefer GC > G > other
617
+ best_codon = current_codon2
618
+ best_kozak_score = 0 # 0=no G, 1=starts with G, 2=starts with GC
619
+ best_freq = 0.0
620
+
621
+ for codon, freq in codons_for_aa2:
622
+ kozak_score = 0
623
+ if codon[0] == "G":
624
+ kozak_score = 1
625
+ if codon[1] == "C":
626
+ kozak_score = 2
627
+ if kozak_score > best_kozak_score or (
628
+ kozak_score == best_kozak_score and freq > best_freq
629
+ ):
630
+ best_kozak_score = kozak_score
631
+ best_codon = codon
632
+ best_freq = freq
633
+
634
+ if best_kozak_score == 0:
635
+ return dna_seq
636
+
637
+ return dna_seq[:3] + best_codon + dna_seq[6:]
638
+
639
+ def _sample_weighted_codon(self, aa: str) -> str:
640
+ """Sample a codon for one amino acid using precomputed CDF."""
641
+ codons = self._aa_weighted_codons.get(aa)
642
+ cumprob = self._aa_weighted_cumprob.get(aa)
643
+ if not codons or not cumprob:
644
+ raise ValueError(f"Invalid amino acid: {aa}")
645
+ r = random.random()
646
+ idx = bisect_left(cumprob, r)
647
+ if idx >= len(codons):
648
+ idx = len(codons) - 1
649
+ return codons[idx]
650
+
651
def generate_candidates(
    self,
    protein_seq: str,
    profile: OptimizationProfile = OptimizationProfile.BALANCED,
    n: int = 5,
    **kwargs: Any,
) -> list[dict[str, Any]]:
    """
    Generate up to ``n`` scored candidates, best score first.

    Each candidate is one reverse translation scored with
    calculate_composite_score(). For ``n == 1`` a single attempt is made.
    For ``n > 1`` the global ``random`` generator is re-seeded from
    ``secrets`` and ``n`` attempts are made; attempts failing with
    ValueError, KeyError or TypeError are skipped, so fewer than ``n``
    candidates may be returned.

    Args:
        protein_seq: Amino acid sequence
        profile: Optimization profile
        n: Number of candidates to generate
        **kwargs: Profile-specific parameters (passed both to the translation
            and to the scoring function)

    Returns:
        [{"sequence": "ATG...", "cai": 0.87, "gc": 51.2, "score": 0.92}, ...]

    Raises:
        ValueError: ``n`` is smaller than 1.
        EmptyCandidateError: No attempt produced a candidate (for example
            because invalid amino acids are present).

    Examples:
        >>> translator = ReverseTranslator()
        >>> candidates = translator.generate_candidates("MA", n=2)
        >>> len(candidates) == 2
        True
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    recoverable = (ValueError, KeyError, TypeError)

    def _make_one() -> dict[str, Any]:
        sequence = self.reverse_translate(protein_seq, profile, **kwargs)
        cai = self.calculate_cai(sequence)
        gc = self.calculate_gc_content(sequence)
        score = calculate_composite_score(
            cai=cai,
            gc=gc,
            sequence=sequence,
            profile=profile.value,
            **kwargs,
        )
        return {"sequence": sequence, "cai": cai, "gc": gc, "score": score}

    # Fast path for the dominant API call shape (n=1): no re-seeding, no sorting.
    if n == 1:
        try:
            return [_make_one()]
        except recoverable as exc:
            reason = (
                "Could not generate a valid candidate in fast path. "
                f"Last error: {exc}"
            )
            raise EmptyCandidateError(protein_seq[:10], reason=reason) from exc

    random.seed(secrets.randbits(32))

    collected: list[dict[str, Any]] = []
    failure: Exception | None = None

    for attempt_no in range(1, n + 1):
        try:
            collected.append(_make_one())
        except recoverable as exc:
            # Invalid amino acids, missing codons, type errors: skip this attempt.
            logger.debug(f"Candidate generation attempt {attempt_no} failed: {exc}")
            failure = exc

    if not collected:
        reason = (
            f"Could not generate any valid candidates after {n} attempts. "
            "Check codon table and profile settings."
        )
        if failure is not None:
            reason = f"{reason} Last error: {failure}"
        raise EmptyCandidateError(protein_seq[:10], reason=reason)

    # Highest composite score first.
    collected.sort(key=lambda entry: entry["score"], reverse=True)
    return collected
732
+
733
+
734
+ # --- Usage example ---
735
if __name__ == "__main__":
    # Demo: run each profile on a partial GFP sequence and print its metrics.

    def _print_metrics(sequence: str) -> None:
        """Print the CAI and GC% lines shared by every section."""
        print(f"CAI: {translator.calculate_cai(sequence)}")
        print(f"GC%: {translator.calculate_gc_content(sequence)}")

    translator = ReverseTranslator()

    # Test sequence (partial GFP)
    protein_seq = "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHKVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"

    print("=== Balanced Profile ===")
    balanced = translator.reverse_translate(protein_seq, OptimizationProfile.BALANCED)
    print(f"Length: {len(balanced)} bp")
    _print_metrics(balanced)
    print(f"Sequence: {balanced[:60]}...")

    print("\n=== High-CAI Profile ===")
    _print_metrics(translator.reverse_translate(protein_seq, OptimizationProfile.HIGH_CAI))

    print("\n=== GC-Target Profile ===")
    _print_metrics(
        translator.reverse_translate(
            protein_seq, OptimizationProfile.GC_TARGET, target_gc=50.0
        )
    )

    print("\n=== Top-5 Candidates (Balanced) ===")
    ranked = translator.generate_candidates(protein_seq, OptimizationProfile.BALANCED, n=5)
    for rank, entry in enumerate(ranked, 1):
        print(f"{rank}. CAI={entry['cai']:.3f}, GC={entry['gc']:.1f}%, Score={entry['score']:.3f}")