factorforge-cds 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factorforge/__init__.py +19 -0
- factorforge/__main__.py +8 -0
- factorforge/cli/__init__.py +5 -0
- factorforge/cli/legacy_cli.py +157 -0
- factorforge/cli/main.py +305 -0
- factorforge/core/interfaces/__init__.py +7 -0
- factorforge/core/interfaces/exporter.py +13 -0
- factorforge/core/interfaces/optimizer.py +85 -0
- factorforge/core/interfaces/validator.py +9 -0
- factorforge/database.py +150 -0
- factorforge/engines/__init__.py +60 -0
- factorforge/engines/ml/__init__.py +0 -0
- factorforge/engines/ml/plant_optimizer.py +325 -0
- factorforge/engines/registry.py +141 -0
- factorforge/engines/v1_archived/__init__.py +15 -0
- factorforge/engines/v2/__init__.py +13 -0
- factorforge/engines/v2/codon_table_builder.py +107 -0
- factorforge/engines/v2/construct_builder.py +403 -0
- factorforge/engines/v2/exporter.py +455 -0
- factorforge/engines/v2/optimizer.py +190 -0
- factorforge/engines/v2/pipeline.py +275 -0
- factorforge/engines/v2/rules/__init__.py +3 -0
- factorforge/engines/v2/rules/domesticator.py +403 -0
- factorforge/engines/v2/rules/reverse_translator.py +765 -0
- factorforge/engines/v2/rules/rule_engine.py +867 -0
- factorforge/engines/v2/scoring.py +232 -0
- factorforge/engines/v2/utils.py +231 -0
- factorforge/engines/v2/validator.py +383 -0
- factorforge/engines/v3/__init__.py +12 -0
- factorforge/engines/v3/explain.py +119 -0
- factorforge/engines/v3/inference/__init__.py +6 -0
- factorforge/engines/v3/inference/constrained_decoder.py +80 -0
- factorforge/engines/v3/inference/v2_adapter.py +72 -0
- factorforge/engines/v3/metrics.py +145 -0
- factorforge/engines/v3/modeling_bart_decoder.py +127 -0
- factorforge/engines/v3/pipeline.py +192 -0
- factorforge/engines/v3/synonym_mask.py +61 -0
- factorforge/engines/v3/tokenizer.py +192 -0
- factorforge/ml/__init__.py +33 -0
- factorforge/ml/feasibility.py +199 -0
- factorforge/ml/metrics.py +295 -0
- factorforge/utils/__init__.py +31 -0
- factorforge/utils/construct_id.py +8 -0
- factorforge/utils/exceptions.py +32 -0
- factorforge/utils/sequence_validator.py +189 -0
- factorforge/utils/validation.py +104 -0
- factorforge_cds-3.0.0.dist-info/METADATA +475 -0
- factorforge_cds-3.0.0.dist-info/RECORD +52 -0
- factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
- factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
- factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
- factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,765 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Reverse Translator for FactorForge v2
|
|
3
|
+
Reverse-translate amino acid sequences to N. benthamiana-optimized codons (P0-2)
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from bisect import bisect_left
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
import math
|
|
12
|
+
import random
|
|
13
|
+
import secrets
|
|
14
|
+
from enum import Enum
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, cast
|
|
17
|
+
|
|
18
|
+
from factorforge.engines.v2.scoring import calculate_composite_score
|
|
19
|
+
from factorforge.engines.v2.utils import (
|
|
20
|
+
build_aa_to_codons_map,
|
|
21
|
+
calculate_gc,
|
|
22
|
+
get_data_path,
|
|
23
|
+
load_golden_set,
|
|
24
|
+
)
|
|
25
|
+
from factorforge.utils.exceptions import EmptyCandidateError
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class OptimizationProfile(Enum):
    """Codon-optimization strategies accepted by ``ReverseTranslator``.

    Each member's value is the profile's string identifier; that string is
    what ``generate_candidates`` forwards to the composite scoring function.
    """

    BALANCED = "balanced"
    HIGH_CAI = "high_cai"
    GC_TARGET = "gc_target"
    ASSEMBLY_FRIENDLY = "assembly_friendly"
    RAMP = "ramp"
    VIRAL_DELIVERY = "viral_delivery"  # TRV virus delivery optimization (Li et al. 2026)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ReverseTranslator:
|
|
42
|
+
"""
|
|
43
|
+
Reverse-translate amino acid sequences to DNA
|
|
44
|
+
|
|
45
|
+
Supports the following optimization profiles (RAMP and VIRAL_DELIVERY are also accepted; see OptimizationProfile):
|
|
46
|
+
1. Balanced: CAI priority, GC balance
|
|
47
|
+
2. High-CAI: use only preferred codons
|
|
48
|
+
3. GC-Target: steer GC% toward a target value (default 42.5%)
|
|
49
|
+
4. Assembly-Friendly: avoid BsaI/BpiI
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
def __init__(
|
|
53
|
+
self,
|
|
54
|
+
codon_table_path: str | Path | None = None,
|
|
55
|
+
golden_set_path: str | Path | None = None,
|
|
56
|
+
) -> None:
|
|
57
|
+
"""
|
|
58
|
+
Args:
|
|
59
|
+
codon_table_path: Path to codon table JSON file.
|
|
60
|
+
golden_set_path: Path to golden set JSON for CAI reference weights.
|
|
61
|
+
If None, attempts to load default golden set.
|
|
62
|
+
"""
|
|
63
|
+
if codon_table_path is None:
|
|
64
|
+
# Use centralized data path management
|
|
65
|
+
data_dir = get_data_path()
|
|
66
|
+
codon_table_path = data_dir / "nbenthamiana_codons.json"
|
|
67
|
+
|
|
68
|
+
self.codon_table: dict[str, Any] = self._load_codon_table(codon_table_path)
|
|
69
|
+
self.aa_to_codons: dict[str, list[tuple[str, float]]] = self._build_aa_to_codons_map()
|
|
70
|
+
|
|
71
|
+
# Load golden set for CAI reference weights
|
|
72
|
+
if golden_set_path is not None:
|
|
73
|
+
self.golden_set_table: dict[str, Any] = self._load_codon_table(golden_set_path)
|
|
74
|
+
else:
|
|
75
|
+
try:
|
|
76
|
+
self.golden_set_table = load_golden_set()
|
|
77
|
+
except (FileNotFoundError, json.JSONDecodeError):
|
|
78
|
+
self.golden_set_table = self.codon_table
|
|
79
|
+
|
|
80
|
+
# Pre-compute relative adaptiveness weights from golden set (Sharp & Li 1987)
|
|
81
|
+
self.golden_ref_weights: dict[str, float] = self._build_ref_weights(
|
|
82
|
+
self.golden_set_table
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# Pre-compute max frequency per amino acid for CAI fallback path
|
|
86
|
+
# Avoids repeated max() inside calculate_cai() hot loop
|
|
87
|
+
self._aa_max_freq: dict[str, float] = {
|
|
88
|
+
aa: max(f for _, f in codons)
|
|
89
|
+
for aa, codons in self.aa_to_codons.items()
|
|
90
|
+
}
|
|
91
|
+
self._aa_primary_codon: dict[str, str] = {}
|
|
92
|
+
self._aa_weighted_codons: dict[str, tuple[str, ...]] = {}
|
|
93
|
+
self._aa_weighted_cumprob: dict[str, tuple[float, ...]] = {}
|
|
94
|
+
for aa, codons in self.aa_to_codons.items():
|
|
95
|
+
if not codons:
|
|
96
|
+
continue
|
|
97
|
+
self._aa_primary_codon[aa] = codons[0][0]
|
|
98
|
+
|
|
99
|
+
codon_names = tuple(c for c, _ in codons)
|
|
100
|
+
raw_weights = [float(w) for _, w in codons]
|
|
101
|
+
total = sum(raw_weights)
|
|
102
|
+
if total <= 0.0:
|
|
103
|
+
# Defensive fallback: uniform sampling if malformed frequencies are loaded.
|
|
104
|
+
n = len(codon_names)
|
|
105
|
+
cumprob = tuple((i + 1) / n for i in range(n))
|
|
106
|
+
else:
|
|
107
|
+
running = 0.0
|
|
108
|
+
cumprob_list: list[float] = []
|
|
109
|
+
for w in raw_weights:
|
|
110
|
+
running += w / total
|
|
111
|
+
cumprob_list.append(running)
|
|
112
|
+
# Guard against tiny floating drift.
|
|
113
|
+
cumprob_list[-1] = 1.0
|
|
114
|
+
cumprob = tuple(cumprob_list)
|
|
115
|
+
self._aa_weighted_codons[aa] = codon_names
|
|
116
|
+
self._aa_weighted_cumprob[aa] = cumprob
|
|
117
|
+
|
|
118
|
+
# Restriction sites (for Assembly-Friendly profile)
|
|
119
|
+
# Each enzyme maps to a list of recognition sequences (forward + reverse complement)
|
|
120
|
+
self.restriction_sites: dict[str, list[str]] = {
|
|
121
|
+
"BsaI": ["GGTCTC", "GAGACC"],
|
|
122
|
+
"BpiI": ["GAAGAC", "GTCTTC"],
|
|
123
|
+
"BsmBI": ["CGTCTC", "GAGACG"],
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
def _load_codon_table(self, path: str | Path) -> dict[str, Any]:
|
|
127
|
+
"""Load codon table"""
|
|
128
|
+
with open(path, "r", encoding="utf-8") as f:
|
|
129
|
+
return cast(dict[str, Any], json.load(f))
|
|
130
|
+
|
|
131
|
+
def _build_aa_to_codons_map(self) -> dict[str, list[tuple[str, float]]]:
    """Group the working table's codons by amino acid, most frequent first.

    Codons reported by ``build_aa_to_codons_map`` that have no (or an empty)
    entry in the table's "codons" section are skipped, and amino acids left
    without any codon are omitted from the result.

    Returns:
        {"A": [("GCC", 0.40), ("GCT", 0.26), ...], ...}
    """
    grouped: dict[str, list[tuple[str, float]]] = {}
    for residue, synonyms in build_aa_to_codons_map(self.codon_table).items():
        ranked = [
            (name, float(info["frequency"]))
            for name in synonyms
            if (info := self.codon_table["codons"].get(name))
        ]
        if not ranked:
            continue
        # Descending frequency; the sort is stable, so ties keep table order.
        grouped[residue] = sorted(ranked, key=lambda pair: pair[1], reverse=True)
    return grouped
|
|
155
|
+
|
|
156
|
+
@staticmethod
|
|
157
|
+
def _build_ref_weights(ref_table: dict[str, Any]) -> dict[str, float]:
|
|
158
|
+
"""Build relative adaptiveness weights from a reference codon table.
|
|
159
|
+
|
|
160
|
+
Groups codons by amino acid and computes w_i = f_i / f_max per amino acid,
|
|
161
|
+
following Sharp & Li (1987).
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
ref_table: Codon table dict with "codons" section.
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
Mapping of codon → relative adaptiveness weight (0-1).
|
|
168
|
+
"""
|
|
169
|
+
codons_section = ref_table.get("codons", {})
|
|
170
|
+
|
|
171
|
+
# Group frequencies by amino acid
|
|
172
|
+
aa_codons: dict[str, list[tuple[str, float]]] = {}
|
|
173
|
+
for codon, info in codons_section.items():
|
|
174
|
+
aa = info["aa"]
|
|
175
|
+
freq = info.get("frequency", 0.0)
|
|
176
|
+
aa_codons.setdefault(aa, []).append((codon, freq))
|
|
177
|
+
|
|
178
|
+
# Compute relative adaptiveness
|
|
179
|
+
weights: dict[str, float] = {}
|
|
180
|
+
for aa, codon_freqs in aa_codons.items():
|
|
181
|
+
if aa == "*": # Skip stop codons
|
|
182
|
+
continue
|
|
183
|
+
max_freq = max(f for _, f in codon_freqs)
|
|
184
|
+
for codon, freq in codon_freqs:
|
|
185
|
+
weights[codon] = freq / max_freq if max_freq > 0 else 0.0
|
|
186
|
+
|
|
187
|
+
return weights
|
|
188
|
+
|
|
189
|
+
def calculate_cai(self, dna_sequence: str) -> float:
    """Calculate the Codon Adaptation Index (CAI) of a coding sequence.

    The CAI is the geometric mean of per-codon relative adaptiveness weights
    (Sharp & Li 1987). For each codon the pre-computed golden set weight is
    used when it is positive; otherwise the weight is derived from the
    working codon table as frequency / max frequency of that amino acid.
    Stop codons, unknown codons and codons whose weight is not positive do
    not contribute to the mean.

    Args:
        dna_sequence: DNA sequence (length must be divisible by 3); case is
            ignored.

    Returns:
        CAI value (0.0 ~ 1.0) rounded to 3 decimals. 0.0 is returned when
        the length is not a multiple of 3 or no codon could be scored.
    """
    if len(dna_sequence) % 3:
        return 0.0

    working_codons = self.codon_table.get("codons", {})
    log_total = 0.0
    scored = 0

    for start in range(0, len(dna_sequence), 3):
        triplet = dna_sequence[start : start + 3].upper()

        # Primary source: golden set reference weight.
        weight = self.golden_ref_weights.get(triplet)
        if weight is None or not weight > 0:
            # Fallback: working table, normalised by the cached per-amino-acid maximum.
            if triplet not in working_codons:
                continue
            entry = working_codons[triplet]
            residue = entry["aa"]
            if residue == "*":
                continue
            ceiling = self._aa_max_freq.get(residue, 0.0)
            if not ceiling > 0:
                continue
            weight = entry["frequency"] / ceiling
            if not weight > 0:
                continue

        # Accumulate logs in a single pass; the geometric mean is taken at the end.
        log_total += math.log(weight)
        scored += 1

    return round(math.exp(log_total / scored), 3) if scored else 0.0
|
|
271
|
+
|
|
272
|
+
def calculate_gc_content(self, dna_sequence: str) -> float:
    """Return the GC percentage of a sequence, rounded to two decimals.

    Delegates the computation to ``calculate_gc`` and only applies rounding.

    Args:
        dna_sequence: DNA sequence

    Returns:
        GC% (0.0 ~ 100.0)
    """
    gc_percent = calculate_gc(dna_sequence)
    return round(gc_percent, 2)
|
|
291
|
+
|
|
292
|
+
def calculate_local_gc(self, dna_sequence: str, window_size: int = 50) -> list[float]:
    """Calculate local GC content over a sliding window with step 1.

    Args:
        dna_sequence: DNA sequence
        window_size: Window size (bp); must be >= 1.

    Returns:
        GC% of every window, in order of window start position. The list is
        empty when the sequence is shorter than ``window_size``.

    Raises:
        ValueError: ``window_size`` is smaller than 1.

    Examples:
        >>> translator = ReverseTranslator()
        >>> translator.calculate_local_gc("ATGCATGC", window_size=4)
        [50.0, 50.0, 50.0, 50.0, 50.0]
    """
    # A zero or negative window would yield empty or wrapped-around slices,
    # which are not meaningful GC windows.
    if window_size < 1:
        raise ValueError("window_size must be >= 1")

    return [
        self.calculate_gc_content(dna_sequence[start : start + window_size])
        for start in range(len(dna_sequence) - window_size + 1)
    ]
|
|
319
|
+
|
|
320
|
+
def reverse_translate(
    self,
    protein_seq: str,
    profile: OptimizationProfile = OptimizationProfile.BALANCED,
    **kwargs: Any,
) -> str:
    """Reverse-translate an amino acid sequence to DNA.

    The protein sequence is upper-cased and all whitespace (spaces, tabs,
    line breaks) is removed before translation, so wrapped or pasted
    sequences are accepted.

    Args:
        protein_seq: Amino acid sequence
        profile: Optimization profile. VIRAL_DELIVERY currently uses the
            same strategy as BALANCED.
        **kwargs: Profile-specific parameters, forwarded to the profile's
            translation method. ``kozak`` (default False) is consumed here:
            when truthy, the Kozak optimization is applied to the result.

    Returns:
        Optimized DNA sequence

    Raises:
        ValueError: Unknown profile or invalid amino acids.
    """
    # str.split() with no separator splits on any whitespace run.
    cleaned = "".join(protein_seq.split()).upper()
    kozak = kwargs.pop("kozak", False)

    handlers = (
        (OptimizationProfile.BALANCED, self._balanced_translate),
        (OptimizationProfile.HIGH_CAI, self._high_cai_translate),
        (OptimizationProfile.GC_TARGET, self._gc_target_translate),
        (OptimizationProfile.ASSEMBLY_FRIENDLY, self._assembly_friendly_translate),
        (OptimizationProfile.RAMP, self._ramp_translate),
        (OptimizationProfile.VIRAL_DELIVERY, self._balanced_translate),
    )
    for known_profile, handler in handlers:
        if profile == known_profile:
            result = handler(cleaned, **kwargs)
            break
    else:
        raise ValueError(f"Unknown profile: {profile}")

    if kozak:
        result = self._apply_kozak_optimization(result, cleaned)

    return result
|
|
367
|
+
|
|
368
|
+
def _balanced_translate(self, protein_seq: str, **kwargs: Any) -> str:
|
|
369
|
+
"""
|
|
370
|
+
Balanced profile: CAI first, GC balanced
|
|
371
|
+
|
|
372
|
+
- Preferred codon ratio: 70%
|
|
373
|
+
- Target GC: 45-55%
|
|
374
|
+
"""
|
|
375
|
+
target_gc_min = kwargs.get("target_gc_min", 45)
|
|
376
|
+
target_gc_max = kwargs.get("target_gc_max", 55)
|
|
377
|
+
preferred_ratio = kwargs.get("preferred_ratio", 0.7)
|
|
378
|
+
max_attempts = kwargs.get("max_gc_attempts", 10)
|
|
379
|
+
if max_attempts < 1:
|
|
380
|
+
raise ValueError("max_gc_attempts must be >= 1")
|
|
381
|
+
|
|
382
|
+
best_result: str | None = None
|
|
383
|
+
best_gc_diff = float("inf")
|
|
384
|
+
last_result = ""
|
|
385
|
+
|
|
386
|
+
# Try multiple times to find GC within target range
|
|
387
|
+
for _attempt in range(max_attempts):
|
|
388
|
+
dna_seq: list[str] = []
|
|
389
|
+
|
|
390
|
+
for aa in protein_seq:
|
|
391
|
+
if aa not in self._aa_primary_codon:
|
|
392
|
+
raise ValueError(f"Invalid amino acid: {aa}")
|
|
393
|
+
|
|
394
|
+
# 70% preferred codon, 30% secondary codon
|
|
395
|
+
if random.random() < preferred_ratio:
|
|
396
|
+
# Preferred codon
|
|
397
|
+
codon = self._aa_primary_codon[aa]
|
|
398
|
+
else:
|
|
399
|
+
# Secondary codon (weighted by frequency)
|
|
400
|
+
codon = self._sample_weighted_codon(aa)
|
|
401
|
+
|
|
402
|
+
dna_seq.append(codon)
|
|
403
|
+
|
|
404
|
+
result = "".join(dna_seq)
|
|
405
|
+
last_result = result
|
|
406
|
+
gc = self.calculate_gc_content(result)
|
|
407
|
+
|
|
408
|
+
# Return immediately if within target GC
|
|
409
|
+
if target_gc_min <= gc <= target_gc_max:
|
|
410
|
+
return result
|
|
411
|
+
|
|
412
|
+
# Track best result
|
|
413
|
+
target_gc_mid = (target_gc_min + target_gc_max) / 2
|
|
414
|
+
gc_diff = abs(gc - target_gc_mid)
|
|
415
|
+
if gc_diff < best_gc_diff:
|
|
416
|
+
best_gc_diff = gc_diff
|
|
417
|
+
best_result = result
|
|
418
|
+
|
|
419
|
+
# Return closest result if target range not found
|
|
420
|
+
return best_result if best_result is not None else last_result
|
|
421
|
+
|
|
422
|
+
def _high_cai_translate(self, protein_seq: str, **kwargs: Any) -> str:
|
|
423
|
+
"""
|
|
424
|
+
High-CAI profile: use only preferred codons
|
|
425
|
+
|
|
426
|
+
- CAI > 0.85 guaranteed
|
|
427
|
+
- No GC constraints
|
|
428
|
+
"""
|
|
429
|
+
dna_seq: list[str] = []
|
|
430
|
+
|
|
431
|
+
for aa in protein_seq:
|
|
432
|
+
if aa not in self.aa_to_codons:
|
|
433
|
+
raise ValueError(f"Invalid amino acid: {aa}")
|
|
434
|
+
|
|
435
|
+
# Pick codon with highest golden set relative adaptiveness weight (CAI-optimal)
|
|
436
|
+
codons = self.aa_to_codons[aa]
|
|
437
|
+
preferred_codon = max(codons, key=lambda c: self.golden_ref_weights.get(c[0], 0.0))[0]
|
|
438
|
+
dna_seq.append(preferred_codon)
|
|
439
|
+
|
|
440
|
+
return "".join(dna_seq)
|
|
441
|
+
|
|
442
|
+
def _gc_target_translate(self, protein_seq: str, **kwargs: Any) -> str:
|
|
443
|
+
"""
|
|
444
|
+
GC-Target profile: enforce GC% 42.5% ±2% (N. benthamiana optimal)
|
|
445
|
+
|
|
446
|
+
- GC constraint first
|
|
447
|
+
- CAI may be sacrificed
|
|
448
|
+
- Balance local window GC (50 bp)
|
|
449
|
+
"""
|
|
450
|
+
target_gc = kwargs.get("target_gc", 42.5)
|
|
451
|
+
tolerance = kwargs.get("tolerance", 2.0)
|
|
452
|
+
|
|
453
|
+
dna_seq: list[str] = []
|
|
454
|
+
|
|
455
|
+
for i, aa in enumerate(protein_seq):
|
|
456
|
+
if aa not in self.aa_to_codons:
|
|
457
|
+
raise ValueError(f"Invalid amino acid: {aa}")
|
|
458
|
+
|
|
459
|
+
codons = self.aa_to_codons[aa]
|
|
460
|
+
|
|
461
|
+
# Current GC so far
|
|
462
|
+
current_seq = "".join(dna_seq)
|
|
463
|
+
current_gc = self.calculate_gc_content(current_seq) if current_seq else target_gc
|
|
464
|
+
|
|
465
|
+
# Choose codon that brings GC closer to target
|
|
466
|
+
best_codon: str | None = None
|
|
467
|
+
best_diff = float("inf")
|
|
468
|
+
|
|
469
|
+
for codon, _ in codons:
|
|
470
|
+
test_seq = current_seq + codon
|
|
471
|
+
test_gc = self.calculate_gc_content(test_seq)
|
|
472
|
+
diff = abs(test_gc - target_gc)
|
|
473
|
+
|
|
474
|
+
if diff < best_diff:
|
|
475
|
+
best_diff = diff
|
|
476
|
+
best_codon = codon
|
|
477
|
+
|
|
478
|
+
dna_seq.append(cast(str, best_codon))
|
|
479
|
+
|
|
480
|
+
return "".join(dna_seq)
|
|
481
|
+
|
|
482
|
+
def _assembly_friendly_translate(self, protein_seq: str, **kwargs: Any) -> str:
|
|
483
|
+
"""
|
|
484
|
+
Assembly-Friendly profile: avoid BsaI/BpiI
|
|
485
|
+
|
|
486
|
+
- Golden Gate compatible
|
|
487
|
+
- CAI trade-offs allowed
|
|
488
|
+
"""
|
|
489
|
+
max_attempts = kwargs.get("max_attempts", 10)
|
|
490
|
+
if max_attempts < 1:
|
|
491
|
+
raise ValueError("max_attempts must be >= 1")
|
|
492
|
+
last_seq = ""
|
|
493
|
+
|
|
494
|
+
for attempt in range(max_attempts):
|
|
495
|
+
# Start with Balanced strategy
|
|
496
|
+
dna_seq = self._balanced_translate(protein_seq, preferred_ratio=0.6)
|
|
497
|
+
last_seq = dna_seq
|
|
498
|
+
|
|
499
|
+
# Check restriction sites (forward + reverse complement)
|
|
500
|
+
has_restriction_site = False
|
|
501
|
+
for site_name, site_seqs in self.restriction_sites.items():
|
|
502
|
+
for site_seq in site_seqs:
|
|
503
|
+
if site_seq in dna_seq:
|
|
504
|
+
has_restriction_site = True
|
|
505
|
+
break
|
|
506
|
+
if has_restriction_site:
|
|
507
|
+
break
|
|
508
|
+
|
|
509
|
+
if not has_restriction_site:
|
|
510
|
+
return dna_seq
|
|
511
|
+
|
|
512
|
+
# Return with warning if attempts are exhausted
|
|
513
|
+
logger.warning(
|
|
514
|
+
f"Could not remove all restriction sites after {max_attempts} attempts. "
|
|
515
|
+
"The returned sequence may contain restriction sites."
|
|
516
|
+
)
|
|
517
|
+
return last_seq
|
|
518
|
+
|
|
519
|
+
def _ramp_translate(self, protein_seq: str, **kwargs: Any) -> str:
|
|
520
|
+
"""
|
|
521
|
+
RAMP profile: balanced translation + N-terminal ramp.
|
|
522
|
+
|
|
523
|
+
Uses balanced translation as a base, then applies a codon deoptimization
|
|
524
|
+
ramp to the first N codons to promote co-translational folding.
|
|
525
|
+
|
|
526
|
+
Args:
|
|
527
|
+
protein_seq: Amino acid sequence.
|
|
528
|
+
**kwargs: ramp_codons (int): Number of N-terminal codons to ramp. Default 50.
|
|
529
|
+
"""
|
|
530
|
+
ramp_codons = kwargs.get("ramp_codons", 50)
|
|
531
|
+
dna_seq = self._balanced_translate(protein_seq, **kwargs)
|
|
532
|
+
return self._apply_nterminal_ramp(dna_seq, protein_seq, ramp_codons=ramp_codons)
|
|
533
|
+
|
|
534
|
+
def _apply_nterminal_ramp(
|
|
535
|
+
self, dna_seq: str, protein_seq: str, ramp_codons: int = 50
|
|
536
|
+
) -> str:
|
|
537
|
+
"""
|
|
538
|
+
Apply N-terminal codon ramp for co-translational folding.
|
|
539
|
+
|
|
540
|
+
Replaces the first `ramp_codons` codons with lower-frequency synonymous
|
|
541
|
+
codons (bottom 50% by frequency) to slow the ribosome at the N-terminus.
|
|
542
|
+
Single-codon amino acids (Met, Trp) are left unchanged.
|
|
543
|
+
|
|
544
|
+
Args:
|
|
545
|
+
dna_seq: Full-length DNA sequence.
|
|
546
|
+
protein_seq: Original protein sequence (same length as dna_seq/3).
|
|
547
|
+
ramp_codons: Number of N-terminal codons to deoptimize.
|
|
548
|
+
|
|
549
|
+
Returns:
|
|
550
|
+
DNA sequence with N-terminal ramp applied.
|
|
551
|
+
"""
|
|
552
|
+
codons = [dna_seq[i : i + 3] for i in range(0, len(dna_seq), 3)]
|
|
553
|
+
n_ramp = min(ramp_codons, len(codons), len(protein_seq))
|
|
554
|
+
|
|
555
|
+
for idx in range(n_ramp):
|
|
556
|
+
aa = protein_seq[idx]
|
|
557
|
+
if aa not in self.aa_to_codons:
|
|
558
|
+
continue
|
|
559
|
+
|
|
560
|
+
all_codons = self.aa_to_codons[aa]
|
|
561
|
+
# Skip single-codon amino acids (M, W)
|
|
562
|
+
if len(all_codons) <= 1:
|
|
563
|
+
continue
|
|
564
|
+
|
|
565
|
+
# Select from bottom 50% by frequency (deoptimized)
|
|
566
|
+
midpoint = max(1, len(all_codons) // 2)
|
|
567
|
+
low_freq_codons = all_codons[midpoint:]
|
|
568
|
+
|
|
569
|
+
if not low_freq_codons:
|
|
570
|
+
continue
|
|
571
|
+
|
|
572
|
+
# Weighted random from low-frequency codons
|
|
573
|
+
weights = [freq for _, freq in low_freq_codons]
|
|
574
|
+
chosen = random.choices(
|
|
575
|
+
[c for c, _ in low_freq_codons], weights=weights, k=1
|
|
576
|
+
)[0]
|
|
577
|
+
codons[idx] = chosen
|
|
578
|
+
|
|
579
|
+
return "".join(codons)
|
|
580
|
+
|
|
581
|
+
def _apply_kozak_optimization(self, dna_seq: str, protein_seq: str) -> str:
|
|
582
|
+
"""
|
|
583
|
+
Optimize Kozak context at the 5' end of CDS.
|
|
584
|
+
|
|
585
|
+
Plant (N. benthamiana) optimal Kozak context: AACAATG**GC**...
|
|
586
|
+
The 2nd codon (position 4-6, encoding protein_seq[1]) should ideally
|
|
587
|
+
start with G (good) or GC (best) to match the plant Kozak consensus.
|
|
588
|
+
|
|
589
|
+
Only performs synonymous codon substitution -- the amino acid sequence
|
|
590
|
+
is preserved. If no synonymous codon starting with G exists, the
|
|
591
|
+
sequence is returned unchanged.
|
|
592
|
+
|
|
593
|
+
Args:
|
|
594
|
+
dna_seq: Full-length DNA sequence (must start with ATG).
|
|
595
|
+
protein_seq: Original protein sequence.
|
|
596
|
+
|
|
597
|
+
Returns:
|
|
598
|
+
DNA sequence with Kozak-optimized 2nd codon, or original if
|
|
599
|
+
optimization is not possible.
|
|
600
|
+
"""
|
|
601
|
+
# Need at least 2 codons (ATG + codon2)
|
|
602
|
+
if len(protein_seq) < 2 or len(dna_seq) < 6:
|
|
603
|
+
return dna_seq
|
|
604
|
+
|
|
605
|
+
aa2 = protein_seq[1]
|
|
606
|
+
if aa2 not in self.aa_to_codons:
|
|
607
|
+
return dna_seq
|
|
608
|
+
|
|
609
|
+
current_codon2 = dna_seq[3:6]
|
|
610
|
+
codons_for_aa2 = self.aa_to_codons[aa2]
|
|
611
|
+
|
|
612
|
+
# Already optimal: starts with G
|
|
613
|
+
if current_codon2[0] == "G":
|
|
614
|
+
return dna_seq
|
|
615
|
+
|
|
616
|
+
# Score candidates: prefer GC > G > other
|
|
617
|
+
best_codon = current_codon2
|
|
618
|
+
best_kozak_score = 0 # 0=no G, 1=starts with G, 2=starts with GC
|
|
619
|
+
best_freq = 0.0
|
|
620
|
+
|
|
621
|
+
for codon, freq in codons_for_aa2:
|
|
622
|
+
kozak_score = 0
|
|
623
|
+
if codon[0] == "G":
|
|
624
|
+
kozak_score = 1
|
|
625
|
+
if codon[1] == "C":
|
|
626
|
+
kozak_score = 2
|
|
627
|
+
if kozak_score > best_kozak_score or (
|
|
628
|
+
kozak_score == best_kozak_score and freq > best_freq
|
|
629
|
+
):
|
|
630
|
+
best_kozak_score = kozak_score
|
|
631
|
+
best_codon = codon
|
|
632
|
+
best_freq = freq
|
|
633
|
+
|
|
634
|
+
if best_kozak_score == 0:
|
|
635
|
+
return dna_seq
|
|
636
|
+
|
|
637
|
+
return dna_seq[:3] + best_codon + dna_seq[6:]
|
|
638
|
+
|
|
639
|
+
def _sample_weighted_codon(self, aa: str) -> str:
|
|
640
|
+
"""Sample a codon for one amino acid using precomputed CDF."""
|
|
641
|
+
codons = self._aa_weighted_codons.get(aa)
|
|
642
|
+
cumprob = self._aa_weighted_cumprob.get(aa)
|
|
643
|
+
if not codons or not cumprob:
|
|
644
|
+
raise ValueError(f"Invalid amino acid: {aa}")
|
|
645
|
+
r = random.random()
|
|
646
|
+
idx = bisect_left(cumprob, r)
|
|
647
|
+
if idx >= len(codons):
|
|
648
|
+
idx = len(codons) - 1
|
|
649
|
+
return codons[idx]
|
|
650
|
+
|
|
651
|
+
def generate_candidates(
    self,
    protein_seq: str,
    profile: OptimizationProfile = OptimizationProfile.BALANCED,
    n: int = 5,
    **kwargs: Any,
) -> list[dict[str, Any]]:
    """Generate up to ``n`` scored DNA candidates for a protein.

    Each candidate is produced by ``reverse_translate`` and annotated with
    its CAI, GC% and composite score. Attempts that raise ValueError,
    KeyError or TypeError are dropped, so fewer than ``n`` candidates may be
    returned.

    Args:
        protein_seq: Amino acid sequence
        profile: Optimization profile
        n: Number of candidates to generate
        **kwargs: Profile-specific parameters; passed both to
            ``reverse_translate`` and to the composite scoring function.

    Returns:
        [{"sequence": "ATG...", "cai": 0.87, "gc": 51.2, "score": 0.92}, ...]
        sorted by descending score.

    Raises:
        ValueError: ``n`` is smaller than 1.
        EmptyCandidateError: No attempt produced a candidate.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    recoverable = (ValueError, KeyError, TypeError)

    def _build_candidate() -> dict[str, Any]:
        sequence = self.reverse_translate(protein_seq, profile, **kwargs)
        cai_value = self.calculate_cai(sequence)
        gc_value = self.calculate_gc_content(sequence)
        composite = calculate_composite_score(
            cai=cai_value,
            gc=gc_value,
            sequence=sequence,
            profile=profile.value,
            **kwargs,
        )
        return {"sequence": sequence, "cai": cai_value, "gc": gc_value, "score": composite}

    # Fast path for the dominant API call shape (n=1).
    if n == 1:
        try:
            only = _build_candidate()
        except recoverable as exc:
            reason = (
                "Could not generate a valid candidate in fast path. "
                f"Last error: {exc}"
            )
            raise EmptyCandidateError(protein_seq[:10], reason=reason) from exc
        return [only]

    # NOTE(review): this reseeds the module-level RNG, which overrides any
    # seed set by the caller - confirm this is intended.
    random.seed(secrets.randbits(32))

    candidates: list[dict[str, Any]] = []
    last_error: Exception | None = None

    for attempt_no in range(1, n + 1):
        try:
            candidate = _build_candidate()
        except recoverable as exc:
            # Invalid amino acids, missing codons and type errors are tolerated per attempt.
            logger.debug(f"Candidate generation attempt {attempt_no} failed: {exc}")
            last_error = exc
        else:
            candidates.append(candidate)

    if candidates:
        # Best score first
        candidates.sort(key=lambda item: item["score"], reverse=True)
        return candidates

    reason = (
        f"Could not generate any valid candidates after {n} attempts. "
        "Check codon table and profile settings."
    )
    if last_error is not None:
        reason = f"{reason} Last error: {last_error}"
    raise EmptyCandidateError(protein_seq[:10], reason=reason)
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
# --- Usage example ---
|
|
735
|
+
if __name__ == "__main__":
    # Demo: translate a GFP fragment with several profiles and print the
    # resulting metrics. (json is already imported at module level and is
    # not needed here, so it is not re-imported.)
    translator = ReverseTranslator()

    # Test sequence (partial GFP)
    protein_seq = "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHKVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"

    print("=== Balanced Profile ===")
    balanced = translator.reverse_translate(protein_seq, OptimizationProfile.BALANCED)
    print(f"Length: {len(balanced)} bp")
    print(f"CAI: {translator.calculate_cai(balanced)}")
    print(f"GC%: {translator.calculate_gc_content(balanced)}")
    print(f"Sequence: {balanced[:60]}...")

    print("\n=== High-CAI Profile ===")
    high_cai = translator.reverse_translate(protein_seq, OptimizationProfile.HIGH_CAI)
    print(f"CAI: {translator.calculate_cai(high_cai)}")
    print(f"GC%: {translator.calculate_gc_content(high_cai)}")

    print("\n=== GC-Target Profile ===")
    gc_target = translator.reverse_translate(
        protein_seq, OptimizationProfile.GC_TARGET, target_gc=50.0
    )
    print(f"CAI: {translator.calculate_cai(gc_target)}")
    print(f"GC%: {translator.calculate_gc_content(gc_target)}")

    print("\n=== Top-5 Candidates (Balanced) ===")
    candidates = translator.generate_candidates(protein_seq, OptimizationProfile.BALANCED, n=5)
    for i, cand in enumerate(candidates, 1):
        print(f"{i}. CAI={cand['cai']:.3f}, GC={cand['gc']:.1f}%, Score={cand['score']:.3f}")
|