factorforge-cds 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factorforge/__init__.py +19 -0
- factorforge/__main__.py +8 -0
- factorforge/cli/__init__.py +5 -0
- factorforge/cli/legacy_cli.py +157 -0
- factorforge/cli/main.py +305 -0
- factorforge/core/interfaces/__init__.py +7 -0
- factorforge/core/interfaces/exporter.py +13 -0
- factorforge/core/interfaces/optimizer.py +85 -0
- factorforge/core/interfaces/validator.py +9 -0
- factorforge/database.py +150 -0
- factorforge/engines/__init__.py +60 -0
- factorforge/engines/ml/__init__.py +0 -0
- factorforge/engines/ml/plant_optimizer.py +325 -0
- factorforge/engines/registry.py +141 -0
- factorforge/engines/v1_archived/__init__.py +15 -0
- factorforge/engines/v2/__init__.py +13 -0
- factorforge/engines/v2/codon_table_builder.py +107 -0
- factorforge/engines/v2/construct_builder.py +403 -0
- factorforge/engines/v2/exporter.py +455 -0
- factorforge/engines/v2/optimizer.py +190 -0
- factorforge/engines/v2/pipeline.py +275 -0
- factorforge/engines/v2/rules/__init__.py +3 -0
- factorforge/engines/v2/rules/domesticator.py +403 -0
- factorforge/engines/v2/rules/reverse_translator.py +765 -0
- factorforge/engines/v2/rules/rule_engine.py +867 -0
- factorforge/engines/v2/scoring.py +232 -0
- factorforge/engines/v2/utils.py +231 -0
- factorforge/engines/v2/validator.py +383 -0
- factorforge/engines/v3/__init__.py +12 -0
- factorforge/engines/v3/explain.py +119 -0
- factorforge/engines/v3/inference/__init__.py +6 -0
- factorforge/engines/v3/inference/constrained_decoder.py +80 -0
- factorforge/engines/v3/inference/v2_adapter.py +72 -0
- factorforge/engines/v3/metrics.py +145 -0
- factorforge/engines/v3/modeling_bart_decoder.py +127 -0
- factorforge/engines/v3/pipeline.py +192 -0
- factorforge/engines/v3/synonym_mask.py +61 -0
- factorforge/engines/v3/tokenizer.py +192 -0
- factorforge/ml/__init__.py +33 -0
- factorforge/ml/feasibility.py +199 -0
- factorforge/ml/metrics.py +295 -0
- factorforge/utils/__init__.py +31 -0
- factorforge/utils/construct_id.py +8 -0
- factorforge/utils/exceptions.py +32 -0
- factorforge/utils/sequence_validator.py +189 -0
- factorforge/utils/validation.py +104 -0
- factorforge_cds-3.0.0.dist-info/METADATA +475 -0
- factorforge_cds-3.0.0.dist-info/RECORD +52 -0
- factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
- factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
- factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
- factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Multidimensional Scoring for FactorForge v2.
|
|
3
|
+
Composite scoring function with optional ViennaRNA MFE integration.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Optimal GC range for N. benthamiana (GC content expressed in percent)
GC_OPT_MIN = 41.0
GC_OPT_MAX = 44.0
GC_OPT_MID = 42.5  # midpoint of GC_OPT_MIN and GC_OPT_MAX

# ViennaRNA availability cache: None until the first import probe has run,
# then True/False for the rest of the process.
_vienna_available: bool | None = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ScoringConfig:
    """Weights and GC target used by the composite optimization score.

    The weights are rescaled on construction so that the active ones sum to
    1.0. When ``use_mfe`` is False the MFE weight is excluded from the sum
    and forced to 0.0.
    """

    w_cai: float = 0.5
    w_gc: float = 0.3
    w_mfe: float = 0.2
    w_dinuc: float = 0.0  # CpG/TpA dinucleotide penalty (opt-in, default off)
    gc_opt: float = GC_OPT_MID
    use_mfe: bool = True

    def __post_init__(self) -> None:
        """Rescale the weights right after the dataclass fields are set."""
        self._normalize()

    def _normalize(self) -> None:
        """Divide every active weight by their sum; no-op if the sum is not positive."""
        # The MFE term only counts towards the total when MFE scoring is on.
        mfe_share = self.w_mfe if self.use_mfe else 0.0
        total = self.w_cai + self.w_gc + mfe_share + self.w_dinuc
        if not total > 0:
            return

        self.w_cai = self.w_cai / total
        self.w_gc = self.w_gc / total
        # With use_mfe off, mfe_share is 0.0 and so is the stored weight.
        self.w_mfe = mfe_share / total
        self.w_dinuc = self.w_dinuc / total
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Pre-defined scoring configs per optimization profile.
# Keys are lowercase profile names; calculate_composite_score() lowercases the
# requested profile and falls back to "balanced" when the name is unknown.
# Each ScoringConfig rescales its active weights in __post_init__.
PROFILE_SCORING_CONFIGS: dict[str, ScoringConfig] = {
    "balanced": ScoringConfig(w_cai=0.5, w_gc=0.3, w_mfe=0.2, gc_opt=GC_OPT_MID),
    "high_cai": ScoringConfig(w_cai=0.8, w_gc=0.1, w_mfe=0.1, gc_opt=GC_OPT_MID),
    "gc_target": ScoringConfig(w_cai=0.1, w_gc=0.7, w_mfe=0.2, gc_opt=GC_OPT_MID),
    # Same weights as "balanced" in this table.
    "assembly_friendly": ScoringConfig(w_cai=0.5, w_gc=0.3, w_mfe=0.2, gc_opt=GC_OPT_MID),
    "ramp": ScoringConfig(w_cai=0.4, w_gc=0.3, w_mfe=0.3, gc_opt=GC_OPT_MID),
    # TRV viral-delivery profile — Li et al. (2026): prioritize MFE and viral-context GC target.
    "viral_delivery": ScoringConfig(w_cai=0.35, w_gc=0.25, w_mfe=0.40, gc_opt=47.5, use_mfe=True),
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _check_vienna_available() -> bool:
    """Report whether the ViennaRNA ``RNA`` module can be imported.

    The import is attempted only on the first call; the outcome is stored in
    the module-level ``_vienna_available`` flag and reused afterwards.

    Returns:
        True if ``import RNA`` succeeded, False otherwise.
    """
    global _vienna_available

    # Fast path: the probe has already run.
    if _vienna_available is not None:
        return _vienna_available

    try:
        import RNA  # noqa: F401
    except ImportError:
        _vienna_available = False
        logger.debug("ViennaRNA not available; MFE scoring disabled")
    else:
        _vienna_available = True
        logger.debug("ViennaRNA Python bindings available")

    return _vienna_available
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def calculate_mfe(sequence: str) -> float | None:
    """
    Fold a sequence with ViennaRNA and return its minimum free energy.

    The input is upper-cased and every ``T`` is replaced by ``U`` before
    folding, so DNA and RNA spellings are both accepted.

    Args:
        sequence: DNA or RNA sequence.

    Returns:
        The energy value returned by ``RNA.fold`` as a float, or None if
        ViennaRNA is not available or the fold call fails.
    """
    if _check_vienna_available():
        try:
            import RNA

            # Convert DNA to RNA (T → U)
            transcript = sequence.upper().replace("T", "U")
            _structure, energy = RNA.fold(transcript)
            return float(energy)
        except Exception as exc:
            # Best effort: a failed fold is logged and reported as "no MFE".
            logger.debug(f"MFE calculation failed: {exc}")

    return None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def normalize_mfe(mfe: float, seq_length: int) -> float:
    """
    Map a minimum free energy onto a 0-1 score (1 = least structured).

    The energy is divided by the sequence length, limited to the window
    [-0.5, 0.0] kcal/mol/nt, and then scaled linearly so that 0.0 maps to
    1.0 and -0.5 maps to 0.0.

    Args:
        mfe: Minimum free energy in kcal/mol.
        seq_length: Sequence length in nucleotides.

    Returns:
        Normalized MFE score in [0, 1]; 0.5 when ``seq_length`` is 0.
    """
    # No length means no per-nucleotide energy; return the neutral midpoint.
    if seq_length == 0:
        return 0.5

    per_nt = mfe / seq_length

    # Two-step clamp: cap at 0.0 first, then floor at -0.5.
    capped = min(0.0, per_nt)
    capped = max(-0.5, capped)

    return capped / 0.5 + 1.0
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def calculate_dinucleotide_score(sequence: str) -> float:
    """Score how well a sequence avoids CpG and TpA (0-1, higher = fewer).

    For each of "CG" and "TA" the observed/expected ratio is turned into a
    component score of ``1 - ratio / 2`` (floored at 0.0); the result is the
    mean of the two components. Sequences shorter than 6 characters score 1.0.

    Args:
        sequence: DNA sequence.

    Returns:
        Dinucleotide avoidance score (0-1).
    """
    from factorforge.engines.v2.utils import calculate_dinucleotide_ratio

    if len(sequence) < 6:
        return 1.0

    # Component score: 1.0 when ratio=0, 0.0 when ratio>=2.0
    component_scores = []
    for motif in ("CG", "TA"):
        ratio = calculate_dinucleotide_ratio(sequence, motif)
        component_scores.append(max(0.0, 1.0 - ratio / 2.0))

    return (component_scores[0] + component_scores[1]) / 2.0
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def calculate_composite_score(
    cai: float,
    gc: float,
    sequence: str | None = None,
    config: ScoringConfig | None = None,
    profile: str | None = None,
    **kwargs: Any,
) -> float:
    """
    Calculate multidimensional composite score.

    S = w1*CAI + w2*(1 - |GC - GC_opt|/50) + w3*MFE_norm + w4*dinuc_score

    Terms that cannot be evaluated (MFE without ViennaRNA or without a
    sequence, dinucleotide score without a sequence) are dropped and the
    remaining weights are re-normalized so they still sum to 1.

    Args:
        cai: Codon Adaptation Index (0-1); values outside are clamped.
        gc: GC content percentage (0-100).
        sequence: DNA sequence for the optional MFE and dinucleotide terms.
        config: Explicit ScoringConfig. Overrides profile.
        profile: Profile name for preset config lookup (case-insensitive).
            Unknown or missing names fall back to "balanced".
        **kwargs: Additional parameters. ``target_gc`` overrides the config's
            GC optimum; ``target_gc=None`` is treated as "no override".

    Returns:
        Composite score rounded to 3 decimals, or 0.0 if all effective
        weights are zero.
    """
    # Resolve config
    if config is None:
        profile_name = (profile or "balanced").lower()
        config = PROFILE_SCORING_CONFIGS.get(profile_name)
        if config is None:
            config = PROFILE_SCORING_CONFIGS["balanced"]

    # Optional GC target override. Callers commonly forward an unset option
    # as target_gc=None; float(None) would raise, so None means "use config".
    target_gc = kwargs.get("target_gc")
    gc_opt = float(config.gc_opt if target_gc is None else target_gc)

    # Component 1: CAI (already 0-1)
    cai_score = max(0.0, min(1.0, cai))

    # Component 2: GC proximity to optimum
    gc_score = max(0.0, 1.0 - abs(gc - gc_opt) / 50.0)

    # Component 3: MFE (optional)
    mfe_score = 0.5  # neutral default
    actual_w_mfe = config.w_mfe

    if config.use_mfe and sequence is not None and _check_vienna_available():
        mfe = calculate_mfe(sequence)
        if mfe is not None:
            mfe_score = normalize_mfe(mfe, len(sequence))
        else:
            actual_w_mfe = 0.0  # fold failed: drop the term
    else:
        actual_w_mfe = 0.0

    # Component 4: Dinucleotide avoidance (opt-in, default weight 0.0)
    dinuc_score = 0.5  # neutral default
    actual_w_dinuc = config.w_dinuc
    if actual_w_dinuc > 0 and sequence is not None:
        dinuc_score = calculate_dinucleotide_score(sequence)
    elif actual_w_dinuc > 0:
        actual_w_dinuc = 0.0  # Cannot compute without sequence

    # Compute weighted score (re-normalize if MFE/dinuc disabled)
    w_total = config.w_cai + config.w_gc + actual_w_mfe + actual_w_dinuc
    if w_total == 0:
        return 0.0

    score = (
        (config.w_cai / w_total) * cai_score
        + (config.w_gc / w_total) * gc_score
        + (actual_w_mfe / w_total) * mfe_score
        + (actual_w_dinuc / w_total) * dinuc_score
    )

    return round(score, 3)
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility helpers for FactorForge v2 engines.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import importlib.resources
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, cast
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_data_path() -> Path:
    """Resolve the directory that holds FactorForge data files.

    A non-empty FACTORFORGE_DATA_DIR environment variable wins. Otherwise the
    ``data`` entry of the installed ``factorforge`` package is used, and if
    that lookup fails the path is derived from this file's location.

    Returns:
        Path to data directory.

    Examples:
        >>> data_dir = get_data_path()
        >>> isinstance(data_dir, Path)
        True
    """
    override = os.environ.get("FACTORFORGE_DATA_DIR")
    if override:
        return Path(override)

    # importlib.resources locates bundled package data for both pip-installed
    # packages and source checkouts.
    try:
        package_root = importlib.resources.files("factorforge")
        bundled = package_root / "data"
        return Path(str(bundled))
    except (TypeError, ModuleNotFoundError):
        # Fallback for development: go up from v2/utils.py to src/factorforge/data
        here = Path(__file__).resolve()
        return here.parents[2] / "data"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def calculate_gc(sequence: str) -> float:
    """Return the share of G and C characters in a sequence, in percent.

    Matching is case-insensitive. The denominator is the full sequence
    length, so any non-ACGT characters still count towards it.

    Args:
        sequence: DNA sequence string.

    Returns:
        GC content as percentage (0-100); 0.0 for an empty sequence.

    Examples:
        >>> calculate_gc("ATGC")
        50.0
    """
    if sequence:
        upper = sequence.upper()
        strong = sum(upper.count(base) for base in "GC")
        return (strong / len(sequence)) * 100

    return 0.0
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def count_dinucleotides(sequence: str, dinucleotide: str = "CG") -> int:
    """Count how often a dinucleotide occurs in a DNA sequence.

    Every adjacent pair of characters is inspected, so overlapping
    occurrences are counted (e.g. "AAA" contains "AA" twice).

    Args:
        sequence: DNA sequence string (case-insensitive).
        dinucleotide: Two-character dinucleotide to count (e.g., "CG", "TA").

    Returns:
        Count of dinucleotide occurrences.

    Examples:
        >>> count_dinucleotides("ACGACG", "CG")
        2
    """
    haystack = sequence.upper()
    target = dinucleotide.upper()

    hits = 0
    # Walk all adjacent character pairs.
    for left, right in zip(haystack, haystack[1:]):
        if left + right == target:
            hits += 1
    return hits
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def calculate_dinucleotide_ratio(sequence: str, dinucleotide: str = "CG") -> float:
    """Calculate observed/expected ratio of a dinucleotide.

    ``observed`` is the number of (overlapping) occurrences of the
    dinucleotide; ``expected`` is ``count(first) * count(second) / length``,
    i.e. the count implied by the mononucleotide composition. Ratio < 1.0
    means suppressed, > 1.0 means enriched.

    Args:
        sequence: DNA sequence string (case-insensitive).
        dinucleotide: Two-character dinucleotide (e.g., "CG", "TA").

    Returns:
        Observed/expected ratio (0.0 if sequence too short or denominator zero).

    Examples:
        >>> calculate_dinucleotide_ratio("CCGG", "CG")
        1.0
    """
    bases = sequence.upper()
    length = len(bases)
    if length < 2:
        return 0.0

    motif = dinucleotide.upper()
    observed = 0
    for left, right in zip(bases, bases[1:]):
        if left + right == motif:
            observed += 1

    first_count = bases.count(motif[0])
    second_count = bases.count(motif[1])

    # length is at least 2 here, so the division is always defined.
    expected = (first_count * second_count) / length
    if expected == 0.0:
        return 0.0

    return observed / expected
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def load_codon_table(organism: str, codon_tables_dir: Path) -> dict[str, Any]:
    """Read the codon usage table for an organism from a JSON file.

    If ``organism`` already ends in ".json" it is used as the file name
    as-is; otherwise "<organism>_codons.json" is looked up.

    Args:
        organism: Organism name (e.g., "human", "ecoli") or a JSON file name.
        codon_tables_dir: Directory containing codon table files.

    Returns:
        Codon table payload parsed from JSON.

    Raises:
        FileNotFoundError: If codon table file not found.
    """
    if organism.endswith(".json"):
        filename = organism
    else:
        filename = f"{organism}_codons.json"

    with open(codon_tables_dir / filename, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    return cast(dict[str, Any], payload)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def build_aa_to_codons_map(codon_table: dict[str, Any]) -> dict[str, list[str]]:
    """Build amino-acid-to-codons map from a codon table payload.

    Reads the "amino_acids" mapping of the payload and, for every entry,
    copies its "codons" list (empty when the key is absent).

    Args:
        codon_table: Codon table payload, e.g. as returned by load_codon_table().

    Returns:
        Mapping of amino acid key to a new list of its codons; an empty dict
        if "amino_acids" is not a dict.
    """
    entries = codon_table.get("amino_acids", {})
    if not isinstance(entries, dict):
        return {}

    mapping = {}
    for residue, details in entries.items():
        mapping[residue] = list(details.get("codons", []))
    return mapping
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def load_golden_set(data_dir: Path | None = None) -> dict[str, Any]:
|
|
147
|
+
"""Load golden set codon table for CAI reference weights.
|
|
148
|
+
|
|
149
|
+
Falls back to the standard codon table if golden set file is not found.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
data_dir: Data directory path. Defaults to get_data_path().
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
Golden set codon table dict.
|
|
156
|
+
"""
|
|
157
|
+
if data_dir is None:
|
|
158
|
+
data_dir = get_data_path()
|
|
159
|
+
golden_path = data_dir / "nbenthamiana_golden_set.json"
|
|
160
|
+
if golden_path.exists():
|
|
161
|
+
with open(golden_path, "r", encoding="utf-8") as f:
|
|
162
|
+
return cast(dict[str, Any], json.load(f))
|
|
163
|
+
# Fallback to standard table
|
|
164
|
+
standard_path = data_dir / "nbenthamiana_codons.json"
|
|
165
|
+
with open(standard_path, "r", encoding="utf-8") as f:
|
|
166
|
+
return cast(dict[str, Any], json.load(f))
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def translate_codon(codon: str, codon_table: dict[str, str]) -> str:
    """Look up the amino acid for a DNA codon.

    The codon is upper-cased before the lookup, so the table is expected to
    use upper-case keys.

    Args:
        codon: 3-letter DNA codon.
        codon_table: Codon to amino acid mapping.

    Returns:
        Single letter amino acid code.

    Raises:
        KeyError: If codon not in table.
    """
    key = codon.upper()
    return codon_table[key]
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def parse_fasta_records(content: str) -> list[tuple[str, str]]:
    """Split FASTA text into (record_id, sequence) tuples.

    Blank lines are ignored. The record id is the first whitespace-separated
    token of the header; a header with no text gets the id "seq<N>", where N
    is the 1-based position of the record. Sequence lines are concatenated
    and upper-cased.

    Args:
        content: FASTA text content.

    Returns:
        List of (record_id, sequence) tuples.

    Raises:
        ValueError: If sequence data precedes the first header, a record has
            no sequence, or no records are present.
    """
    parsed: list[tuple[str, str]] = []
    current_id: str | None = None
    chunks: list[str] = []

    def _close_record() -> None:
        # Finalize the record being accumulated; empty records are rejected.
        residues = "".join(chunks).upper()
        if not residues:
            raise ValueError(f"Empty FASTA record: {current_id}")
        parsed.append((current_id, residues))

    for stripped in map(str.strip, content.splitlines()):
        if not stripped:
            continue

        if not stripped.startswith(">"):
            if current_id is None:
                raise ValueError("Invalid FASTA: sequence data found before header")
            chunks.append(stripped)
            continue

        # New header: flush the previous record before starting the next one.
        if current_id is not None:
            _close_record()

        title = stripped[1:].strip()
        current_id = title.split()[0] if title else f"seq{len(parsed) + 1}"
        chunks = []

    if current_id is not None:
        _close_record()

    if not parsed:
        raise ValueError("No FASTA records found")

    return parsed
|