factorforge-cds 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factorforge/__init__.py +19 -0
- factorforge/__main__.py +8 -0
- factorforge/cli/__init__.py +5 -0
- factorforge/cli/legacy_cli.py +157 -0
- factorforge/cli/main.py +305 -0
- factorforge/core/interfaces/__init__.py +7 -0
- factorforge/core/interfaces/exporter.py +13 -0
- factorforge/core/interfaces/optimizer.py +85 -0
- factorforge/core/interfaces/validator.py +9 -0
- factorforge/database.py +150 -0
- factorforge/engines/__init__.py +60 -0
- factorforge/engines/ml/__init__.py +0 -0
- factorforge/engines/ml/plant_optimizer.py +325 -0
- factorforge/engines/registry.py +141 -0
- factorforge/engines/v1_archived/__init__.py +15 -0
- factorforge/engines/v2/__init__.py +13 -0
- factorforge/engines/v2/codon_table_builder.py +107 -0
- factorforge/engines/v2/construct_builder.py +403 -0
- factorforge/engines/v2/exporter.py +455 -0
- factorforge/engines/v2/optimizer.py +190 -0
- factorforge/engines/v2/pipeline.py +275 -0
- factorforge/engines/v2/rules/__init__.py +3 -0
- factorforge/engines/v2/rules/domesticator.py +403 -0
- factorforge/engines/v2/rules/reverse_translator.py +765 -0
- factorforge/engines/v2/rules/rule_engine.py +867 -0
- factorforge/engines/v2/scoring.py +232 -0
- factorforge/engines/v2/utils.py +231 -0
- factorforge/engines/v2/validator.py +383 -0
- factorforge/engines/v3/__init__.py +12 -0
- factorforge/engines/v3/explain.py +119 -0
- factorforge/engines/v3/inference/__init__.py +6 -0
- factorforge/engines/v3/inference/constrained_decoder.py +80 -0
- factorforge/engines/v3/inference/v2_adapter.py +72 -0
- factorforge/engines/v3/metrics.py +145 -0
- factorforge/engines/v3/modeling_bart_decoder.py +127 -0
- factorforge/engines/v3/pipeline.py +192 -0
- factorforge/engines/v3/synonym_mask.py +61 -0
- factorforge/engines/v3/tokenizer.py +192 -0
- factorforge/ml/__init__.py +33 -0
- factorforge/ml/feasibility.py +199 -0
- factorforge/ml/metrics.py +295 -0
- factorforge/utils/__init__.py +31 -0
- factorforge/utils/construct_id.py +8 -0
- factorforge/utils/exceptions.py +32 -0
- factorforge/utils/sequence_validator.py +189 -0
- factorforge/utils/validation.py +104 -0
- factorforge_cds-3.0.0.dist-info/METADATA +475 -0
- factorforge_cds-3.0.0.dist-info/RECORD +52 -0
- factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
- factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
- factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
- factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Optimization pipeline for FactorForge v2.
|
|
3
|
+
Integrates validation, translation, rule scanning, domestication, and construct building.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
12
|
+
|
|
13
|
+
from factorforge.engines.v2.construct_builder import ConstructBuilder
|
|
14
|
+
from factorforge.engines.v2.rules.domesticator import Domesticator
|
|
15
|
+
from factorforge.engines.v2.rules.reverse_translator import OptimizationProfile, ReverseTranslator
|
|
16
|
+
from factorforge.engines.v2.rules.rule_engine import RuleEngine
|
|
17
|
+
from factorforge.engines.v2.scoring import calculate_composite_score
|
|
18
|
+
from factorforge.engines.v2.validator import InputValidator
|
|
19
|
+
from factorforge.utils.construct_id import generate_construct_id
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from Bio.SeqRecord import SeqRecord
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class PipelineResult:
    """Pipeline output container.

    Attributes:
        sequence: Final nucleotide sequence. ``OptimizationPipeline.run`` stores
            the full construct sequence here when a construct template is
            applied, otherwise the domesticated coding sequence.
        construct: Assembled construct record, or ``None`` when no template
            was used.
        metadata: Run details (profile, metrics, scan results, domestication
            report, ...) keyed by name.
    """

    sequence: str
    construct: "SeqRecord | None" = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def export_features(self) -> dict[str, Any]:
        """Return a schema.md-compatible feature dict.

        Experimental fields (``agro_od600``, ``dpi``, ``purity_pct``,
        ``yield_mg_per_kg``) are left as ``None``; they are entered manually
        after the experiment.

        Returns:
            Flat dict of features derived from ``sequence`` and ``metadata``.
            Missing metadata entries fall back to empty/zero values.
        """
        metrics = self.metadata.get("metrics", {})
        scan = self.metadata.get("scan_results", {})
        dom = self.metadata.get("domestication", {})

        # With a construct template, ``self.sequence`` is the whole construct,
        # so its length would overstate the protein length. The domestication
        # report carries the coding sequence itself; prefer it when present.
        coding_sequence = dom.get("domesticated_seq") or self.sequence

        return {
            "construct_id": self.metadata.get("construct_id", ""),
            "protein_name": "",
            "optimization_profile": self.metadata.get("profile", ""),
            "cai_score": round(metrics.get("cai", 0.0), 4),
            "gc_content_pct": round(metrics.get("gc", 0.0), 2),
            "mfe_kcal_mol": round(metrics.get("mfe", 0.0), 2),
            "polya_signal_count": len(scan.get("polya", [])),
            "domestication_edits": len(dom.get("removed_sites", [])),
            "sequence_length_aa": len(coding_sequence) // 3,
            "agro_od600": None,
            "dpi": None,
            "purity_pct": None,
            "yield_mg_per_kg": None,
        }

    def save(self, filepath: Path, format: str = "fasta") -> None:
        """
        Save the result to a file.

        GenBank output is written only when ``format`` is "genbank"
        (case-insensitive) and a construct record is available. In every
        other case the sequence is written as a single-record FASTA file.

        Args:
            filepath: Output file path.
            format: "fasta" or "genbank".

        Raises:
            ImportError: If Biopython is required but not installed.
        """
        wants_genbank = format.lower() == "genbank"

        if wants_genbank and self.construct is not None:
            try:
                from Bio import SeqIO
            except ImportError as exc:
                raise ImportError("Biopython is required: pip install biopython") from exc

            SeqIO.write(self.construct, str(filepath), "genbank")
            return

        with open(filepath, "w", encoding="utf-8") as handle:
            handle.write(f">optimized\n{self.sequence}\n")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class OptimizationPipeline:
    """End-to-end v2 optimization pipeline.

    Chains input validation, candidate generation (for non-DNA input), PolyA
    fixing, rule scanning, restriction-site domestication and optional
    construct assembly.
    """

    def __init__(
        self,
        profile: str = "balanced",
        construct_template: str | None = None,
        template_dir: Path | None = None,
    ) -> None:
        """
        Args:
            profile: Optimization profile name.
            construct_template: Optional construct template name.
            template_dir: Optional template directory.
        """
        self.profile = profile
        self.construct_template = construct_template
        self.template_dir = template_dir

        self.validator = InputValidator()
        self.translator = ReverseTranslator()
        self.rule_engine = RuleEngine()
        self.domesticator = Domesticator()

        # The builder is created eagerly only when a template was requested;
        # otherwise run() creates it on first use.
        if construct_template:
            if template_dir is None:
                template_dir = self._default_template_dir()
            self.construct_builder: ConstructBuilder | None = ConstructBuilder(template_dir)
        else:
            self.construct_builder = None

    @staticmethod
    def _default_template_dir() -> Path:
        """Return the template directory used when none is configured."""
        return Path(__file__).resolve().parents[4] / "data" / "templates"

    def _fix_polya(self, dna: str) -> str:
        """Return ``dna`` with PolyA signals fixed when the rule engine succeeds.

        The input is returned unchanged when no pattern is present or when the
        iterative fix reports failure.
        """
        # Fast pre-check avoids an expensive full rule scan before PolyA fixing.
        if not any(pattern in dna for pattern in self.rule_engine.POLYA_PATTERNS):
            return dna

        logger.debug("Potential PolyA signal detected; attempting iterative fix")
        polya_fix = self.rule_engine.fix_polya_iterative(dna)
        if polya_fix["success"]:
            logger.info(
                "Fixed %d PolyA violation(s) in %s round(s)",
                len(polya_fix["fixes_applied"]),
                polya_fix["rounds"],
            )
            return polya_fix["modified_seq"]

        logger.warning(
            "Could not fix all PolyA violations. Remaining: %s",
            polya_fix.get("remaining_violations", "?"),
        )
        return dna

    def run(
        self,
        sequence: str,
        profile: str | None = None,
        construct_template: str | None = None,
        **kwargs: Any,
    ) -> PipelineResult:
        """
        Run the optimization pipeline.

        Args:
            sequence: Input protein or DNA sequence.
            profile: Optional profile override.
            construct_template: Optional template override.
            **kwargs: Additional settings. Recognized keys: ``scan_mode``
                (default "full"), ``scan_include``, ``scan_exclude`` and
                ``assembly_standard`` (default "golden_gate").

        Returns:
            PipelineResult with sequence and metadata.

        Raises:
            ValueError: If input sequence is invalid, the profile is unknown,
                or no candidate could be generated.
        """
        logger.info("Starting optimization pipeline with profile: %s", profile or self.profile)

        val_result = self.validator.validate(sequence)
        if not val_result["valid"]:
            logger.error("Input validation failed: %s", val_result["errors"])
            raise ValueError(f"Invalid input sequence: {val_result['errors']}")

        processed = val_result["processed_sequence"]
        seq_type = val_result["type"]
        logger.debug("Detected sequence type: %s", seq_type)
        if seq_type == "fasta":
            # FASTA is only a container; classify the extracted sequence.
            seq_type = self.validator.detect_sequence_type(processed).value

        effective_profile = (profile or self.profile or "balanced").lower()
        try:
            opt_profile = OptimizationProfile(effective_profile)
        except ValueError as exc:
            supported = ", ".join(p.value for p in OptimizationProfile)
            raise ValueError(
                f"Unknown profile: {effective_profile}. Supported profiles: {supported}"
            ) from exc

        if seq_type == "dna":
            # DNA input is scored as given; no reverse translation is done.
            optimized_dna = processed
            cai = self.translator.calculate_cai(optimized_dna)
            gc = self.translator.calculate_gc_content(optimized_dna)
            score = calculate_composite_score(
                cai=cai, gc=gc, sequence=optimized_dna, profile=effective_profile
            )
            candidate_metrics = {"cai": cai, "gc": gc, "score": score}
        else:
            logger.debug("Generating candidates with profile: %s", opt_profile.value)
            candidates = self.translator.generate_candidates(processed, profile=opt_profile, n=1)
            if not candidates:
                logger.error("No candidates generated for input sequence")
                raise ValueError("No candidates generated for input sequence.")
            best = candidates[0]
            optimized_dna = best["sequence"]
            candidate_metrics = {
                "cai": best["cai"],
                "gc": best["gc"],
                "score": best["score"],
            }
            logger.info(
                "Generated optimized sequence: CAI=%.3f, GC=%.1f%%",
                candidate_metrics["cai"],
                candidate_metrics["gc"],
            )

        # NOTE(review): candidate_metrics are computed before PolyA fixing and
        # domestication, and the rule scan below runs before domestication, so
        # both describe the sequence prior to those edits.
        optimized_dna = self._fix_polya(optimized_dna)

        logger.debug("Scanning for final rule violations")
        scan_mode = str(kwargs.get("scan_mode", "full"))
        scan_include = kwargs.get("scan_include")
        scan_exclude = kwargs.get("scan_exclude")
        scan_results = self.rule_engine.scan_all(
            optimized_dna,
            mode=scan_mode,
            include=scan_include,
            exclude=scan_exclude,
        )

        assembly_standard = kwargs.get("assembly_standard", "golden_gate")
        domestication = self.domesticator.domesticate(optimized_dna, standard=assembly_standard)
        domesticated_sequence = domestication.get("domesticated_seq", optimized_dna)
        if not domestication.get("success", True):
            # Surface incomplete domestication instead of ignoring it; the
            # pipeline still continues with the best sequence available.
            logger.warning(
                "Domestication for standard %s did not fully succeed: %s",
                assembly_standard,
                domestication.get("error")
                or f"{len(domestication.get('unfixable', []))} site(s) could not be removed",
            )

        template_name = construct_template or self.construct_template
        if template_name:
            if self.construct_builder is None:
                # Builder is created lazily and reused by later calls.
                template_dir = self.template_dir or self._default_template_dir()
                self.construct_builder = ConstructBuilder(template_dir)
            construct_record = self.construct_builder.generate_construct(
                gene_sequence=domesticated_sequence,
                template_name=template_name,
            )
            final_sequence = str(construct_record.seq)
        else:
            construct_record = None
            final_sequence = domesticated_sequence

        metadata: dict[str, Any] = {
            "construct_id": generate_construct_id(),
            "profile": effective_profile,
            "construct_template": template_name,
            "construct_features": len(construct_record.features) if construct_record else 0,
            "validation": val_result,
            "scan_results": scan_results,
            "domestication": domestication,
            "metrics": candidate_metrics,
            "scan_mode": scan_mode,
        }

        return PipelineResult(
            sequence=final_sequence,
            construct=construct_record,
            metadata=metadata,
        )

    def run_batch(
        self,
        sequences: list[dict[str, str]] | list[str],
        profile: str | None = None,
        construct_template: str | None = None,
        **kwargs: Any,
    ) -> list[PipelineResult]:
        """Run the optimization pipeline for a batch of sequences.

        Args:
            sequences: Plain sequence strings, or dicts with a "sequence" key
                and an optional "id" key.
            profile: Optional profile override applied to every entry.
            construct_template: Optional template override applied to every
                entry.
            **kwargs: Forwarded unchanged to ``run``.

        Returns:
            One PipelineResult per entry, in input order, each with
            ``metadata["input_id"]`` set to the entry id (or "seq<N>",
            1-based, when none is given).

        Raises:
            ValueError: Propagated from ``run`` for the first failing entry.
        """
        results: list[PipelineResult] = []
        for idx, entry in enumerate(sequences, start=1):
            if isinstance(entry, dict):
                seq = entry.get("sequence", "")
                seq_id = entry.get("id", f"seq{idx}")
            else:
                seq = entry
                seq_id = f"seq{idx}"
            result = self.run(
                seq,
                profile=profile,
                construct_template=construct_template,
                **kwargs,
            )
            result.metadata["input_id"] = seq_id
            results.append(result)
        return results
|
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Domesticator for FactorForge v2
|
|
3
|
+
Assembly standard compatibility (P0-4) - Golden Gate/MoClo/BioBricks
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from factorforge.engines.v2.utils import build_aa_to_codons_map, load_codon_table
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Domesticator:
    """
    Remove restriction enzyme sites for assembly compatibility

    Supported assembly systems:
    - Golden Gate (BsaI, BpiI, BsmBI)
    - MoClo (BsaI + overhangs)
    - BioBricks (EcoRI, XbaI, SpeI, PstI)
    """

    # Assembly standard definitions
    ASSEMBLY_STANDARDS: dict[str, dict[str, Any]] = {
        "golden_gate": {
            "enzymes": ["BsaI", "BpiI", "BsmBI"],
            "sites": {
                "BsaI": ["GGTCTC", "GAGACC"],  # Forward and reverse complement
                "BpiI": ["GAAGAC", "GTCTTC"],
                "BsmBI": ["CGTCTC", "GAGACG"],
            },
        },
        "moclo": {
            "enzymes": ["BsaI"],
            "sites": {"BsaI": ["GGTCTC", "GAGACC"]},
            "overhangs": ["AATG", "AGGT", "GCTT", "CGCT"],  # Level 0
        },
        "biobricks": {
            "enzymes": ["EcoRI", "XbaI", "SpeI", "PstI"],
            "sites": {
                "EcoRI": ["GAATTC"],
                "XbaI": ["TCTAGA"],
                "SpeI": ["ACTAGT"],
                "PstI": ["CTGCAG"],
            },
        },
    }

    def __init__(self, codon_table: dict[str, Any] | None = None) -> None:
        """
        Args:
            codon_table: Codon table (loads default if None). The methods of
                this class read ``codon_table["codons"][<codon>]["aa"]``.
        """
        if codon_table is None:
            # Load from default path
            project_root = Path(__file__).resolve().parents[5]
            codon_table = load_codon_table("nbenthamiana", project_root / "data")

        self.codon_table: dict[str, Any] = codon_table
        self.aa_to_codons: dict[str, list[str]] = self._build_aa_to_codons_map()

    def _build_aa_to_codons_map(self) -> dict[str, list[str]]:
        """Build amino-acid-to-codons map"""
        return build_aa_to_codons_map(self.codon_table)

    @staticmethod
    def _site_key(site: dict[str, Any]) -> tuple[str, str, int]:
        """Return a hashable identity (enzyme, motif, position) for a site."""
        return (site["enzyme"], site["site"], site["position"])

    @staticmethod
    def _count_overlapping(text: str, motif: str) -> int:
        """Count occurrences of ``motif`` in ``text``, overlapping ones included."""
        count = 0
        idx = text.find(motif)
        while idx != -1:
            count += 1
            idx = text.find(motif, idx + 1)
        return count

    def scan_restriction_sites(
        self,
        seq: str,
        standard: str = "golden_gate",
    ) -> list[dict[str, Any]]:
        """
        Scan restriction enzyme sites

        Matching is an exact, case-sensitive substring search; overlapping
        occurrences are all reported.

        Args:
            seq: DNA sequence
            standard: Assembly standard ("golden_gate", "moclo", "biobricks")

        Returns:
            List of detected sites, each with "enzyme", "site", "position"
            (0-based) and "context" (up to 10 bases either side). Sites are
            ordered by enzyme, then motif, then position.

        Raises:
            ValueError: Unsupported assembly standard.

        Examples:
            >>> domesticator = Domesticator()
            >>> domesticator.scan_restriction_sites("GGTCTC", "golden_gate")
            [{'enzyme': 'BsaI', ...}]
        """
        if standard not in self.ASSEMBLY_STANDARDS:
            raise ValueError(f"Unknown assembly standard: {standard}")

        assembly_info = self.ASSEMBLY_STANDARDS[standard]
        sites_found: list[dict[str, Any]] = []

        for enzyme, site_seqs in assembly_info["sites"].items():
            for site_seq in site_seqs:
                idx = seq.find(site_seq)
                while idx != -1:
                    sites_found.append(
                        {
                            "enzyme": enzyme,
                            "site": site_seq,
                            "position": idx,
                            "context": seq[
                                max(0, idx - 10) : min(len(seq), idx + len(site_seq) + 10)
                            ],
                        }
                    )
                    # Advance by one base so overlapping matches are found.
                    idx = seq.find(site_seq, idx + 1)

        return sites_found

    def domesticate(
        self,
        seq: str,
        standard: str = "golden_gate",
        max_attempts: int = 100,
    ) -> dict[str, Any]:
        """
        Remove restriction enzyme sites

        Sites are removed one per iteration by a synonymous codon swap. A site
        that cannot be removed is recorded once and skipped afterwards, so the
        remaining sites still get processed.

        Args:
            seq: DNA sequence
            standard: Assembly standard
            max_attempts: Maximum attempts

        Returns:
            {
                "domesticated_seq": "...",
                "removed_sites": [...],
                "unfixable": [...],
                "success": True/False,
                "attempts": <iterations performed>
            }
            When the sequence length is not divisible by 3 the result has
            "success": False, an "error" message, the unchanged sequence and
            empty "removed_sites"/"unfixable" lists.

        Raises:
            ValueError: Unsupported assembly standard.

        Examples:
            >>> domesticator = Domesticator()
            >>> result = domesticator.domesticate("ATGGGTCTCGAG", "golden_gate")
            >>> "domesticated_seq" in result
            True
        """
        if len(seq) % 3 != 0:
            return {
                "success": False,
                "error": "Sequence length not divisible by 3",
                "domesticated_seq": seq,
                # Keep the documented keys present so callers can index them.
                "removed_sites": [],
                "unfixable": [],
            }

        modified_seq = seq
        removed_sites: list[dict[str, Any]] = []
        unfixable: list[dict[str, Any]] = []
        unfixable_keys: set[tuple[str, str, int]] = set()
        attempts = 0

        # Iteratively remove sites
        for attempt in range(max_attempts):
            attempts = attempt + 1
            sites = self.scan_restriction_sites(modified_seq, standard)

            if not sites:
                # All sites removed
                return {
                    "success": True,
                    "domesticated_seq": modified_seq,
                    "removed_sites": removed_sites,
                    "unfixable": [],
                    "attempts": attempt + 1,
                }

            # Pick the first site not already known to be unfixable; retrying
            # a failed site would repeat the same failure on every iteration.
            site = next((s for s in sites if self._site_key(s) not in unfixable_keys), None)
            if site is None:
                # Only unfixable sites remain; further iterations cannot help.
                break

            result = self._remove_site(modified_seq, site)

            if result["success"]:
                modified_seq = result["modified_seq"]
                removed_sites.append(
                    {
                        "enzyme": site["enzyme"],
                        "position": site["position"],
                        "changes": result["changes"],
                    }
                )
            else:
                # Failed to remove site: record once and move on.
                unfixable_keys.add(self._site_key(site))
                unfixable.append(
                    {
                        "enzyme": site["enzyme"],
                        "site": site["site"],
                        "position": site["position"],
                        "reason": result.get("reason", "Unknown"),
                        # Positions refer to the current working sequence.
                        "alternatives": self._suggest_alternatives(modified_seq, site),
                    }
                )

        # Sites may still remain (attempt budget exhausted or unfixable sites)
        remaining_sites = self.scan_restriction_sites(modified_seq, standard)
        remaining_keys = {self._site_key(s) for s in remaining_sites}
        # Drop entries for sites that disappeared as a side effect of a later
        # edit to a neighbouring codon.
        unfixable = [u for u in unfixable if self._site_key(u) in remaining_keys]

        return {
            "success": len(remaining_sites) == 0,
            "domesticated_seq": modified_seq,
            "removed_sites": removed_sites,
            "unfixable": unfixable if unfixable else remaining_sites,
            "attempts": attempts,
        }

    def _remove_site(self, seq: str, site: dict[str, Any]) -> dict[str, Any]:
        """
        Remove a single restriction enzyme site

        Tries each codon overlapping the site, and each synonymous alternative
        for it, in order. The first swap that destroys the occurrence at the
        site position and lowers the number of occurrences of the motif in the
        surrounding window is accepted.

        Args:
            seq: DNA sequence
            site: Site info ("site" motif and 0-based "position" are read)

        Returns:
            {
                "success": True/False,
                "modified_seq": "...",
                "changes": [...]
            }
            plus a "reason" string on failure.
        """
        pos = site["position"]
        site_seq = site["site"]
        site_len = len(site_seq)

        # Compute codon range overlapping the site
        first_codon_idx = (pos // 3) * 3
        last_codon_idx = ((pos + site_len - 1) // 3) * 3

        # Window wide enough to hold every motif occurrence that overlaps a
        # codon touching the site. Comparing counts (rather than requiring the
        # window to be motif-free) keeps a neighbouring copy of the same motif
        # from blocking a valid fix.
        flank = max(10, site_len + 2)
        window_start = max(0, pos - flank)
        window_end = min(len(seq), pos + site_len + flank)
        baseline = self._count_overlapping(seq[window_start:window_end], site_seq)

        # Try synonymous substitutions per codon
        for codon_start in range(first_codon_idx, last_codon_idx + 1, 3):
            if codon_start + 3 > len(seq):
                continue

            original_codon = seq[codon_start : codon_start + 3]

            # Skip codons the table does not know
            if original_codon not in self.codon_table["codons"]:
                continue

            aa = self.codon_table["codons"][original_codon]["aa"]

            # Find synonymous codons
            synonymous_codons = [c for c in self.aa_to_codons.get(aa, []) if c != original_codon]

            # Try each synonymous codon (none available -> nothing to try)
            for alt_codon in synonymous_codons:
                test_seq = seq[:codon_start] + alt_codon + seq[codon_start + 3 :]

                # The targeted occurrence must be gone...
                if test_seq[pos : pos + site_len] == site_seq:
                    continue
                # ...and the swap must not have recreated the motif nearby.
                after = self._count_overlapping(test_seq[window_start:window_end], site_seq)
                if after >= baseline:
                    continue

                return {
                    "success": True,
                    "modified_seq": test_seq,
                    "changes": [
                        {
                            "pos": codon_start,
                            "original": original_codon,
                            "fixed": alt_codon,
                            "aa": aa,
                        }
                    ],
                }

        # Failed to fix
        return {
            "success": False,
            "modified_seq": seq,
            "changes": [],
            "reason": "No synonymous codon available to remove site",
        }

    def _suggest_alternatives(self, seq: str, site: dict[str, Any]) -> list[str]:
        """
        Suggest alternatives for unremovable sites

        Args:
            seq: DNA sequence
            site: Site info ("site" motif and 0-based "position" are read)

        Returns:
            List of alternative suggestions. The two generic suggestions are
            always present; a note about missing synonyms is prepended when
            any codon overlapping the site has no synonymous codon.
        """
        alternatives: list[str] = []

        pos = site["position"]
        site_len = len(site["site"])

        # Determine affected codons
        first_codon_idx = (pos // 3) * 3
        last_codon_idx = ((pos + site_len - 1) // 3) * 3

        affected_codons: list[dict[str, Any]] = []
        for codon_start in range(first_codon_idx, last_codon_idx + 1, 3):
            if codon_start + 3 > len(seq):
                continue
            codon = seq[codon_start : codon_start + 3]
            if codon not in self.codon_table["codons"]:
                continue
            aa = self.codon_table["codons"][codon]["aa"]
            synonymous = self.aa_to_codons.get(aa, [])
            affected_codons.append(
                {
                    "pos": codon_start,
                    "codon": codon,
                    "aa": aa,
                    # Clamp so an amino acid missing from the map counts as
                    # "no synonyms" rather than -1.
                    "synonymous_count": max(len(synonymous) - 1, 0),
                }
            )

        # Suggest alternatives
        if any(c["synonymous_count"] == 0 for c in affected_codons):
            alternatives.append(
                "Includes amino acids without synonyms - requires non-synonymous change"
            )

        alternatives.append("Try shifting the site by adjusting adjacent codons")

        alternatives.append("Consider using a different assembly method")

        return alternatives

    def batch_domesticate(
        self,
        sequences: list[dict[str, Any]],
        standard: str = "golden_gate",
    ) -> list[dict[str, Any]]:
        """
        Batch domestication

        Args:
            sequences: [{"id": "gene1", "sequence": "ATG..."}, ...]
            standard: Assembly standard

        Returns:
            List of results, one per input in order; each is the
            ``domesticate`` result with an added "id" key ("unknown" when the
            input has no id).

        Raises:
            ValueError: Unsupported assembly standard.

        Examples:
            >>> domesticator = Domesticator()
            >>> results = domesticator.batch_domesticate([{"id": "x", "sequence": "ATG"}])
            >>> len(results)
            1
        """
        results: list[dict[str, Any]] = []

        for seq_data in sequences:
            seq_id = seq_data.get("id", "unknown")
            seq = seq_data.get("sequence", "")

            result = self.domesticate(seq, standard)
            result["id"] = seq_id
            results.append(result)

        return results
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
# --- Usage example ---
# Runs only when the module is executed as a script: scans a short demo
# sequence for Golden Gate sites, then domesticates it and prints a report.
if __name__ == "__main__":
    demo = Domesticator()

    # Test sequence (includes BsaI site)
    demo_seq = "ATGGGTCTCGAGGAGCTGTTCACCGGGGTGGTGCCCATC"

    print("=== Original Sequence ===")
    print(f"Sequence: {demo_seq}")

    # Golden Gate scan
    hits = demo.scan_restriction_sites(demo_seq, "golden_gate")
    print(f"\nRestriction sites found: {len(hits)}")
    for hit in hits:
        print(f" - {hit['enzyme']} at position {hit['position']}: {hit['site']}")

    # Domestication
    print("\n=== Domestication ===")
    report = demo.domesticate(demo_seq, "golden_gate")

    if not report["success"]:
        print("❌ Domestication failed")
        print(f"Unfixable sites: {len(report['unfixable'])}")
        for stuck in report["unfixable"]:
            print(
                f" - {stuck.get('enzyme', 'N/A')} at position {stuck.get('position', 'N/A')}"
            )
            print(f" Alternatives: {stuck.get('alternatives', [])}")
    else:
        print("✅ Domestication successful!")
        print(f"Modified sequence: {report['domesticated_seq']}")
        print(f"Removed sites: {len(report['removed_sites'])}")
        for fixed in report["removed_sites"]:
            print(f" - {fixed['enzyme']} at position {fixed['position']}")
|