factorforge-cds 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. factorforge/__init__.py +19 -0
  2. factorforge/__main__.py +8 -0
  3. factorforge/cli/__init__.py +5 -0
  4. factorforge/cli/legacy_cli.py +157 -0
  5. factorforge/cli/main.py +305 -0
  6. factorforge/core/interfaces/__init__.py +7 -0
  7. factorforge/core/interfaces/exporter.py +13 -0
  8. factorforge/core/interfaces/optimizer.py +85 -0
  9. factorforge/core/interfaces/validator.py +9 -0
  10. factorforge/database.py +150 -0
  11. factorforge/engines/__init__.py +60 -0
  12. factorforge/engines/ml/__init__.py +0 -0
  13. factorforge/engines/ml/plant_optimizer.py +325 -0
  14. factorforge/engines/registry.py +141 -0
  15. factorforge/engines/v1_archived/__init__.py +15 -0
  16. factorforge/engines/v2/__init__.py +13 -0
  17. factorforge/engines/v2/codon_table_builder.py +107 -0
  18. factorforge/engines/v2/construct_builder.py +403 -0
  19. factorforge/engines/v2/exporter.py +455 -0
  20. factorforge/engines/v2/optimizer.py +190 -0
  21. factorforge/engines/v2/pipeline.py +275 -0
  22. factorforge/engines/v2/rules/__init__.py +3 -0
  23. factorforge/engines/v2/rules/domesticator.py +403 -0
  24. factorforge/engines/v2/rules/reverse_translator.py +765 -0
  25. factorforge/engines/v2/rules/rule_engine.py +867 -0
  26. factorforge/engines/v2/scoring.py +232 -0
  27. factorforge/engines/v2/utils.py +231 -0
  28. factorforge/engines/v2/validator.py +383 -0
  29. factorforge/engines/v3/__init__.py +12 -0
  30. factorforge/engines/v3/explain.py +119 -0
  31. factorforge/engines/v3/inference/__init__.py +6 -0
  32. factorforge/engines/v3/inference/constrained_decoder.py +80 -0
  33. factorforge/engines/v3/inference/v2_adapter.py +72 -0
  34. factorforge/engines/v3/metrics.py +145 -0
  35. factorforge/engines/v3/modeling_bart_decoder.py +127 -0
  36. factorforge/engines/v3/pipeline.py +192 -0
  37. factorforge/engines/v3/synonym_mask.py +61 -0
  38. factorforge/engines/v3/tokenizer.py +192 -0
  39. factorforge/ml/__init__.py +33 -0
  40. factorforge/ml/feasibility.py +199 -0
  41. factorforge/ml/metrics.py +295 -0
  42. factorforge/utils/__init__.py +31 -0
  43. factorforge/utils/construct_id.py +8 -0
  44. factorforge/utils/exceptions.py +32 -0
  45. factorforge/utils/sequence_validator.py +189 -0
  46. factorforge/utils/validation.py +104 -0
  47. factorforge_cds-3.0.0.dist-info/METADATA +475 -0
  48. factorforge_cds-3.0.0.dist-info/RECORD +52 -0
  49. factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
  50. factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
  51. factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
  52. factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,275 @@
1
+ """
2
+ Optimization pipeline for FactorForge v2.
3
+ Integrates validation, translation, rule scanning, domestication, and construct building.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, Any
12
+
13
+ from factorforge.engines.v2.construct_builder import ConstructBuilder
14
+ from factorforge.engines.v2.rules.domesticator import Domesticator
15
+ from factorforge.engines.v2.rules.reverse_translator import OptimizationProfile, ReverseTranslator
16
+ from factorforge.engines.v2.rules.rule_engine import RuleEngine
17
+ from factorforge.engines.v2.scoring import calculate_composite_score
18
+ from factorforge.engines.v2.validator import InputValidator
19
+ from factorforge.utils.construct_id import generate_construct_id
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ if TYPE_CHECKING:
24
+ from Bio.SeqRecord import SeqRecord
25
+
26
+
27
@dataclass
class PipelineResult:
    """Bundle of everything a single pipeline run produces."""

    # Final nucleotide sequence (the full construct sequence when a template was applied).
    sequence: str
    # Assembled construct record; stays None when no construct template was used.
    construct: "SeqRecord | None" = None
    # Free-form run details (metrics, scan results, domestication report, ...).
    metadata: dict[str, Any] = field(default_factory=dict)

    def export_features(self) -> dict[str, Any]:
        """Return a schema.md-compatible feature dict.

        Values are read from ``metadata`` with neutral defaults when a key is
        missing. The experimental fields (``agro_od600``, ``dpi``,
        ``purity_pct``, ``yield_mg_per_kg``) are always ``None``; ``purity_pct``
        in particular is entered manually after the experiment.

        Returns:
            Mapping of feature name to value.
        """
        meta = self.metadata
        metric_values = meta.get("metrics", {})
        scan_hits = meta.get("scan_results", {})
        domestication_report = meta.get("domestication", {})

        features: dict[str, Any] = {
            "construct_id": meta.get("construct_id", ""),
            "protein_name": "",
            "optimization_profile": meta.get("profile", ""),
        }
        features["cai_score"] = round(metric_values.get("cai", 0.0), 4)
        features["gc_content_pct"] = round(metric_values.get("gc", 0.0), 2)
        features["mfe_kcal_mol"] = round(metric_values.get("mfe", 0.0), 2)
        features["polya_signal_count"] = len(scan_hits.get("polya", []))
        features["domestication_edits"] = len(domestication_report.get("removed_sites", []))
        # Three nucleotides per residue; integer division drops a partial codon.
        features["sequence_length_aa"] = len(self.sequence) // 3
        # Wet-lab measurements are not known at optimization time.
        features.update(
            dict.fromkeys(("agro_od600", "dpi", "purity_pct", "yield_mg_per_kg"))
        )
        return features

    def save(self, filepath: Path, format: str = "fasta") -> None:
        """
        Write the result to disk.

        GenBank output is produced only when ``format`` is "genbank"
        (case-insensitive) and a construct record is available; every other
        combination writes a single-record FASTA file with the header
        ``>optimized``.

        Args:
            filepath: Output file path.
            format: "fasta" or "genbank".

        Raises:
            ImportError: If Biopython is required but not installed.
        """
        wants_genbank = format.lower() == "genbank"

        if wants_genbank and self.construct is not None:
            # Imported lazily so FASTA export works without Biopython.
            try:
                from Bio import SeqIO
            except ImportError as exc:
                raise ImportError("Biopython is required: pip install biopython") from exc

            SeqIO.write(self.construct, str(filepath), "genbank")
            return

        with open(filepath, "w", encoding="utf-8") as out_file:
            out_file.write(f">optimized\n{self.sequence}\n")
81
+
82
+
83
class OptimizationPipeline:
    """End-to-end v2 optimization pipeline.

    A run validates the input, obtains a DNA sequence (as-is for DNA input,
    via the reverse translator otherwise), attempts PolyA repair, scans for
    rule violations, domesticates for the chosen assembly standard and, when a
    template is named, builds a construct.
    """

    def __init__(
        self,
        profile: str = "balanced",
        construct_template: str | None = None,
        template_dir: Path | None = None,
    ) -> None:
        """
        Args:
            profile: Optimization profile name.
            construct_template: Optional construct template name.
            template_dir: Optional template directory.
        """
        self.profile = profile
        self.construct_template = construct_template
        self.template_dir = template_dir

        self.validator = InputValidator()
        self.translator = ReverseTranslator()
        self.rule_engine = RuleEngine()
        self.domesticator = Domesticator()

        # The builder is only created up front when a template was requested;
        # otherwise run() creates it lazily on first use.
        self.construct_builder: ConstructBuilder | None = None
        if construct_template:
            builder_dir = template_dir
            if builder_dir is None:
                builder_dir = self._default_template_dir()
            self.construct_builder = ConstructBuilder(builder_dir)

    @staticmethod
    def _default_template_dir() -> Path:
        """Return the fallback template directory, resolved relative to this file."""
        return Path(__file__).resolve().parents[4] / "data" / "templates"

    def _repair_polya(self, dna: str) -> str:
        """Return ``dna`` after an iterative PolyA fix, or unchanged if none applies."""
        # Fast pre-check avoids an expensive full rule scan before PolyA fixing.
        has_polya_signal = any(pattern in dna for pattern in self.rule_engine.POLYA_PATTERNS)
        if not has_polya_signal:
            return dna

        logger.debug("Potential PolyA signal detected; attempting iterative fix")
        polya_fix = self.rule_engine.fix_polya_iterative(dna)
        if not polya_fix["success"]:
            logger.warning(
                f"Could not fix all PolyA violations. "
                f"Remaining: {polya_fix.get('remaining_violations', '?')}"
            )
            return dna

        logger.info(
            f"Fixed {len(polya_fix['fixes_applied'])} PolyA violation(s) "
            f"in {polya_fix['rounds']} round(s)"
        )
        return polya_fix["modified_seq"]

    def run(
        self,
        sequence: str,
        profile: str | None = None,
        construct_template: str | None = None,
        **kwargs: Any,
    ) -> PipelineResult:
        """
        Run the optimization pipeline on one sequence.

        Args:
            sequence: Input protein or DNA sequence.
            profile: Optional profile override (falls back to the instance
                profile, then "balanced").
            construct_template: Optional template override.
            **kwargs: Additional settings. Recognized keys: ``scan_mode``
                (default "full"), ``scan_include``, ``scan_exclude`` and
                ``assembly_standard`` (default "golden_gate").

        Returns:
            PipelineResult with sequence and metadata.

        Raises:
            ValueError: If the input sequence is invalid, the profile is
                unknown, or no candidate could be generated.
        """
        logger.info(f"Starting optimization pipeline with profile: {profile or self.profile}")

        # --- 1. Input validation ---
        val_result = self.validator.validate(sequence)
        if not val_result["valid"]:
            logger.error(f"Input validation failed: {val_result['errors']}")
            raise ValueError(f"Invalid input sequence: {val_result['errors']}")

        processed = val_result["processed_sequence"]
        seq_type = val_result["type"]
        logger.debug(f"Detected sequence type: {seq_type}")
        if seq_type == "fasta":
            # For FASTA input, classify the processed sequence itself.
            seq_type = self.validator.detect_sequence_type(processed).value

        # --- 2. Profile resolution ---
        effective_profile = (profile or self.profile or "balanced").lower()
        try:
            opt_profile = OptimizationProfile(effective_profile)
        except ValueError as exc:
            supported = ", ".join(member.value for member in OptimizationProfile)
            raise ValueError(
                f"Unknown profile: {effective_profile}. Supported profiles: {supported}"
            ) from exc

        # --- 3. Obtain the DNA sequence and its metrics ---
        # NOTE(review): these metrics describe the sequence before PolyA fixing
        # and domestication; they are not recomputed afterwards.
        if seq_type == "dna":
            # DNA input is passed through and only scored.
            optimized_dna = processed
            cai = self.translator.calculate_cai(optimized_dna)
            gc = self.translator.calculate_gc_content(optimized_dna)
            candidate_metrics = {
                "cai": cai,
                "gc": gc,
                "score": calculate_composite_score(
                    cai=cai, gc=gc, sequence=optimized_dna, profile=effective_profile
                ),
            }
        else:
            logger.debug(f"Generating candidates with profile: {opt_profile.value}")
            candidates = self.translator.generate_candidates(processed, profile=opt_profile, n=1)
            if not candidates:
                logger.error("No candidates generated for input sequence")
                raise ValueError("No candidates generated for input sequence.")
            best = candidates[0]
            optimized_dna = best["sequence"]
            candidate_metrics = {name: best[name] for name in ("cai", "gc", "score")}
        logger.info(
            f"Generated optimized sequence: CAI={candidate_metrics['cai']:.3f}, "
            f"GC={candidate_metrics['gc']:.1f}%"
        )

        # --- 4. PolyA repair ---
        optimized_dna = self._repair_polya(optimized_dna)

        # --- 5. Final rule scan ---
        logger.debug("Scanning for final rule violations")
        scan_mode = str(kwargs.get("scan_mode", "full"))
        scan_results = self.rule_engine.scan_all(
            optimized_dna,
            mode=scan_mode,
            include=kwargs.get("scan_include"),
            exclude=kwargs.get("scan_exclude"),
        )

        # --- 6. Domestication ---
        assembly_standard = kwargs.get("assembly_standard", "golden_gate")
        domestication = self.domesticator.domesticate(optimized_dna, standard=assembly_standard)
        domesticated_sequence = domestication.get("domesticated_seq", optimized_dna)

        # --- 7. Optional construct assembly ---
        template_name = construct_template or self.construct_template
        construct_record = None
        final_sequence = domesticated_sequence
        if template_name:
            if self.construct_builder is None:
                builder_dir = self.template_dir or self._default_template_dir()
                self.construct_builder = ConstructBuilder(builder_dir)
            construct_record = self.construct_builder.generate_construct(
                gene_sequence=domesticated_sequence,
                template_name=template_name,
            )
            final_sequence = str(construct_record.seq)

        metadata: dict[str, Any] = {
            "construct_id": generate_construct_id(),
            "profile": effective_profile,
            "construct_template": template_name,
            "construct_features": len(construct_record.features) if construct_record else 0,
            "validation": val_result,
            "scan_results": scan_results,
            "domestication": domestication,
            "metrics": candidate_metrics,
            "scan_mode": scan_mode,
        }

        return PipelineResult(
            sequence=final_sequence,
            construct=construct_record,
            metadata=metadata,
        )

    def run_batch(
        self,
        sequences: list[dict[str, str]] | list[str],
        profile: str | None = None,
        construct_template: str | None = None,
        **kwargs: Any,
    ) -> list[PipelineResult]:
        """Run the pipeline on each entry and tag every result with its input id.

        Entries may be plain sequence strings or dicts with "sequence" and an
        optional "id"; entries without an id are labelled ``seq<N>`` using
        their 1-based position. Any exception raised by ``run`` propagates.
        """
        results: list[PipelineResult] = []
        for position, item in enumerate(sequences, start=1):
            fallback_id = f"seq{position}"
            if isinstance(item, dict):
                raw_sequence = item.get("sequence", "")
                label = item.get("id", fallback_id)
            else:
                raw_sequence, label = item, fallback_id
            outcome = self.run(
                raw_sequence,
                profile=profile,
                construct_template=construct_template,
                **kwargs,
            )
            outcome.metadata["input_id"] = label
            results.append(outcome)
        return results
@@ -0,0 +1,3 @@
1
+ """Rule-based optimization rules"""
2
+
3
+ # TODO: Add PolyA scanner, domesticator, etc.
@@ -0,0 +1,403 @@
1
+ """
2
+ Domesticator for FactorForge v2
3
+ Assembly standard compatibility (P0-4) - Golden Gate/MoClo/BioBricks
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from factorforge.engines.v2.utils import build_aa_to_codons_map, load_codon_table
12
+
13
+
14
class Domesticator:
    """
    Remove restriction enzyme sites for assembly compatibility

    Sites are removed with synonymous codon substitutions, so the input is
    treated as an in-frame coding sequence starting at position 0.

    Supported assembly systems:
    - Golden Gate (BsaI, BpiI, BsmBI)
    - MoClo (BsaI + overhangs)
    - BioBricks (EcoRI, XbaI, SpeI, PstI)
    """

    # Assembly standard definitions
    ASSEMBLY_STANDARDS: dict[str, dict[str, Any]] = {
        "golden_gate": {
            "enzymes": ["BsaI", "BpiI", "BsmBI"],
            "sites": {
                "BsaI": ["GGTCTC", "GAGACC"],  # Forward and reverse complement
                "BpiI": ["GAAGAC", "GTCTTC"],
                "BsmBI": ["CGTCTC", "GAGACG"],
            },
        },
        "moclo": {
            "enzymes": ["BsaI"],
            "sites": {"BsaI": ["GGTCTC", "GAGACC"]},
            "overhangs": ["AATG", "AGGT", "GCTT", "CGCT"],  # Level 0
        },
        "biobricks": {
            "enzymes": ["EcoRI", "XbaI", "SpeI", "PstI"],
            "sites": {
                "EcoRI": ["GAATTC"],
                "XbaI": ["TCTAGA"],
                "SpeI": ["ACTAGT"],
                "PstI": ["CTGCAG"],
            },
        },
    }

    def __init__(self, codon_table: dict[str, Any] | None = None) -> None:
        """
        Args:
            codon_table: Codon table (loads default if None). The methods of
                this class read it as ``codon_table["codons"][codon]["aa"]``.
        """
        if codon_table is None:
            # Load from default path
            project_root = Path(__file__).resolve().parents[5]
            codon_table = load_codon_table("nbenthamiana", project_root / "data")

        self.codon_table: dict[str, Any] = codon_table
        self.aa_to_codons: dict[str, list[str]] = self._build_aa_to_codons_map()

    def _build_aa_to_codons_map(self) -> dict[str, list[str]]:
        """Build amino-acid-to-codons map"""
        return build_aa_to_codons_map(self.codon_table)

    @staticmethod
    def _site_key(site: dict[str, Any]) -> tuple[str, int]:
        """Return the (motif, position) pair that identifies one site occurrence."""
        return (site["site"], site["position"])

    @staticmethod
    def _count_occurrences(region: str, motif: str) -> int:
        """Count occurrences of ``motif`` in ``region``, overlapping ones included."""
        count = 0
        idx = region.find(motif)
        while idx != -1:
            count += 1
            idx = region.find(motif, idx + 1)
        return count

    def scan_restriction_sites(
        self,
        seq: str,
        standard: str = "golden_gate",
    ) -> list[dict[str, Any]]:
        """
        Scan restriction enzyme sites

        Matching is an exact, case-sensitive substring search. Results are
        grouped by enzyme (in definition order), then by motif, then by
        ascending position; overlapping occurrences are all reported.

        Args:
            seq: DNA sequence
            standard: Assembly standard ("golden_gate", "moclo", "biobricks")

        Returns:
            List of detected sites, each with "enzyme", "site", "position"
            and "context" (the site plus up to 10 nt on either side).

        Raises:
            ValueError: Unsupported assembly standard.

        Examples:
            >>> domesticator = Domesticator()
            >>> domesticator.scan_restriction_sites("GGTCTC", "golden_gate")
            [{'enzyme': 'BsaI', ...}]
        """
        if standard not in self.ASSEMBLY_STANDARDS:
            raise ValueError(f"Unknown assembly standard: {standard}")

        assembly_info = self.ASSEMBLY_STANDARDS[standard]
        sites_found: list[dict[str, Any]] = []

        for enzyme, site_seqs in assembly_info["sites"].items():
            for site_seq in site_seqs:
                idx = seq.find(site_seq)
                while idx != -1:
                    sites_found.append(
                        {
                            "enzyme": enzyme,
                            "site": site_seq,
                            "position": idx,
                            "context": seq[
                                max(0, idx - 10) : min(len(seq), idx + len(site_seq) + 10)
                            ],
                        }
                    )
                    # Advance by one so overlapping occurrences are found too.
                    idx = seq.find(site_seq, idx + 1)

        return sites_found

    def domesticate(
        self,
        seq: str,
        standard: str = "golden_gate",
        max_attempts: int = 100,
    ) -> dict[str, Any]:
        """
        Remove restriction enzyme sites

        Each round rescans the sequence and tries to remove the first site
        that has not already been found unfixable. A site that cannot be
        removed is recorded once and skipped afterwards, so it no longer
        blocks the sites behind it.

        Args:
            seq: DNA sequence
            standard: Assembly standard
            max_attempts: Maximum number of scan/fix rounds

        Returns:
            {
                "domesticated_seq": "...",
                "removed_sites": [...],
                "unfixable": [...],
                "success": True/False,
                "attempts": <rounds performed>
            }
            When the length is not divisible by 3 the sequence is returned
            unchanged with an additional "error" message and no "attempts" key.

        Raises:
            ValueError: Unsupported assembly standard.

        Examples:
            >>> domesticator = Domesticator()
            >>> result = domesticator.domesticate("ATGGGTCTCGAG", "golden_gate")
            >>> "domesticated_seq" in result
            True
        """
        if len(seq) % 3 != 0:
            # Codon-wise substitution needs a whole number of codons.
            # "removed_sites" / "unfixable" are included so callers can read
            # them without special-casing this early exit.
            return {
                "success": False,
                "error": "Sequence length not divisible by 3",
                "domesticated_seq": seq,
                "removed_sites": [],
                "unfixable": [],
            }

        modified_seq = seq
        removed_sites: list[dict[str, Any]] = []
        unfixable: list[dict[str, Any]] = []
        # Substitutions never change the sequence length, so (motif, position)
        # stays a stable identity for a site across rounds.
        unfixable_keys: set[tuple[str, int]] = set()
        attempts_used = 0

        # Iteratively remove sites
        for attempt in range(max_attempts):
            attempts_used = attempt + 1
            sites = self.scan_restriction_sites(modified_seq, standard)

            if not sites:
                # All sites removed
                return {
                    "success": True,
                    "domesticated_seq": modified_seq,
                    "removed_sites": removed_sites,
                    "unfixable": [],
                    "attempts": attempts_used,
                }

            # Skip sites that already failed; retrying them would only burn
            # the attempt budget and duplicate the report.
            site = next(
                (s for s in sites if self._site_key(s) not in unfixable_keys),
                None,
            )
            if site is None:
                # Only known-unfixable sites remain.
                break

            result = self._remove_site(modified_seq, site)

            if result["success"]:
                modified_seq = result["modified_seq"]
                removed_sites.append(
                    {
                        "enzyme": site["enzyme"],
                        "position": site["position"],
                        "changes": result["changes"],
                    }
                )
                continue

            # Record as unfixable and move on to the next site
            unfixable_keys.add(self._site_key(site))
            unfixable.append(
                {
                    "enzyme": site["enzyme"],
                    "site": site["site"],
                    "position": site["position"],
                    "reason": result.get("reason", "Unknown"),
                    # Use the current sequence: earlier fixes may have
                    # changed the codons around this site.
                    "alternatives": self._suggest_alternatives(modified_seq, site),
                }
            )

        # Either the attempt budget ran out or only unfixable sites are left.
        remaining_sites = self.scan_restriction_sites(modified_seq, standard)
        remaining_keys = {self._site_key(s) for s in remaining_sites}

        # Drop entries that a later substitution happened to destroy, and list
        # any sites that were never attempted because the budget ran out.
        still_unfixable = [u for u in unfixable if self._site_key(u) in remaining_keys]
        untried = [s for s in remaining_sites if self._site_key(s) not in unfixable_keys]

        return {
            "success": not remaining_sites,
            "domesticated_seq": modified_seq,
            "removed_sites": removed_sites,
            "unfixable": still_unfixable + untried,
            "attempts": attempts_used,
        }

    def _remove_site(self, seq: str, site: dict[str, Any]) -> dict[str, Any]:
        """
        Remove a single restriction enzyme site

        Every codon overlapping the site is tried in order; for each, the
        synonymous codons are tried in ``aa_to_codons`` order and the first
        substitution that removes the site is returned.

        Args:
            seq: DNA sequence
            site: Site info ("site" and "position" are read)

        Returns:
            {
                "success": True/False,
                "modified_seq": "...",
                "changes": [...]
            }
            plus a "reason" string on failure.
        """
        pos = site["position"]
        site_seq = site["site"]
        site_len = len(site_seq)

        # Neighbourhood in which the substitution must not leave or create a copy.
        window_start = max(0, pos - 10)
        window_end = min(len(seq), pos + site_len + 10)
        baseline_hits = self._count_occurrences(seq[window_start:window_end], site_seq)

        # Compute codon range overlapping the site
        first_codon_idx = (pos // 3) * 3
        last_codon_idx = ((pos + site_len - 1) // 3) * 3

        # Try synonymous substitutions per codon
        for codon_start in range(first_codon_idx, last_codon_idx + 1, 3):
            if codon_start + 3 > len(seq):
                continue

            original_codon = seq[codon_start : codon_start + 3]

            # Skip codons the table does not know (e.g. ambiguous bases)
            if original_codon not in self.codon_table["codons"]:
                continue

            aa = self.codon_table["codons"][original_codon]["aa"]

            for alt_codon in self.aa_to_codons.get(aa, []):
                if alt_codon == original_codon:
                    continue

                # Temporary substitution
                test_seq = seq[:codon_start] + alt_codon + seq[codon_start + 3 :]

                remaining_hits = self._count_occurrences(
                    test_seq[window_start:window_end], site_seq
                )
                target_gone = test_seq[pos : pos + site_len] != site_seq

                # Accept when no copy is left in the window, or when the
                # targeted copy is gone and no new copy appeared. The second
                # case keeps a neighbouring copy of the same motif (handled in
                # its own round) from making this one look unfixable.
                if remaining_hits == 0 or (target_gone and remaining_hits < baseline_hits):
                    return {
                        "success": True,
                        "modified_seq": test_seq,
                        "changes": [
                            {
                                "pos": codon_start,
                                "original": original_codon,
                                "fixed": alt_codon,
                                "aa": aa,
                            }
                        ],
                    }

        # Failed to fix
        return {
            "success": False,
            "modified_seq": seq,
            "changes": [],
            "reason": "No synonymous codon available to remove site",
        }

    def _suggest_alternatives(self, seq: str, site: dict[str, Any]) -> list[str]:
        """
        Suggest alternatives for unremovable sites

        Args:
            seq: DNA sequence
            site: Site info ("site" and "position" are read)

        Returns:
            List of human-readable suggestions. The two generic suggestions
            are always present; a note about amino acids without synonyms is
            prepended when any overlapping codon has no synonymous codon.
        """
        alternatives: list[str] = []

        pos = site["position"]
        site_len = len(site["site"])

        # Determine affected codons
        first_codon_idx = (pos // 3) * 3
        last_codon_idx = ((pos + site_len - 1) // 3) * 3

        affected_codons: list[dict[str, Any]] = []
        for codon_start in range(first_codon_idx, last_codon_idx + 1, 3):
            if codon_start + 3 > len(seq):
                continue
            codon = seq[codon_start : codon_start + 3]
            if codon not in self.codon_table["codons"]:
                continue
            aa = self.codon_table["codons"][codon]["aa"]
            synonymous = self.aa_to_codons.get(aa, [])
            affected_codons.append(
                {
                    "pos": codon_start,
                    "codon": codon,
                    "aa": aa,
                    # Exclude the codon itself from the count.
                    "synonymous_count": len(synonymous) - 1,
                }
            )

        # Suggest alternatives
        if any(c["synonymous_count"] == 0 for c in affected_codons):
            alternatives.append(
                "Includes amino acids without synonyms - requires non-synonymous change"
            )

        alternatives.append("Try shifting the site by adjusting adjacent codons")
        alternatives.append("Consider using a different assembly method")

        return alternatives

    def batch_domesticate(
        self,
        sequences: list[dict[str, Any]],
        standard: str = "golden_gate",
    ) -> list[dict[str, Any]]:
        """
        Batch domestication

        Args:
            sequences: [{"id": "gene1", "sequence": "ATG..."}, ...]; a missing
                "id" becomes "unknown" and a missing "sequence" becomes "".
            standard: Assembly standard

        Returns:
            List of ``domesticate`` results in input order, each with an
            added "id" key.

        Raises:
            ValueError: Unsupported assembly standard.

        Examples:
            >>> domesticator = Domesticator()
            >>> results = domesticator.batch_domesticate([{"id": "x", "sequence": "ATG"}])
            >>> len(results)
            1
        """
        results: list[dict[str, Any]] = []

        for seq_data in sequences:
            result = self.domesticate(seq_data.get("sequence", ""), standard)
            result["id"] = seq_data.get("id", "unknown")
            results.append(result)

        return results
368
+
369
+
370
+ # --- Usage example ---
371
if __name__ == "__main__":
    # Small command-line demo: scan a sequence for Golden Gate sites, then
    # domesticate it and print a report.
    demo_domesticator = Domesticator()

    # Demo input containing a BsaI recognition site (GGTCTC)
    demo_seq = "ATGGGTCTCGAGGAGCTGTTCACCGGGGTGGTGCCCATC"

    print("=== Original Sequence ===")
    print(f"Sequence: {demo_seq}")

    # Golden Gate scan
    found_sites = demo_domesticator.scan_restriction_sites(demo_seq, "golden_gate")
    print(f"\nRestriction sites found: {len(found_sites)}")
    for hit in found_sites:
        print(f" - {hit['enzyme']} at position {hit['position']}: {hit['site']}")

    # Domestication
    print("\n=== Domestication ===")
    outcome = demo_domesticator.domesticate(demo_seq, "golden_gate")

    if not outcome["success"]:
        print("❌ Domestication failed")
        print(f"Unfixable sites: {len(outcome['unfixable'])}")
        for stuck in outcome["unfixable"]:
            # .get() is used because entries may lack some of these keys.
            print(
                f" - {stuck.get('enzyme', 'N/A')} at position {stuck.get('position', 'N/A')}"
            )
            print(f" Alternatives: {stuck.get('alternatives', [])}")
    else:
        print("✅ Domestication successful!")
        print(f"Modified sequence: {outcome['domesticated_seq']}")
        print(f"Removed sites: {len(outcome['removed_sites'])}")
        for fixed in outcome["removed_sites"]:
            print(f" - {fixed['enzyme']} at position {fixed['position']}")