factorforge-cds 3.1.7__tar.gz → 3.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {factorforge_cds-3.1.7/src/factorforge_cds.egg-info → factorforge_cds-3.1.9}/PKG-INFO +2 -2
  2. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/README.md +1 -1
  3. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/pyproject.toml +2 -1
  4. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/__init__.py +1 -1
  5. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/analysis/feasibility.py +8 -4
  6. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/analysis/metrics.py +61 -8
  7. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/__init__.py +1 -1
  8. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/__init__.py +1 -1
  9. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/exporter.py +25 -2
  10. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/optimizer.py +6 -2
  11. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/pipeline.py +38 -2
  12. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/reverse_translator.py +36 -12
  13. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/rule_engine.py +35 -24
  14. factorforge_cds-3.1.9/src/factorforge/engines/profile/scoring.py +351 -0
  15. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/sequence_validator.py +2 -1
  16. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/validation.py +3 -1
  17. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9/src/factorforge_cds.egg-info}/PKG-INFO +2 -2
  18. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_sequence_validator.py +2 -1
  19. factorforge_cds-3.1.7/src/factorforge/engines/profile/scoring.py +0 -260
  20. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/LICENSE +0 -0
  21. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/setup.cfg +0 -0
  22. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/__main__.py +0 -0
  23. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/analysis/__init__.py +0 -0
  24. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/cli/__init__.py +0 -0
  25. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/cli/legacy_cli.py +0 -0
  26. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/cli/main.py +0 -0
  27. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/__init__.py +0 -0
  28. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/exporter.py +0 -0
  29. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/optimizer.py +0 -0
  30. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/validator.py +0 -0
  31. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/nbenthamiana_codons.json +0 -0
  32. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/nbenthamiana_golden_set.json +0 -0
  33. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/ntabacum_codons.json +0 -0
  34. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/templates/high_expression.json +0 -0
  35. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/templates/standard_expression.json +0 -0
  36. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/wolffia_globosa_codons.json +0 -0
  37. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/database.py +0 -0
  38. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/codon_table_builder.py +0 -0
  39. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/construct_builder.py +0 -0
  40. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/__init__.py +0 -0
  41. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/domesticator.py +0 -0
  42. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/scoring_ml.py +0 -0
  43. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/utils.py +0 -0
  44. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/validator.py +0 -0
  45. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/registry.py +0 -0
  46. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/schemas/__init__.py +0 -0
  47. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/schemas/design_package.py +0 -0
  48. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/schemas/design_package.schema.json +0 -0
  49. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/__init__.py +0 -0
  50. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/construct_id.py +0 -0
  51. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/exceptions.py +0 -0
  52. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/restriction_sites.py +0 -0
  53. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/validation/__init__.py +0 -0
  54. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/validation/cli.py +0 -0
  55. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/validation/package_generator.py +0 -0
  56. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/SOURCES.txt +0 -0
  57. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/dependency_links.txt +0 -0
  58. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/entry_points.txt +0 -0
  59. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/requires.txt +0 -0
  60. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/top_level.txt +0 -0
  61. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_database.py +0 -0
  62. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_legacy_cli.py +0 -0
  63. {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_restriction_sites.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: factorforge-cds
3
- Version: 3.1.7
3
+ Version: 3.1.9
4
4
  Summary: FactorForge - open-source constraint-based CDS design engine by Eijex.
5
5
  Author-email: Eijex <eijex.lab@gmail.com>
6
6
  License-Expression: AGPL-3.0-only
@@ -107,7 +107,7 @@ FactorForge predictions are **in-silico only** and have not been experimentally
107
107
  ## Citing
108
108
 
109
109
  ```
110
- FactorForge v3.1.7 (2026). Open-source constraint-based CDS design engine.
110
+ FactorForge v3.1.9 (2026). Open-source constraint-based CDS design engine.
111
111
  Eijex. https://github.com/eijex/factorforge-cds
112
112
  ```
113
113
 
@@ -76,7 +76,7 @@ FactorForge predictions are **in-silico only** and have not been experimentally
76
76
  ## Citing
77
77
 
78
78
  ```
79
- FactorForge v3.1.7 (2026). Open-source constraint-based CDS design engine.
79
+ FactorForge v3.1.9 (2026). Open-source constraint-based CDS design engine.
80
80
  Eijex. https://github.com/eijex/factorforge-cds
81
81
  ```
82
82
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "factorforge-cds"
7
- version = "3.1.7"
7
+ version = "3.1.9"
8
8
  description = "FactorForge - open-source constraint-based CDS design engine by Eijex."
9
9
  readme = "README.md"
10
10
  license = "AGPL-3.0-only"
@@ -56,6 +56,7 @@ where = ["src"]
56
56
  [tool.pytest.ini_options]
57
57
  testpaths = ["tests"]
58
58
  norecursedirs = ["archive"]
59
+ pythonpath = ["src"]
59
60
 
60
61
  [tool.ruff]
61
62
  line-length = 100
@@ -4,7 +4,7 @@ FactorForge - Codon Optimization Platform
4
4
  profile: constraint-aware rule/profile engine
5
5
  """
6
6
 
7
- __version__ = "3.1.7"
7
+ __version__ = "3.1.9"
8
8
  __author__ = "Eijex"
9
9
 
10
10
  # Auto-register engines (safe when running from source tree)
@@ -88,9 +88,9 @@ def _reconstruct_sequence(
88
88
  def analyze_feasibility(
89
89
  protein_sequence: str,
90
90
  codon_weights: dict[str, float],
91
- target_cai: float = 0.92,
92
- target_gc_low: float = 41.0,
93
- target_gc_high: float = 44.0,
91
+ target_cai: float = 0.82,
92
+ target_gc_low: float = 55.0,
93
+ target_gc_high: float = 65.0,
94
94
  gc_ranges: list[tuple[float, float]] | None = None,
95
95
  ) -> dict[str, Any]:
96
96
  """Compute exact CAI/GC feasibility over synonymous codon choices.
@@ -98,12 +98,16 @@ def analyze_feasibility(
98
98
  The dynamic program keeps the best log-CAI sequence for each reachable
99
99
  global GC count. This is exact for global GC and CAI under the supplied
100
100
  codon weights.
101
+
102
+ Defaults calibrated to nbenthamiana profile engine output distribution
103
+ (analysis 004, n=49): avg CAI=0.76, avg GC=60.1% (range 55-71%).
104
+ target_cai=0.82 aligns with industry practice (>0.8) and is achievable.
101
105
  """
102
106
  protein = "".join(protein_sequence.upper().split()).rstrip("*")
103
107
  if not protein:
104
108
  raise ValueError("protein_sequence must not be empty")
105
109
 
106
- ranges = gc_ranges or [(41.0, 44.0), (40.0, 50.0), (40.0, 55.0)]
110
+ ranges = gc_ranges or [(55.0, 65.0), (50.0, 65.0), (40.0, 65.0)]
107
111
  normalized_ranges = [
108
112
  (_normalize_gc_bound(low), _normalize_gc_bound(high)) for low, high in ranges
109
113
  ]
@@ -11,6 +11,12 @@ from typing import Any
11
11
 
12
12
  from factorforge.engines.profile.utils import get_data_path
13
13
 
14
+ # Homopolymer thresholds — two distinct concerns, intentionally different values.
15
+ # Expression stability: AT-rich runs ≥6 nt can resemble instability elements (ARE).
16
+ # Synthesis/manufacturing: runs ≥8 nt are flagged by gene synthesis vendors as difficult.
17
+ HOMOPOLYMER_EXPRESSION_WARN_NT = 6
18
+ HOMOPOLYMER_SYNTHESIS_WARN_NT = 8
19
+
14
20
 
15
21
  STANDARD_GENETIC_CODE: dict[str, str] = {
16
22
  "TTT": "F",
@@ -271,6 +277,32 @@ def calculate_cai(sequence: str, codon_weights: dict[str, float]) -> float:
271
277
  return math.exp(log_sum / count) if count else 0.0
272
278
 
273
279
 
280
+ def calculate_dinucleotide_score(
281
+ sequence: str,
282
+ cpg_weight: float = 0.0,
283
+ tpa_weight: float = 1.0,
284
+ ) -> float:
285
+ """Score dinucleotide avoidance.
286
+
287
+ Plant default: CpG inactive (cpg_weight=0.0), only TpA is penalized.
288
+ Mammalian opt-in: set cpg_weight=1.0 and tpa_weight=1.0 to penalize both.
289
+ """
290
+ from factorforge.engines.profile.utils import calculate_dinucleotide_ratio
291
+
292
+ if len(sequence) < 6:
293
+ return 1.0
294
+
295
+ total_weight = cpg_weight + tpa_weight
296
+ if total_weight == 0:
297
+ return 1.0
298
+
299
+ cpg_ratio = calculate_dinucleotide_ratio(sequence, "CG")
300
+ tpa_ratio = calculate_dinucleotide_ratio(sequence, "TA")
301
+ cpg_score = max(0.0, 1.0 - cpg_ratio / 2.0)
302
+ tpa_score = max(0.0, 1.0 - tpa_ratio / 2.0)
303
+ return (cpg_weight * cpg_score + tpa_weight * tpa_score) / total_weight
304
+
305
+
274
306
  def codon_usage_profile(sequence: str) -> dict[str, dict[str, float | int | str]]:
275
307
  """Return codon counts and frequencies for a DNA sequence."""
276
308
  codons = _codons(sequence)
@@ -286,8 +318,19 @@ def codon_usage_profile(sequence: str) -> dict[str, dict[str, float | int | str]
286
318
  return profile
287
319
 
288
320
 
289
- def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]:
290
- """Detect runs whose length is greater than or equal to max_run."""
321
+ def detect_homopolymers(
322
+ sequence: str,
323
+ max_run: int = HOMOPOLYMER_EXPRESSION_WARN_NT,
324
+ ) -> list[dict[str, Any]]:
325
+ """Detect homopolymer runs for expression stability evaluation.
326
+
327
+ Uses HOMOPOLYMER_EXPRESSION_WARN_NT (default 6 nt) — AT-rich runs of this
328
+ length can resemble AU-rich instability elements (ARE) and affect mRNA
329
+ stability in plant expression systems.
330
+
331
+ For synthesis/manufacturing risk, see RuleEngine.scan_homopolymers()
332
+ which uses HOMOPOLYMER_SYNTHESIS_WARN_NT (8 nt).
333
+ """
291
334
  if max_run <= 1:
292
335
  raise ValueError("max_run must be > 1")
293
336
 
@@ -303,17 +346,27 @@ def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]
303
346
  continue
304
347
  run_length = index - run_start
305
348
  if run_length >= max_run:
306
- findings.append(
307
- {"start": run_start, "end": index, "base": run_base, "length": run_length}
308
- )
349
+ findings.append({
350
+ "start": run_start,
351
+ "end": index,
352
+ "base": run_base,
353
+ "length": run_length,
354
+ "context": "expression_stability",
355
+ "threshold_nt": max_run,
356
+ })
309
357
  run_base = base
310
358
  run_start = index
311
359
 
312
360
  run_length = len(seq) - run_start
313
361
  if run_length >= max_run:
314
- findings.append(
315
- {"start": run_start, "end": len(seq), "base": run_base, "length": run_length}
316
- )
362
+ findings.append({
363
+ "start": run_start,
364
+ "end": len(seq),
365
+ "base": run_base,
366
+ "length": run_length,
367
+ "context": "expression_stability",
368
+ "threshold_nt": max_run,
369
+ })
317
370
  return findings
318
371
 
319
372
 
@@ -13,7 +13,7 @@ def register_builtin_engines() -> None:
13
13
  "profile",
14
14
  RuleBasedOptimizer,
15
15
  metadata={
16
- "version": "3.1.7",
16
+ "version": "3.1.9",
17
17
  "engine_type": "profile_rule_based",
18
18
  "role": "stable_profile_engine",
19
19
  "stable": True,
@@ -5,7 +5,7 @@ Production system (2026)
5
5
  Plant-specific rule-based optimization
6
6
  """
7
7
 
8
- __version__ = "3.1.7"
8
+ __version__ = "3.1.9"
9
9
 
10
10
  from .optimizer import RuleBasedOptimizer
11
11
  from .pipeline import OptimizationPipeline
@@ -6,8 +6,10 @@ GenBank and FASTA export module (P0-5)
6
6
  from __future__ import annotations
7
7
 
8
8
  import hashlib
9
+ import json
9
10
  from datetime import datetime
10
11
  from io import StringIO
12
+ from pathlib import Path
11
13
  from typing import Any
12
14
 
13
15
 
@@ -25,6 +27,26 @@ class SequenceExporter:
25
27
  """Initialize"""
26
28
  pass
27
29
 
30
+ def host_species(self, metadata: dict[str, Any]) -> str:
31
+ """Resolve host species from feature_registry.json when possible."""
32
+ if metadata.get("organism"):
33
+ return str(metadata["organism"])
34
+
35
+ host = str(
36
+ metadata.get("host_profile") or metadata.get("host") or "nbenthamiana"
37
+ ).strip().lower()
38
+ host_aliases = {"ntabacum": "by2"}
39
+ registry_key = host_aliases.get(host, host)
40
+ registry_path = Path(__file__).resolve().parents[4] / "scripts" / "feature_registry.json"
41
+
42
+ try:
43
+ registry = json.loads(registry_path.read_text(encoding="utf-8"))
44
+ except (OSError, json.JSONDecodeError):
45
+ registry = {}
46
+
47
+ species = registry.get("hosts", {}).get(registry_key, {}).get("species")
48
+ return str(species or "Nicotiana benthamiana")
49
+
28
50
  def generate_run_id(self, sequence: str, params: dict[str, Any]) -> str:
29
51
  """
30
52
  Generate a reproducible run_id
@@ -69,6 +91,7 @@ class SequenceExporter:
69
91
  "run_id": "abc12345",
70
92
  "timestamp": "2026-01-22T12:00:00",
71
93
  "organism": "Nicotiana benthamiana",
94
+ "host_profile": "nbenthamiana",
72
95
  "gene_name": "GFP",
73
96
  "violations_fixed": [...],
74
97
  "warnings": [...]
@@ -99,7 +122,7 @@ class SequenceExporter:
99
122
  run_id = metadata.get("run_id", self.generate_run_id(sequence, metadata))
100
123
  timestamp = metadata.get("timestamp", datetime.now().strftime("%Y%m%d"))
101
124
  gene_name = metadata.get("gene_name", "optimized_gene")
102
- organism = metadata.get("organism", "Nicotiana benthamiana")
125
+ organism = self.host_species(metadata)
103
126
 
104
127
  # Build locus ID
105
128
  locus_id = f"PFORM_{run_id}_{timestamp}"
@@ -355,7 +378,7 @@ class SequenceExporter:
355
378
  "--- Optimization Settings ---",
356
379
  f"Profile: {metadata.get('profile', 'N/A')}",
357
380
  f"Assembly Standard: {metadata.get('assembly_standard', 'None')}",
358
- f"Organism: {metadata.get('organism', 'Nicotiana benthamiana')}",
381
+ f"Organism: {self.host_species(metadata)}",
359
382
  "",
360
383
  ]
361
384
 
@@ -9,7 +9,7 @@ from factorforge.core.interfaces import OptimizationResult, OptimizerEngine
9
9
  from .exporter import SequenceExporter
10
10
  from .rules.reverse_translator import OptimizationProfile, ReverseTranslator
11
11
  from .rules.rule_engine import RuleEngine
12
- from .scoring import calculate_composite_score
12
+ from .scoring import calculate_composite_score, compute_mfe_evidence
13
13
  from .validator import InputValidator
14
14
 
15
15
 
@@ -17,7 +17,7 @@ class RuleBasedOptimizer(OptimizerEngine):
17
17
  """Profile-based rule optimization engine."""
18
18
 
19
19
  name = "Profile-based"
20
- version = "3.1.7"
20
+ version = "3.1.9"
21
21
 
22
22
  def __init__(self) -> None:
23
23
  self.validator = InputValidator()
@@ -117,6 +117,10 @@ class RuleBasedOptimizer(OptimizerEngine):
117
117
  "score": candidates[0]["score"],
118
118
  "violations": sum(len(v) for v in scan_results.values()),
119
119
  }
120
+ # MFE provenance: expose whether MFE was actually computed so downstream
121
+ # artifacts (API response, Design Package) never report an uncomputed
122
+ # MFE as a misleading 0.0 (016 audit). Score value is unchanged.
123
+ metrics.update(compute_mfe_evidence(optimized_dna, profile=profile_value))
120
124
 
121
125
  return OptimizationResult(
122
126
  sequence=optimized_dna,
@@ -18,9 +18,14 @@ from factorforge.engines.profile.rules.reverse_translator import (
18
18
  ReverseTranslator,
19
19
  )
20
20
  from factorforge.engines.profile.rules.rule_engine import RuleEngine
21
- from factorforge.engines.profile.scoring import calculate_composite_score
21
+ from factorforge.engines.profile.scoring import (
22
+ calculate_composite_score,
23
+ compute_mfe_evidence,
24
+ )
22
25
  from factorforge.engines.profile.validator import InputValidator
26
+ from factorforge.analysis.metrics import translate_dna
23
27
  from factorforge.utils.construct_id import generate_construct_id
28
+ from factorforge.utils.sequence_validator import validate_cds_output
24
29
 
25
30
  logger = logging.getLogger(__name__)
26
31
 
@@ -48,7 +53,15 @@ class PipelineResult:
48
53
  "optimization_profile": self.metadata.get("profile", ""),
49
54
  "cai_score": round(metrics.get("cai", 0.0), 4),
50
55
  "gc_content_pct": round(metrics.get("gc", 0.0), 2),
51
- "mfe_kcal_mol": round(metrics.get("mfe", 0.0), 2),
56
+ # MFE provenance (016 audit): None when not computed (e.g. ViennaRNA
57
+ # unavailable) — never report an uncomputed MFE as a misleading 0.0.
58
+ "mfe_kcal_mol": (
59
+ round(metrics["mfe_kcal_mol"], 2)
60
+ if metrics.get("mfe_kcal_mol") is not None
61
+ else None
62
+ ),
63
+ "mfe_status": metrics.get("mfe_status", "not_computed"),
64
+ "mfe_used": metrics.get("mfe_used", False),
52
65
  "polya_signal_count": len(scan.get("polya", [])),
53
66
  "domestication_edits": len(dom.get("removed_sites", [])),
54
67
  "sequence_length_aa": len(self.sequence) // 3,
@@ -175,6 +188,7 @@ class OptimizationPipeline:
175
188
 
176
189
  if seq_type == "dna":
177
190
  optimized_dna = processed
191
+ expected_protein = translate_dna(processed).rstrip("*")
178
192
  cai = translator.calculate_cai(optimized_dna)
179
193
  gc = translator.calculate_gc_content(optimized_dna)
180
194
  score = calculate_composite_score(
@@ -182,6 +196,7 @@ class OptimizationPipeline:
182
196
  )
183
197
  candidate_metrics = {"cai": cai, "gc": gc, "score": score}
184
198
  else:
199
+ expected_protein = processed.rstrip("*")
185
200
  logger.debug(f"Generating candidates with profile: {opt_profile.value}")
186
201
  candidates = translator.generate_candidates(processed, profile=opt_profile, n=1)
187
202
  if not candidates:
@@ -251,7 +266,20 @@ class OptimizationPipeline:
251
266
 
252
267
  assembly_standard = kwargs.get("assembly_standard", "golden_gate")
253
268
  domestication = domesticator.domesticate(optimized_dna, standard=assembly_standard)
269
+ if not domestication.get("success", False):
270
+ unfixable = domestication.get("unfixable", [])
271
+ error = domestication.get("error")
272
+ detail = error or f"unfixable restriction sites: {unfixable}"
273
+ raise ValueError(f"Domestication failed for {assembly_standard}: {detail}")
274
+
254
275
  domesticated_sequence = domestication.get("domesticated_seq", optimized_dna)
276
+ final_validation = validate_cds_output(expected_protein, domesticated_sequence)
277
+ if not final_validation["passed"]:
278
+ raise ValueError(
279
+ "Final CDS validation failed: "
280
+ f"{final_validation['errors']} "
281
+ f"(aa_identity={final_validation['aa_identity']:.4f})"
282
+ )
255
283
 
256
284
  template_name = construct_template or self.construct_template
257
285
  if template_name:
@@ -269,6 +297,13 @@ class OptimizationPipeline:
269
297
  construct_record = None
270
298
  final_sequence = domesticated_sequence
271
299
 
300
+ # MFE provenance for the final output sequence (016 audit): record
301
+ # whether MFE was computed so export_features / Design Package never
302
+ # report an uncomputed MFE as 0.0.
303
+ candidate_metrics.update(
304
+ compute_mfe_evidence(domesticated_sequence, profile=effective_profile)
305
+ )
306
+
272
307
  metadata: dict[str, Any] = {
273
308
  "construct_id": generate_construct_id(),
274
309
  "profile": effective_profile,
@@ -278,6 +313,7 @@ class OptimizationPipeline:
278
313
  "validation": val_result,
279
314
  "scan_results": scan_results,
280
315
  "domestication": domestication,
316
+ "final_validation": final_validation,
281
317
  "metrics": candidate_metrics,
282
318
  "scan_mode": scan_mode,
283
319
  }
@@ -15,7 +15,7 @@ from enum import Enum
15
15
  from pathlib import Path
16
16
  from typing import Any, cast
17
17
 
18
- from factorforge.engines.profile.scoring import calculate_composite_score
18
+ from factorforge.engines.profile.scoring import GC_OPT_MID, calculate_composite_score
19
19
  from factorforge.engines.profile.utils import (
20
20
  build_aa_to_codons_map,
21
21
  calculate_gc,
@@ -440,14 +440,22 @@ class ReverseTranslator:
440
440
  return "".join(dna_seq)
441
441
 
442
442
  def _gc_target_translate(self, protein_seq: str, **kwargs: Any) -> str:
443
- """
444
- GC-Target profile: enforce GC% 42.5% ±2% (N. benthamiana optimal)
443
+ """GC-Target profile: drive global GC toward a configurable target.
444
+
445
+ Targets the caller-supplied ``target_gc`` if provided, otherwise the
446
+ host-profile GC midpoint (GC_OPT_MID = 60% for N. benthamiana). To target
447
+ a lower GC (e.g. for specific vector requirements), pass target_gc explicitly.
445
448
 
446
449
  - GC constraint first
447
450
  - CAI may be sacrificed
448
451
  - Balance local window GC (50 bp)
452
+
453
+ TODO: GC_OPT_MID is currently a single N. benthamiana-calibrated constant.
454
+ When per-host GC profiles are added, source the default from the active host.
449
455
  """
450
- target_gc = kwargs.get("target_gc", 42.5)
456
+ target_gc = kwargs.get("target_gc")
457
+ if target_gc is None:
458
+ target_gc = GC_OPT_MID
451
459
 
452
460
  dna_seq: list[str] = []
453
461
 
@@ -477,11 +485,22 @@ class ReverseTranslator:
477
485
  return "".join(dna_seq)
478
486
 
479
487
  def _assembly_friendly_translate(self, protein_seq: str, **kwargs: Any) -> str:
480
- """
481
- Assembly-Friendly profile: avoid BsaI/BpiI
488
+ """Translate for Golden Gate / MoClo assembly compatibility.
489
+
490
+ Strategy:
491
+ - Starts from balanced codon selection (preferred_ratio=0.6)
492
+ - Retries up to max_attempts times until no BsaI/BpiI Type IIS
493
+ restriction sites remain in the CDS (forward + reverse complement)
494
+ - CAI trade-offs are accepted to achieve site-free sequences
482
495
 
483
- - Golden Gate compatible
484
- - CAI trade-offs allowed
496
+ Current scope:
497
+ - Supported: BsaI/BpiI site avoidance via stochastic retry
498
+ - Not yet implemented: local GC window uniformity scoring,
499
+ repeat-pattern penalties, synthesis vendor constraint profiles
500
+
501
+ Args:
502
+ protein_seq: Amino acid sequence.
503
+ max_attempts: Retry limit for site removal (default: 10).
485
504
  """
486
505
  max_attempts = kwargs.get("max_attempts", 10)
487
506
  if max_attempts < 1:
@@ -529,12 +548,17 @@ class ReverseTranslator:
529
548
  return self._apply_nterminal_ramp(dna_seq, protein_seq, ramp_codons=ramp_codons)
530
549
 
531
550
  def _apply_nterminal_ramp(self, dna_seq: str, protein_seq: str, ramp_codons: int = 50) -> str:
532
- """
533
- Apply N-terminal codon ramp for co-translational folding.
551
+ """Apply N-terminal codon ramp for co-translational folding.
534
552
 
535
553
  Replaces the first `ramp_codons` codons with lower-frequency synonymous
536
- codons (bottom 50% by frequency) to slow the ribosome at the N-terminus.
537
- Single-codon amino acids (Met, Trp) are left unchanged.
554
+ codons (bottom 25% by frequency; cutoff = 3*len//4) to slow the ribosome
555
+ at the N-terminus. Single-codon amino acids (Met, Trp) are left unchanged.
556
+
557
+ TODO: ramp profile is currently not in VALID_PROFILES (not publicly accessible).
558
+ Before re-enabling, revisit ramp_codons=50:
559
+ - Literature suggests 10–30 codons (Tuller et al. 2010, Chu et al. 2014).
560
+ - For short proteins, 50 codons can cover the entire CDS.
561
+ - Consider: ramp_len = min(30, max(10, int(protein_length * 0.15)))
538
562
 
539
563
  Args:
540
564
  dna_seq: Full-length DNA sequence.
@@ -11,6 +11,7 @@ import math
11
11
  import re
12
12
  from typing import Any
13
13
 
14
+ from factorforge.analysis.metrics import HOMOPOLYMER_SYNTHESIS_WARN_NT
14
15
  from factorforge.engines.profile.rules.reverse_translator import ReverseTranslator
15
16
  from factorforge.engines.profile.utils import (
16
17
  build_aa_to_codons_map,
@@ -246,24 +247,27 @@ class RuleEngine:
246
247
 
247
248
  return violations
248
249
 
249
- def scan_homopolymers(self, seq: str, min_length: int = 8) -> list[dict[str, Any]]:
250
- """
251
- Detect 8+ homopolymers (synthesis risk)
250
+ def scan_homopolymers(
251
+ self, seq: str, min_length: int = HOMOPOLYMER_SYNTHESIS_WARN_NT
252
+ ) -> list[dict[str, Any]]:
253
+ """Detect homopolymer runs for synthesis/manufacturing risk evaluation.
252
254
 
253
- Args:
254
- seq: DNA sequence
255
- min_length: Minimum length
255
+ Uses HOMOPOLYMER_SYNTHESIS_WARN_NT (default 8 nt) — the threshold at
256
+ which gene synthesis vendors flag homopolymers as difficult to synthesize
257
+ with high fidelity.
256
258
 
257
- Returns:
258
- List of violations
259
+ For expression stability risk (≥6 nt), see
260
+ factorforge.analysis.metrics.detect_homopolymers() which uses
261
+ HOMOPOLYMER_EXPRESSION_WARN_NT.
259
262
 
260
- Raises:
261
- None.
263
+ Args:
264
+ seq: DNA sequence
265
+ min_length: Minimum run length to flag (default: HOMOPOLYMER_SYNTHESIS_WARN_NT)
262
266
 
263
267
  Examples:
264
268
  >>> engine = RuleEngine()
265
269
  >>> engine.scan_homopolymers("AAAAAAAA", min_length=8)
266
- [{'type': 'homopolymer', ...}]
270
+ [{'type': 'homopolymer', 'context': 'synthesis', ...}]
267
271
  """
268
272
  violations: list[dict[str, Any]] = []
269
273
 
@@ -280,16 +284,16 @@ class RuleEngine:
280
284
  while idx + actual_length < len(seq) and seq[idx + actual_length] == base:
281
285
  actual_length += 1
282
286
 
283
- violations.append(
284
- {
285
- "type": "homopolymer",
286
- "base": base,
287
- "position": idx,
288
- "length": actual_length,
289
- "sequence": base * actual_length,
290
- "severity": "high" if actual_length >= 10 else "medium",
291
- }
292
- )
287
+ violations.append({
288
+ "type": "homopolymer",
289
+ "context": "synthesis",
290
+ "threshold_nt": min_length,
291
+ "base": base,
292
+ "position": idx,
293
+ "length": actual_length,
294
+ "sequence": base * actual_length,
295
+ "severity": "high" if actual_length >= 10 else "medium",
296
+ })
293
297
  pos = idx + actual_length
294
298
 
295
299
  return violations
@@ -350,13 +354,20 @@ class RuleEngine:
350
354
  max_gc: float = 75,
351
355
  ) -> list[dict[str, Any]]:
352
356
  """
353
- Detect extreme GC regions
357
+ Detect extreme GC regions in a sliding local window.
358
+
359
+ This is a LOCAL synthesis/extreme-window guard (default 25-75% over a
360
+ 50 bp window), NOT the global GC target. Global GC is governed separately
361
+ by the scoring band (GC_OPT_MIN/MAX, ~55-65%) and the API/DP gc_min/gc_max
362
+ constraints. The wide 25-75% band intentionally flags only synthesis-hostile
363
+ local windows; narrowing it toward the global optimum would raise false
364
+ positives against the engine's own output distribution (analysis 004: 55-71%).
354
365
 
355
366
  Args:
356
367
  seq: DNA sequence
357
368
  window: Window size (bp)
358
- min_gc: Minimum GC% threshold
359
- max_gc: Maximum GC% threshold
369
+ min_gc: Minimum local GC% threshold (synthesis guard, not global target)
370
+ max_gc: Maximum local GC% threshold (synthesis guard, not global target)
360
371
 
361
372
  Returns:
362
373
  List of violations