factorforge-cds 3.1.7__tar.gz → 3.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {factorforge_cds-3.1.7/src/factorforge_cds.egg-info → factorforge_cds-3.1.8}/PKG-INFO +2 -2
  2. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/README.md +1 -1
  3. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/pyproject.toml +2 -1
  4. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/__init__.py +1 -1
  5. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/analysis/feasibility.py +8 -4
  6. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/analysis/metrics.py +35 -8
  7. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/__init__.py +1 -1
  8. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/__init__.py +1 -1
  9. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/exporter.py +25 -2
  10. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/optimizer.py +1 -1
  11. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/rules/reverse_translator.py +34 -10
  12. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/rules/rule_engine.py +25 -21
  13. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/scoring.py +77 -27
  14. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/utils/sequence_validator.py +2 -1
  15. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/utils/validation.py +3 -1
  16. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8/src/factorforge_cds.egg-info}/PKG-INFO +2 -2
  17. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/tests/test_sequence_validator.py +2 -1
  18. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/LICENSE +0 -0
  19. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/setup.cfg +0 -0
  20. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/__main__.py +0 -0
  21. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/analysis/__init__.py +0 -0
  22. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/cli/__init__.py +0 -0
  23. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/cli/legacy_cli.py +0 -0
  24. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/cli/main.py +0 -0
  25. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/core/interfaces/__init__.py +0 -0
  26. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/core/interfaces/exporter.py +0 -0
  27. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/core/interfaces/optimizer.py +0 -0
  28. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/core/interfaces/validator.py +0 -0
  29. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/data/nbenthamiana_codons.json +0 -0
  30. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/data/nbenthamiana_golden_set.json +0 -0
  31. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/data/ntabacum_codons.json +0 -0
  32. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/data/templates/high_expression.json +0 -0
  33. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/data/templates/standard_expression.json +0 -0
  34. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/data/wolffia_globosa_codons.json +0 -0
  35. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/database.py +0 -0
  36. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/codon_table_builder.py +0 -0
  37. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/construct_builder.py +0 -0
  38. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/pipeline.py +0 -0
  39. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/rules/__init__.py +0 -0
  40. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/rules/domesticator.py +0 -0
  41. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/scoring_ml.py +0 -0
  42. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/utils.py +0 -0
  43. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/profile/validator.py +0 -0
  44. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/engines/registry.py +0 -0
  45. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/schemas/__init__.py +0 -0
  46. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/schemas/design_package.py +0 -0
  47. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/schemas/design_package.schema.json +0 -0
  48. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/utils/__init__.py +0 -0
  49. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/utils/construct_id.py +0 -0
  50. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/utils/exceptions.py +0 -0
  51. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/utils/restriction_sites.py +0 -0
  52. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/validation/__init__.py +0 -0
  53. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/validation/cli.py +0 -0
  54. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge/validation/package_generator.py +0 -0
  55. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge_cds.egg-info/SOURCES.txt +0 -0
  56. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge_cds.egg-info/dependency_links.txt +0 -0
  57. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge_cds.egg-info/entry_points.txt +0 -0
  58. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge_cds.egg-info/requires.txt +0 -0
  59. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/src/factorforge_cds.egg-info/top_level.txt +0 -0
  60. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/tests/test_database.py +0 -0
  61. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/tests/test_legacy_cli.py +0 -0
  62. {factorforge_cds-3.1.7 → factorforge_cds-3.1.8}/tests/test_restriction_sites.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: factorforge-cds
3
- Version: 3.1.7
3
+ Version: 3.1.8
4
4
  Summary: FactorForge - open-source constraint-based CDS design engine by Eijex.
5
5
  Author-email: Eijex <eijex.lab@gmail.com>
6
6
  License-Expression: AGPL-3.0-only
@@ -107,7 +107,7 @@ FactorForge predictions are **in-silico only** and have not been experimentally
107
107
  ## Citing
108
108
 
109
109
  ```
110
- FactorForge v3.1.7 (2026). Open-source constraint-based CDS design engine.
110
+ FactorForge v3.1.8 (2026). Open-source constraint-based CDS design engine.
111
111
  Eijex. https://github.com/eijex/factorforge-cds
112
112
  ```
113
113
 
@@ -76,7 +76,7 @@ FactorForge predictions are **in-silico only** and have not been experimentally
76
76
  ## Citing
77
77
 
78
78
  ```
79
- FactorForge v3.1.7 (2026). Open-source constraint-based CDS design engine.
79
+ FactorForge v3.1.8 (2026). Open-source constraint-based CDS design engine.
80
80
  Eijex. https://github.com/eijex/factorforge-cds
81
81
  ```
82
82
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "factorforge-cds"
7
- version = "3.1.7"
7
+ version = "3.1.8"
8
8
  description = "FactorForge - open-source constraint-based CDS design engine by Eijex."
9
9
  readme = "README.md"
10
10
  license = "AGPL-3.0-only"
@@ -56,6 +56,7 @@ where = ["src"]
56
56
  [tool.pytest.ini_options]
57
57
  testpaths = ["tests"]
58
58
  norecursedirs = ["archive"]
59
+ pythonpath = ["src"]
59
60
 
60
61
  [tool.ruff]
61
62
  line-length = 100
@@ -4,7 +4,7 @@ FactorForge - Codon Optimization Platform
4
4
  profile: constraint-aware rule/profile engine
5
5
  """
6
6
 
7
- __version__ = "3.1.7"
7
+ __version__ = "3.1.8"
8
8
  __author__ = "Eijex"
9
9
 
10
10
  # Auto-register engines (safe when running from source tree)
@@ -88,9 +88,9 @@ def _reconstruct_sequence(
88
88
  def analyze_feasibility(
89
89
  protein_sequence: str,
90
90
  codon_weights: dict[str, float],
91
- target_cai: float = 0.92,
92
- target_gc_low: float = 41.0,
93
- target_gc_high: float = 44.0,
91
+ target_cai: float = 0.82,
92
+ target_gc_low: float = 55.0,
93
+ target_gc_high: float = 65.0,
94
94
  gc_ranges: list[tuple[float, float]] | None = None,
95
95
  ) -> dict[str, Any]:
96
96
  """Compute exact CAI/GC feasibility over synonymous codon choices.
@@ -98,12 +98,16 @@ def analyze_feasibility(
98
98
  The dynamic program keeps the best log-CAI sequence for each reachable
99
99
  global GC count. This is exact for global GC and CAI under the supplied
100
100
  codon weights.
101
+
102
+ Defaults calibrated to nbenthamiana profile engine output distribution
103
+ (analysis 004, n=49): avg CAI=0.76, avg GC=60.1% (range 55-71%).
104
+ target_cai=0.82 aligns with industry practice (>0.8) and is achievable.
101
105
  """
102
106
  protein = "".join(protein_sequence.upper().split()).rstrip("*")
103
107
  if not protein:
104
108
  raise ValueError("protein_sequence must not be empty")
105
109
 
106
- ranges = gc_ranges or [(41.0, 44.0), (40.0, 50.0), (40.0, 55.0)]
110
+ ranges = gc_ranges or [(55.0, 65.0), (50.0, 65.0), (40.0, 65.0)]
107
111
  normalized_ranges = [
108
112
  (_normalize_gc_bound(low), _normalize_gc_bound(high)) for low, high in ranges
109
113
  ]
@@ -11,6 +11,12 @@ from typing import Any
11
11
 
12
12
  from factorforge.engines.profile.utils import get_data_path
13
13
 
14
+ # Homopolymer thresholds — two distinct concerns, intentionally different values.
15
+ # Expression stability: AT-rich runs ≥6 nt can resemble instability elements (ARE).
16
+ # Synthesis/manufacturing: runs ≥8 nt are flagged by gene synthesis vendors as difficult.
17
+ HOMOPOLYMER_EXPRESSION_WARN_NT = 6
18
+ HOMOPOLYMER_SYNTHESIS_WARN_NT = 8
19
+
14
20
 
15
21
  STANDARD_GENETIC_CODE: dict[str, str] = {
16
22
  "TTT": "F",
@@ -286,8 +292,19 @@ def codon_usage_profile(sequence: str) -> dict[str, dict[str, float | int | str]
286
292
  return profile
287
293
 
288
294
 
289
- def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]:
290
- """Detect runs whose length is greater than or equal to max_run."""
295
+ def detect_homopolymers(
296
+ sequence: str,
297
+ max_run: int = HOMOPOLYMER_EXPRESSION_WARN_NT,
298
+ ) -> list[dict[str, Any]]:
299
+ """Detect homopolymer runs for expression stability evaluation.
300
+
301
+ Uses HOMOPOLYMER_EXPRESSION_WARN_NT (default 6 nt) — AT-rich runs of this
302
+ length can resemble AU-rich instability elements (ARE) and affect mRNA
303
+ stability in plant expression systems.
304
+
305
+ For synthesis/manufacturing risk, see RuleEngine.scan_homopolymers()
306
+ which uses HOMOPOLYMER_SYNTHESIS_WARN_NT (8 nt).
307
+ """
291
308
  if max_run <= 1:
292
309
  raise ValueError("max_run must be > 1")
293
310
 
@@ -303,17 +320,27 @@ def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]
303
320
  continue
304
321
  run_length = index - run_start
305
322
  if run_length >= max_run:
306
- findings.append(
307
- {"start": run_start, "end": index, "base": run_base, "length": run_length}
308
- )
323
+ findings.append({
324
+ "start": run_start,
325
+ "end": index,
326
+ "base": run_base,
327
+ "length": run_length,
328
+ "context": "expression_stability",
329
+ "threshold_nt": max_run,
330
+ })
309
331
  run_base = base
310
332
  run_start = index
311
333
 
312
334
  run_length = len(seq) - run_start
313
335
  if run_length >= max_run:
314
- findings.append(
315
- {"start": run_start, "end": len(seq), "base": run_base, "length": run_length}
316
- )
336
+ findings.append({
337
+ "start": run_start,
338
+ "end": len(seq),
339
+ "base": run_base,
340
+ "length": run_length,
341
+ "context": "expression_stability",
342
+ "threshold_nt": max_run,
343
+ })
317
344
  return findings
318
345
 
319
346
 
@@ -13,7 +13,7 @@ def register_builtin_engines() -> None:
13
13
  "profile",
14
14
  RuleBasedOptimizer,
15
15
  metadata={
16
- "version": "3.1.7",
16
+ "version": "3.1.8",
17
17
  "engine_type": "profile_rule_based",
18
18
  "role": "stable_profile_engine",
19
19
  "stable": True,
@@ -5,7 +5,7 @@ Production system (2026)
5
5
  Plant-specific rule-based optimization
6
6
  """
7
7
 
8
- __version__ = "3.1.7"
8
+ __version__ = "3.1.8"
9
9
 
10
10
  from .optimizer import RuleBasedOptimizer
11
11
  from .pipeline import OptimizationPipeline
@@ -6,8 +6,10 @@ GenBank and FASTA export module (P0-5)
6
6
  from __future__ import annotations
7
7
 
8
8
  import hashlib
9
+ import json
9
10
  from datetime import datetime
10
11
  from io import StringIO
12
+ from pathlib import Path
11
13
  from typing import Any
12
14
 
13
15
 
@@ -25,6 +27,26 @@ class SequenceExporter:
25
27
  """Initialize"""
26
28
  pass
27
29
 
30
+ def host_species(self, metadata: dict[str, Any]) -> str:
31
+ """Return the explicit organism if set, else the host species from feature_registry.json."""
32
+ if metadata.get("organism"):
33
+ return str(metadata["organism"])
34
+
35
+ host = str(
36
+ metadata.get("host_profile") or metadata.get("host") or "nbenthamiana"
37
+ ).strip().lower()
38
+ host_aliases = {"ntabacum": "by2"}
39
+ registry_key = host_aliases.get(host, host)
40
+ registry_path = Path(__file__).resolve().parents[4] / "scripts" / "feature_registry.json"
41
+
42
+ try:
43
+ registry = json.loads(registry_path.read_text(encoding="utf-8"))
44
+ except (OSError, json.JSONDecodeError):
45
+ registry = {}
46
+
47
+ species = registry.get("hosts", {}).get(registry_key, {}).get("species")
48
+ return str(species or "Nicotiana benthamiana")
49
+
28
50
  def generate_run_id(self, sequence: str, params: dict[str, Any]) -> str:
29
51
  """
30
52
  Generate a reproducible run_id
@@ -69,6 +91,7 @@ class SequenceExporter:
69
91
  "run_id": "abc12345",
70
92
  "timestamp": "2026-01-22T12:00:00",
71
93
  "organism": "Nicotiana benthamiana",
94
+ "host_profile": "nbenthamiana",
72
95
  "gene_name": "GFP",
73
96
  "violations_fixed": [...],
74
97
  "warnings": [...]
@@ -99,7 +122,7 @@ class SequenceExporter:
99
122
  run_id = metadata.get("run_id", self.generate_run_id(sequence, metadata))
100
123
  timestamp = metadata.get("timestamp", datetime.now().strftime("%Y%m%d"))
101
124
  gene_name = metadata.get("gene_name", "optimized_gene")
102
- organism = metadata.get("organism", "Nicotiana benthamiana")
125
+ organism = self.host_species(metadata)
103
126
 
104
127
  # Build locus ID
105
128
  locus_id = f"PFORM_{run_id}_{timestamp}"
@@ -355,7 +378,7 @@ class SequenceExporter:
355
378
  "--- Optimization Settings ---",
356
379
  f"Profile: {metadata.get('profile', 'N/A')}",
357
380
  f"Assembly Standard: {metadata.get('assembly_standard', 'None')}",
358
- f"Organism: {metadata.get('organism', 'Nicotiana benthamiana')}",
381
+ f"Organism: {self.host_species(metadata)}",
359
382
  "",
360
383
  ]
361
384
 
@@ -17,7 +17,7 @@ class RuleBasedOptimizer(OptimizerEngine):
17
17
  """Profile-based rule optimization engine."""
18
18
 
19
19
  name = "Profile-based"
20
- version = "3.1.7"
20
+ version = "3.1.8"
21
21
 
22
22
  def __init__(self) -> None:
23
23
  self.validator = InputValidator()
@@ -15,7 +15,7 @@ from enum import Enum
15
15
  from pathlib import Path
16
16
  from typing import Any, cast
17
17
 
18
- from factorforge.engines.profile.scoring import calculate_composite_score
18
+ from factorforge.engines.profile.scoring import GC_OPT_MID, calculate_composite_score
19
19
  from factorforge.engines.profile.utils import (
20
20
  build_aa_to_codons_map,
21
21
  calculate_gc,
@@ -440,14 +440,22 @@ class ReverseTranslator:
440
440
  return "".join(dna_seq)
441
441
 
442
442
  def _gc_target_translate(self, protein_seq: str, **kwargs: Any) -> str:
443
- """
444
- GC-Target profile: enforce GC% 42.5% ±2% (N. benthamiana optimal)
443
+ """GC-Target profile: drive global GC toward a configurable target.
444
+
445
+ Targets the caller-supplied ``target_gc`` if provided, otherwise the
446
+ host-profile GC midpoint (GC_OPT_MID = 60% for N. benthamiana). To target
447
+ a lower GC (e.g. for specific vector requirements), pass target_gc explicitly.
445
448
 
446
449
  - GC constraint first
447
450
  - CAI may be sacrificed
448
451
  - Balance local window GC (50 bp)
452
+
453
+ TODO: GC_OPT_MID is currently a single N. benthamiana-calibrated constant.
454
+ When per-host GC profiles are added, source the default from the active host.
449
455
  """
450
- target_gc = kwargs.get("target_gc", 42.5)
456
+ target_gc = kwargs.get("target_gc")
457
+ if target_gc is None:
458
+ target_gc = GC_OPT_MID
451
459
 
452
460
  dna_seq: list[str] = []
453
461
 
@@ -477,11 +485,22 @@ class ReverseTranslator:
477
485
  return "".join(dna_seq)
478
486
 
479
487
  def _assembly_friendly_translate(self, protein_seq: str, **kwargs: Any) -> str:
480
- """
481
- Assembly-Friendly profile: avoid BsaI/BpiI
488
+ """Translate for Golden Gate / MoClo assembly compatibility.
489
+
490
+ Strategy:
491
+ - Starts from balanced codon selection (preferred_ratio=0.6)
492
+ - Retries up to max_attempts times until no BsaI/BpiI Type IIS
493
+ restriction sites remain in the CDS (forward + reverse complement)
494
+ - CAI trade-offs are accepted to achieve site-free sequences
482
495
 
483
- - Golden Gate compatible
484
- - CAI trade-offs allowed
496
+ Current scope:
497
+ - Supported: BsaI/BpiI site avoidance via stochastic retry
498
+ - Not yet implemented: local GC window uniformity scoring,
499
+ repeat-pattern penalties, synthesis vendor constraint profiles
500
+
501
+ Args:
502
+ protein_seq: Amino acid sequence.
503
+ max_attempts: Retry limit for site removal (default: 10).
485
504
  """
486
505
  max_attempts = kwargs.get("max_attempts", 10)
487
506
  if max_attempts < 1:
@@ -529,13 +548,18 @@ class ReverseTranslator:
529
548
  return self._apply_nterminal_ramp(dna_seq, protein_seq, ramp_codons=ramp_codons)
530
549
 
531
550
  def _apply_nterminal_ramp(self, dna_seq: str, protein_seq: str, ramp_codons: int = 50) -> str:
532
- """
533
- Apply N-terminal codon ramp for co-translational folding.
551
+ """Apply N-terminal codon ramp for co-translational folding.
534
552
 
535
553
  Replaces the first `ramp_codons` codons with lower-frequency synonymous
536
554
  codons (bottom 50% by frequency) to slow the ribosome at the N-terminus.
537
555
  Single-codon amino acids (Met, Trp) are left unchanged.
538
556
 
557
+ TODO: ramp profile is currently not in VALID_PROFILES (not publicly accessible).
558
+ Before re-enabling, revisit ramp_codons=50:
559
+ - Literature suggests 10–30 codons (Tuller et al. 2010, Chu et al. 2014).
560
+ - For short proteins, 50 codons can cover the entire CDS.
561
+ - Consider: ramp_len = min(30, max(10, int(protein_length * 0.15)))
562
+
539
563
  Args:
540
564
  dna_seq: Full-length DNA sequence.
541
565
  protein_seq: Original protein sequence (same length as dna_seq/3).
@@ -11,6 +11,7 @@ import math
11
11
  import re
12
12
  from typing import Any
13
13
 
14
+ from factorforge.analysis.metrics import HOMOPOLYMER_SYNTHESIS_WARN_NT
14
15
  from factorforge.engines.profile.rules.reverse_translator import ReverseTranslator
15
16
  from factorforge.engines.profile.utils import (
16
17
  build_aa_to_codons_map,
@@ -246,24 +247,27 @@ class RuleEngine:
246
247
 
247
248
  return violations
248
249
 
249
- def scan_homopolymers(self, seq: str, min_length: int = 8) -> list[dict[str, Any]]:
250
- """
251
- Detect 8+ homopolymers (synthesis risk)
250
+ def scan_homopolymers(
251
+ self, seq: str, min_length: int = HOMOPOLYMER_SYNTHESIS_WARN_NT
252
+ ) -> list[dict[str, Any]]:
253
+ """Detect homopolymer runs for synthesis/manufacturing risk evaluation.
252
254
 
253
- Args:
254
- seq: DNA sequence
255
- min_length: Minimum length
255
+ Uses HOMOPOLYMER_SYNTHESIS_WARN_NT (default 8 nt) — the threshold at
256
+ which gene synthesis vendors flag homopolymers as difficult to synthesize
257
+ with high fidelity.
256
258
 
257
- Returns:
258
- List of violations
259
+ For expression stability risk (≥6 nt), see
260
+ factorforge.analysis.metrics.detect_homopolymers() which uses
261
+ HOMOPOLYMER_EXPRESSION_WARN_NT.
259
262
 
260
- Raises:
261
- None.
263
+ Args:
264
+ seq: DNA sequence
265
+ min_length: Minimum run length to flag (default: HOMOPOLYMER_SYNTHESIS_WARN_NT)
262
266
 
263
267
  Examples:
264
268
  >>> engine = RuleEngine()
265
269
  >>> engine.scan_homopolymers("AAAAAAAA", min_length=8)
266
- [{'type': 'homopolymer', ...}]
270
+ [{'type': 'homopolymer', 'context': 'synthesis', ...}]
267
271
  """
268
272
  violations: list[dict[str, Any]] = []
269
273
 
@@ -280,16 +284,16 @@ class RuleEngine:
280
284
  while idx + actual_length < len(seq) and seq[idx + actual_length] == base:
281
285
  actual_length += 1
282
286
 
283
- violations.append(
284
- {
285
- "type": "homopolymer",
286
- "base": base,
287
- "position": idx,
288
- "length": actual_length,
289
- "sequence": base * actual_length,
290
- "severity": "high" if actual_length >= 10 else "medium",
291
- }
292
- )
287
+ violations.append({
288
+ "type": "homopolymer",
289
+ "context": "synthesis",
290
+ "threshold_nt": min_length,
291
+ "base": base,
292
+ "position": idx,
293
+ "length": actual_length,
294
+ "sequence": base * actual_length,
295
+ "severity": "high" if actual_length >= 10 else "medium",
296
+ })
293
297
  pos = idx + actual_length
294
298
 
295
299
  return violations
@@ -11,13 +11,16 @@ from typing import Any
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
- # Optimal GC range for N. benthamiana codon-optimized sequences.
14
+ # GC band for N. benthamiana codon-optimized sequences.
15
15
  # Benchmark (analysis 004, n=49): balanced profile output average GC% = 60.1%
16
16
  # (range 55-71%). The genome-wide average (~42%) reflects all genes, not the
17
17
  # high-expression codon table which exhibits 3rd-position GC bias.
18
+ # These constants define the acceptable band — sequences within [GC_OPT_MIN, GC_OPT_MAX]
19
+ # receive full GC score; outside the band the score decays linearly.
18
20
  GC_OPT_MIN = 55.0
19
21
  GC_OPT_MAX = 65.0
20
- GC_OPT_MID = 60.0
22
+ GC_OPT_MID = 60.0 # kept for gc_target point-scoring and viral_delivery centering
23
+ GC_DECAY_WIDTH = 20.0 # percentage points outside band before score reaches 0.0
21
24
 
22
25
  # ViennaRNA availability cache
23
26
  _vienna_available: bool | None = None
@@ -32,7 +35,11 @@ class ScoringConfig:
32
35
  w_mfe: float = 0.2
33
36
  w_dinuc: float = 0.0 # CpG/TpA dinucleotide penalty (opt-in, default off)
34
37
  w_syncodonlm: float = 0.0 # SynCodonLM quality score (opt-in, default off)
35
- gc_opt: float = GC_OPT_MID
38
+ gc_opt: float = GC_OPT_MID # no longer used by calculate_composite_score (superseded by
39
+ # gc_min/gc_max band); retained for external API compatibility
40
+ gc_min: float = GC_OPT_MIN # acceptable band lower boundary
41
+ gc_max: float = GC_OPT_MAX # acceptable band upper boundary
42
+ gc_decay_width: float = GC_DECAY_WIDTH # % points outside band before score → 0
36
43
  use_mfe: bool = True
37
44
 
38
45
  def __post_init__(self) -> None:
@@ -61,22 +68,28 @@ class ScoringConfig:
61
68
 
62
69
  # Pre-defined scoring configs per optimization profile
63
70
  PROFILE_SCORING_CONFIGS: dict[str, ScoringConfig] = {
64
- "balanced": ScoringConfig(w_cai=0.5, w_gc=0.3, w_mfe=0.2, gc_opt=GC_OPT_MID),
65
- "high_cai": ScoringConfig(w_cai=0.8, w_gc=0.1, w_mfe=0.1, gc_opt=GC_OPT_MID),
66
- "gc_target": ScoringConfig(w_cai=0.1, w_gc=0.7, w_mfe=0.2, gc_opt=GC_OPT_MID),
67
- "assembly_friendly": ScoringConfig(w_cai=0.5, w_gc=0.3, w_mfe=0.2, gc_opt=GC_OPT_MID),
68
- "ramp": ScoringConfig(w_cai=0.4, w_gc=0.3, w_mfe=0.3, gc_opt=GC_OPT_MID),
69
- # TRV viral-delivery profile GC target ~47.5% based on TRV genome composition.
70
- # MFE weighted at 0.30 (Peccoud et al. 2024, PMC11718241: secondary structure shows
71
- # weak univariate correlation with yield in tobacco viral expression).
72
- "viral_delivery": ScoringConfig(w_cai=0.35, w_gc=0.35, w_mfe=0.30, gc_opt=47.5, use_mfe=True),
73
- "ml_enhanced": ScoringConfig(
74
- w_cai=0.35,
75
- w_gc=0.25,
76
- w_mfe=0.15,
77
- w_syncodonlm=0.25,
78
- gc_opt=GC_OPT_MID,
71
+ "balanced": ScoringConfig(w_cai=0.5, w_gc=0.3, w_mfe=0.2),
72
+ # high_cai: CAI 1.0 mimics naturally high-expression N. benthamiana genes (golden_set).
73
+ # For very long or structurally complex proteins under extreme agroinfiltration overexpression,
74
+ # consider the ramp profile instead (N-terminal codon deoptimization reduces ribosome stalling).
75
+ "high_cai": ScoringConfig(w_cai=0.8, w_gc=0.1, w_mfe=0.1),
76
+ # gc_target: gc_min/gc_max are overridden dynamically from target_gc kwarg in
77
+ # calculate_composite_score (±5 pp band); the config band applies only when target_gc is omitted.
78
+ "gc_target": ScoringConfig(w_cai=0.1, w_gc=0.7, w_mfe=0.2),
79
+ # assembly_friendly: CAI pressure reduced vs balanced; GC/MFE weights raised to
80
+ # reflect Type IIS restriction-site avoidance priority (Golden Gate / MoClo).
81
+ # w_gc scores GC band compliance (global GC%), NOT local GC uniformity.
82
+ # Window-level GC variance and repeat-pattern penalties are not yet implemented.
83
+ "assembly_friendly": ScoringConfig(w_cai=0.3, w_gc=0.4, w_mfe=0.3),
84
+ "ramp": ScoringConfig(w_cai=0.4, w_gc=0.3, w_mfe=0.3),
85
+ # TRV viral-delivery profile — GC band centered on TRV genome composition (~47.5%).
86
+ # MFE weighted at 0.30 (Peccoud et al. 2024, PMC11718241).
87
+ "viral_delivery": ScoringConfig(
88
+ w_cai=0.35, w_gc=0.35, w_mfe=0.30,
89
+ gc_opt=47.5, gc_min=37.5, gc_max=57.5,
90
+ use_mfe=True,
79
91
  ),
92
+ "ml_enhanced": ScoringConfig(w_cai=0.35, w_gc=0.25, w_mfe=0.15, w_syncodonlm=0.25),
80
93
  }
81
94
 
82
95
 
@@ -144,6 +157,36 @@ def normalize_mfe(mfe: float, seq_length: int) -> float:
144
157
  return 1.0 + (clamped / 0.5)
145
158
 
146
159
 
160
+ def gc_band_score(
161
+ gc: float,
162
+ gc_min: float,
163
+ gc_max: float,
164
+ decay_width: float = GC_DECAY_WIDTH,
165
+ ) -> float:
166
+ """Score GC content against an acceptable band.
167
+
168
+ Returns 1.0 inside [gc_min, gc_max]; linearly decays to 0.0 after
169
+ decay_width percentage points outside the band.
170
+
171
+ Args:
172
+ gc: GC content percentage (0-100).
173
+ gc_min: Lower boundary of acceptable band.
174
+ gc_max: Upper boundary of acceptable band.
175
+ decay_width: Percentage points outside band before score reaches 0.0.
176
+
177
+ Examples:
178
+ gc_min=55, gc_max=65, decay_width=20:
179
+ gc=60 → 1.00 (inside band)
180
+ gc=70 → 0.75 (5 pts above gc_max)
181
+ gc=80 → 0.25 (15 pts above gc_max)
182
+ gc=85 → 0.00 (20 pts above gc_max)
183
+ """
184
+ if gc_min <= gc <= gc_max:
185
+ return 1.0
186
+ distance = (gc_min - gc) if gc < gc_min else (gc - gc_max)
187
+ return max(0.0, 1.0 - distance / decay_width)
188
+
189
+
147
190
  def calculate_dinucleotide_score(sequence: str) -> float:
148
191
  """Calculate a dinucleotide avoidance score (0-1, higher = fewer CpG/TpA).
149
192
 
@@ -179,19 +222,23 @@ def calculate_composite_score(
179
222
  profile: str | None = None,
180
223
  **kwargs: Any,
181
224
  ) -> float:
182
- """
183
- Calculate multidimensional composite score.
225
+ """Calculate multidimensional composite score.
184
226
 
185
- S = w1*CAI + w2*(1 - |GC - GC_opt|/50) + w3*MFE_norm
227
+ S = w1*CAI + w2*gc_band_score + w3*MFE_norm
186
228
  + w4*dinuc_score + w5*SynCodonLM_score
187
229
 
230
+ GC scoring uses a band function: sequences inside [gc_min, gc_max] receive
231
+ full score (1.0); outside the band the score decays linearly to 0.0 over
232
+ gc_decay_width percentage points. For gc_target profile, the band is
233
+ centred on the caller-supplied target_gc (±5 pp).
234
+
188
235
  Args:
189
236
  cai: Codon Adaptation Index (0-1).
190
237
  gc: GC content percentage (0-100).
191
238
  sequence: DNA sequence for optional MFE, dinucleotide, and SynCodonLM calculation.
192
239
  config: Explicit ScoringConfig. Overrides profile.
193
240
  profile: Profile name for preset config lookup.
194
- **kwargs: Additional parameters (e.g., target_gc for gc_target profile).
241
+ **kwargs: target_gc (float): centre of a ±5 pp GC band that replaces the config band when supplied.
195
242
 
196
243
  Returns:
197
244
  Composite score (0-1).
@@ -203,14 +250,17 @@ def calculate_composite_score(
203
250
  if config is None:
204
251
  config = PROFILE_SCORING_CONFIGS["balanced"]
205
252
 
206
- # Allow target_gc override for gc_target profile
207
- gc_opt = float(kwargs.get("target_gc", config.gc_opt))
208
-
209
253
  # Component 1: CAI (already 0-1)
210
254
  cai_score = max(0.0, min(1.0, cai))
211
255
 
212
- # Component 2: GC proximity to optimum
213
- gc_score = max(0.0, 1.0 - abs(gc - gc_opt) / 50.0)
256
+ # Component 2: GC band scoring
257
+ # gc_target profile: caller supplies target_gc; use a ±5 pp band around it.
258
+ # All other profiles: use the band defined in ScoringConfig (gc_min/gc_max).
259
+ if "target_gc" in kwargs:
260
+ tgt = float(kwargs["target_gc"])
261
+ gc_score = gc_band_score(gc, tgt - 5.0, tgt + 5.0, config.gc_decay_width)
262
+ else:
263
+ gc_score = gc_band_score(gc, config.gc_min, config.gc_max, config.gc_decay_width)
214
264
 
215
265
  # Component 3: MFE (optional)
216
266
  mfe_score = 0.5 # neutral default
@@ -2,7 +2,7 @@
2
2
 
3
3
  from typing import Literal, Tuple
4
4
 
5
- from factorforge.analysis.metrics import detect_invalid_codons, translate_dna
5
+ from factorforge.analysis.metrics import amino_acid_identity, detect_invalid_codons, translate_dna
6
6
 
7
7
  from .exceptions import SequenceValidationError
8
8
 
@@ -186,4 +186,5 @@ def validate_cds_output(input_protein: str, dna_sequence: str) -> dict[str, obje
186
186
  return {
187
187
  "passed": not errors,
188
188
  "errors": errors,
189
+ "aa_identity": amino_acid_identity(expected, seq),
189
190
  }
@@ -5,6 +5,7 @@ from __future__ import annotations
5
5
  from typing import Any
6
6
 
7
7
  from factorforge.analysis.metrics import (
8
+ HOMOPOLYMER_EXPRESSION_WARN_NT,
8
9
  amino_acid_identity,
9
10
  calculate_first_region_gc,
10
11
  calculate_gc,
@@ -24,7 +25,8 @@ DEFAULT_CONFIG: dict[str, Any] = {
24
25
  "gc_window_step": 30,
25
26
  "forbidden_motifs": [],
26
27
  "fail_forbidden_motifs": False,
27
- "homopolymer_max_run": 6,
28
+ # Expression-stability threshold (see HOMOPOLYMER_EXPRESSION_WARN_NT in metrics).
29
+ "homopolymer_max_run": HOMOPOLYMER_EXPRESSION_WARN_NT,
28
30
  }
29
31
 
30
32
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: factorforge-cds
3
- Version: 3.1.7
3
+ Version: 3.1.8
4
4
  Summary: FactorForge - open-source constraint-based CDS design engine by Eijex.
5
5
  Author-email: Eijex <eijex.lab@gmail.com>
6
6
  License-Expression: AGPL-3.0-only
@@ -107,7 +107,7 @@ FactorForge predictions are **in-silico only** and have not been experimentally
107
107
  ## Citing
108
108
 
109
109
  ```
110
- FactorForge v3.1.7 (2026). Open-source constraint-based CDS design engine.
110
+ FactorForge v3.1.8 (2026). Open-source constraint-based CDS design engine.
111
111
  Eijex. https://github.com/eijex/factorforge-cds
112
112
  ```
113
113
 
@@ -66,7 +66,7 @@ def test_validate_protein_sequence():
66
66
  def test_validate_cds_output_passes_normal_cds():
67
67
  result = validate_cds_output("MAF", "ATGGCTTTC")
68
68
 
69
- assert result == {"passed": True, "errors": []}
69
+ assert result == {"passed": True, "errors": [], "aa_identity": 1.0}
70
70
 
71
71
 
72
72
  def test_validate_cds_output_fails_internal_stop():
@@ -80,6 +80,7 @@ def test_validate_cds_output_fails_aa_mismatch():
80
80
  result = validate_cds_output("MAF", "ATGGCTTAC")
81
81
 
82
82
  assert result["passed"] is False
83
+ assert result["aa_identity"] == pytest.approx(2 / 3)
83
84
  assert any(error.startswith("aa_mismatch") for error in result["errors"])
84
85
 
85
86
 
File without changes