factorforge-cds 3.1.7__tar.gz → 3.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {factorforge_cds-3.1.7/src/factorforge_cds.egg-info → factorforge_cds-3.1.9}/PKG-INFO +2 -2
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/README.md +1 -1
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/pyproject.toml +2 -1
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/__init__.py +1 -1
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/analysis/feasibility.py +8 -4
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/analysis/metrics.py +61 -8
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/__init__.py +1 -1
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/__init__.py +1 -1
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/exporter.py +25 -2
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/optimizer.py +6 -2
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/pipeline.py +38 -2
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/reverse_translator.py +36 -12
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/rule_engine.py +35 -24
- factorforge_cds-3.1.9/src/factorforge/engines/profile/scoring.py +351 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/sequence_validator.py +2 -1
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/validation.py +3 -1
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9/src/factorforge_cds.egg-info}/PKG-INFO +2 -2
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_sequence_validator.py +2 -1
- factorforge_cds-3.1.7/src/factorforge/engines/profile/scoring.py +0 -260
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/LICENSE +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/setup.cfg +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/__main__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/analysis/__init__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/cli/__init__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/cli/legacy_cli.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/cli/main.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/__init__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/exporter.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/optimizer.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/core/interfaces/validator.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/nbenthamiana_codons.json +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/nbenthamiana_golden_set.json +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/ntabacum_codons.json +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/templates/high_expression.json +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/templates/standard_expression.json +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/data/wolffia_globosa_codons.json +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/database.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/codon_table_builder.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/construct_builder.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/__init__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/domesticator.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/scoring_ml.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/utils.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/validator.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/registry.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/schemas/__init__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/schemas/design_package.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/schemas/design_package.schema.json +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/__init__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/construct_id.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/exceptions.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/utils/restriction_sites.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/validation/__init__.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/validation/cli.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/validation/package_generator.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/SOURCES.txt +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/dependency_links.txt +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/entry_points.txt +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/requires.txt +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge_cds.egg-info/top_level.txt +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_database.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_legacy_cli.py +0 -0
- {factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/tests/test_restriction_sites.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: factorforge-cds
|
|
3
|
-
Version: 3.1.7
|
|
3
|
+
Version: 3.1.9
|
|
4
4
|
Summary: FactorForge - open-source constraint-based CDS design engine by Eijex.
|
|
5
5
|
Author-email: Eijex <eijex.lab@gmail.com>
|
|
6
6
|
License-Expression: AGPL-3.0-only
|
|
@@ -107,7 +107,7 @@ FactorForge predictions are **in-silico only** and have not been experimentally
|
|
|
107
107
|
## Citing
|
|
108
108
|
|
|
109
109
|
```
|
|
110
|
-
FactorForge v3.1.7 (2026). Open-source constraint-based CDS design engine.
|
|
110
|
+
FactorForge v3.1.9 (2026). Open-source constraint-based CDS design engine.
|
|
111
111
|
Eijex. https://github.com/eijex/factorforge-cds
|
|
112
112
|
```
|
|
113
113
|
|
|
@@ -76,7 +76,7 @@ FactorForge predictions are **in-silico only** and have not been experimentally
|
|
|
76
76
|
## Citing
|
|
77
77
|
|
|
78
78
|
```
|
|
79
|
-
FactorForge v3.1.7 (2026). Open-source constraint-based CDS design engine.
|
|
79
|
+
FactorForge v3.1.9 (2026). Open-source constraint-based CDS design engine.
|
|
80
80
|
Eijex. https://github.com/eijex/factorforge-cds
|
|
81
81
|
```
|
|
82
82
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "factorforge-cds"
|
|
7
|
-
version = "3.1.7"
|
|
7
|
+
version = "3.1.9"
|
|
8
8
|
description = "FactorForge - open-source constraint-based CDS design engine by Eijex."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "AGPL-3.0-only"
|
|
@@ -56,6 +56,7 @@ where = ["src"]
|
|
|
56
56
|
[tool.pytest.ini_options]
|
|
57
57
|
testpaths = ["tests"]
|
|
58
58
|
norecursedirs = ["archive"]
|
|
59
|
+
pythonpath = ["src"]
|
|
59
60
|
|
|
60
61
|
[tool.ruff]
|
|
61
62
|
line-length = 100
|
|
@@ -88,9 +88,9 @@ def _reconstruct_sequence(
|
|
|
88
88
|
def analyze_feasibility(
|
|
89
89
|
protein_sequence: str,
|
|
90
90
|
codon_weights: dict[str, float],
|
|
91
|
-
target_cai: float = 0.
|
|
92
|
-
target_gc_low: float =
|
|
93
|
-
target_gc_high: float =
|
|
91
|
+
target_cai: float = 0.82,
|
|
92
|
+
target_gc_low: float = 55.0,
|
|
93
|
+
target_gc_high: float = 65.0,
|
|
94
94
|
gc_ranges: list[tuple[float, float]] | None = None,
|
|
95
95
|
) -> dict[str, Any]:
|
|
96
96
|
"""Compute exact CAI/GC feasibility over synonymous codon choices.
|
|
@@ -98,12 +98,16 @@ def analyze_feasibility(
|
|
|
98
98
|
The dynamic program keeps the best log-CAI sequence for each reachable
|
|
99
99
|
global GC count. This is exact for global GC and CAI under the supplied
|
|
100
100
|
codon weights.
|
|
101
|
+
|
|
102
|
+
Defaults calibrated to nbenthamiana profile engine output distribution
|
|
103
|
+
(analysis 004, n=49): avg CAI=0.76, avg GC=60.1% (range 55-71%).
|
|
104
|
+
target_cai=0.82 aligns with industry practice (>0.8) and is achievable.
|
|
101
105
|
"""
|
|
102
106
|
protein = "".join(protein_sequence.upper().split()).rstrip("*")
|
|
103
107
|
if not protein:
|
|
104
108
|
raise ValueError("protein_sequence must not be empty")
|
|
105
109
|
|
|
106
|
-
ranges = gc_ranges or [(
|
|
110
|
+
ranges = gc_ranges or [(55.0, 65.0), (50.0, 65.0), (40.0, 65.0)]
|
|
107
111
|
normalized_ranges = [
|
|
108
112
|
(_normalize_gc_bound(low), _normalize_gc_bound(high)) for low, high in ranges
|
|
109
113
|
]
|
|
@@ -11,6 +11,12 @@ from typing import Any
|
|
|
11
11
|
|
|
12
12
|
from factorforge.engines.profile.utils import get_data_path
|
|
13
13
|
|
|
14
|
+
# Homopolymer thresholds — two distinct concerns, intentionally different values.
|
|
15
|
+
# Expression stability: AT-rich runs ≥6 nt can resemble instability elements (ARE).
|
|
16
|
+
# Synthesis/manufacturing: runs ≥8 nt are flagged by gene synthesis vendors as difficult.
|
|
17
|
+
HOMOPOLYMER_EXPRESSION_WARN_NT = 6
|
|
18
|
+
HOMOPOLYMER_SYNTHESIS_WARN_NT = 8
|
|
19
|
+
|
|
14
20
|
|
|
15
21
|
STANDARD_GENETIC_CODE: dict[str, str] = {
|
|
16
22
|
"TTT": "F",
|
|
@@ -271,6 +277,32 @@ def calculate_cai(sequence: str, codon_weights: dict[str, float]) -> float:
|
|
|
271
277
|
return math.exp(log_sum / count) if count else 0.0
|
|
272
278
|
|
|
273
279
|
|
|
280
|
+
def calculate_dinucleotide_score(
|
|
281
|
+
sequence: str,
|
|
282
|
+
cpg_weight: float = 0.0,
|
|
283
|
+
tpa_weight: float = 1.0,
|
|
284
|
+
) -> float:
|
|
285
|
+
"""Score dinucleotide avoidance.
|
|
286
|
+
|
|
287
|
+
Plant default: CpG inactive (cpg_weight=0.0), only TpA is penalized.
|
|
288
|
+
Mammalian opt-in: set cpg_weight=1.0 and tpa_weight=1.0 to penalize both.
|
|
289
|
+
"""
|
|
290
|
+
from factorforge.engines.profile.utils import calculate_dinucleotide_ratio
|
|
291
|
+
|
|
292
|
+
if len(sequence) < 6:
|
|
293
|
+
return 1.0
|
|
294
|
+
|
|
295
|
+
total_weight = cpg_weight + tpa_weight
|
|
296
|
+
if total_weight == 0:
|
|
297
|
+
return 1.0
|
|
298
|
+
|
|
299
|
+
cpg_ratio = calculate_dinucleotide_ratio(sequence, "CG")
|
|
300
|
+
tpa_ratio = calculate_dinucleotide_ratio(sequence, "TA")
|
|
301
|
+
cpg_score = max(0.0, 1.0 - cpg_ratio / 2.0)
|
|
302
|
+
tpa_score = max(0.0, 1.0 - tpa_ratio / 2.0)
|
|
303
|
+
return (cpg_weight * cpg_score + tpa_weight * tpa_score) / total_weight
|
|
304
|
+
|
|
305
|
+
|
|
274
306
|
def codon_usage_profile(sequence: str) -> dict[str, dict[str, float | int | str]]:
|
|
275
307
|
"""Return codon counts and frequencies for a DNA sequence."""
|
|
276
308
|
codons = _codons(sequence)
|
|
@@ -286,8 +318,19 @@ def codon_usage_profile(sequence: str) -> dict[str, dict[str, float | int | str]
|
|
|
286
318
|
return profile
|
|
287
319
|
|
|
288
320
|
|
|
289
|
-
def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]:
|
|
290
|
-
|
|
321
|
+
def detect_homopolymers(
|
|
322
|
+
sequence: str,
|
|
323
|
+
max_run: int = HOMOPOLYMER_EXPRESSION_WARN_NT,
|
|
324
|
+
) -> list[dict[str, Any]]:
|
|
325
|
+
"""Detect homopolymer runs for expression stability evaluation.
|
|
326
|
+
|
|
327
|
+
Uses HOMOPOLYMER_EXPRESSION_WARN_NT (default 6 nt) — AT-rich runs of this
|
|
328
|
+
length can resemble AU-rich instability elements (ARE) and affect mRNA
|
|
329
|
+
stability in plant expression systems.
|
|
330
|
+
|
|
331
|
+
For synthesis/manufacturing risk, see RuleEngine.scan_homopolymers()
|
|
332
|
+
which uses HOMOPOLYMER_SYNTHESIS_WARN_NT (8 nt).
|
|
333
|
+
"""
|
|
291
334
|
if max_run <= 1:
|
|
292
335
|
raise ValueError("max_run must be > 1")
|
|
293
336
|
|
|
@@ -303,17 +346,27 @@ def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]
|
|
|
303
346
|
continue
|
|
304
347
|
run_length = index - run_start
|
|
305
348
|
if run_length >= max_run:
|
|
306
|
-
findings.append(
|
|
307
|
-
|
|
308
|
-
|
|
349
|
+
findings.append({
|
|
350
|
+
"start": run_start,
|
|
351
|
+
"end": index,
|
|
352
|
+
"base": run_base,
|
|
353
|
+
"length": run_length,
|
|
354
|
+
"context": "expression_stability",
|
|
355
|
+
"threshold_nt": max_run,
|
|
356
|
+
})
|
|
309
357
|
run_base = base
|
|
310
358
|
run_start = index
|
|
311
359
|
|
|
312
360
|
run_length = len(seq) - run_start
|
|
313
361
|
if run_length >= max_run:
|
|
314
|
-
findings.append(
|
|
315
|
-
|
|
316
|
-
|
|
362
|
+
findings.append({
|
|
363
|
+
"start": run_start,
|
|
364
|
+
"end": len(seq),
|
|
365
|
+
"base": run_base,
|
|
366
|
+
"length": run_length,
|
|
367
|
+
"context": "expression_stability",
|
|
368
|
+
"threshold_nt": max_run,
|
|
369
|
+
})
|
|
317
370
|
return findings
|
|
318
371
|
|
|
319
372
|
|
|
@@ -6,8 +6,10 @@ GenBank and FASTA export module (P0-5)
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
8
|
import hashlib
|
|
9
|
+
import json
|
|
9
10
|
from datetime import datetime
|
|
10
11
|
from io import StringIO
|
|
12
|
+
from pathlib import Path
|
|
11
13
|
from typing import Any
|
|
12
14
|
|
|
13
15
|
|
|
@@ -25,6 +27,26 @@ class SequenceExporter:
|
|
|
25
27
|
"""Initialize"""
|
|
26
28
|
pass
|
|
27
29
|
|
|
30
|
+
def host_species(self, metadata: dict[str, Any]) -> str:
|
|
31
|
+
"""Resolve host species from feature_registry.json when possible."""
|
|
32
|
+
if metadata.get("organism"):
|
|
33
|
+
return str(metadata["organism"])
|
|
34
|
+
|
|
35
|
+
host = str(
|
|
36
|
+
metadata.get("host_profile") or metadata.get("host") or "nbenthamiana"
|
|
37
|
+
).strip().lower()
|
|
38
|
+
host_aliases = {"ntabacum": "by2"}
|
|
39
|
+
registry_key = host_aliases.get(host, host)
|
|
40
|
+
registry_path = Path(__file__).resolve().parents[4] / "scripts" / "feature_registry.json"
|
|
41
|
+
|
|
42
|
+
try:
|
|
43
|
+
registry = json.loads(registry_path.read_text(encoding="utf-8"))
|
|
44
|
+
except (OSError, json.JSONDecodeError):
|
|
45
|
+
registry = {}
|
|
46
|
+
|
|
47
|
+
species = registry.get("hosts", {}).get(registry_key, {}).get("species")
|
|
48
|
+
return str(species or "Nicotiana benthamiana")
|
|
49
|
+
|
|
28
50
|
def generate_run_id(self, sequence: str, params: dict[str, Any]) -> str:
|
|
29
51
|
"""
|
|
30
52
|
Generate a reproducible run_id
|
|
@@ -69,6 +91,7 @@ class SequenceExporter:
|
|
|
69
91
|
"run_id": "abc12345",
|
|
70
92
|
"timestamp": "2026-01-22T12:00:00",
|
|
71
93
|
"organism": "Nicotiana benthamiana",
|
|
94
|
+
"host_profile": "nbenthamiana",
|
|
72
95
|
"gene_name": "GFP",
|
|
73
96
|
"violations_fixed": [...],
|
|
74
97
|
"warnings": [...]
|
|
@@ -99,7 +122,7 @@ class SequenceExporter:
|
|
|
99
122
|
run_id = metadata.get("run_id", self.generate_run_id(sequence, metadata))
|
|
100
123
|
timestamp = metadata.get("timestamp", datetime.now().strftime("%Y%m%d"))
|
|
101
124
|
gene_name = metadata.get("gene_name", "optimized_gene")
|
|
102
|
-
organism =
|
|
125
|
+
organism = self.host_species(metadata)
|
|
103
126
|
|
|
104
127
|
# Build locus ID
|
|
105
128
|
locus_id = f"PFORM_{run_id}_{timestamp}"
|
|
@@ -355,7 +378,7 @@ class SequenceExporter:
|
|
|
355
378
|
"--- Optimization Settings ---",
|
|
356
379
|
f"Profile: {metadata.get('profile', 'N/A')}",
|
|
357
380
|
f"Assembly Standard: {metadata.get('assembly_standard', 'None')}",
|
|
358
|
-
f"Organism: {
|
|
381
|
+
f"Organism: {self.host_species(metadata)}",
|
|
359
382
|
"",
|
|
360
383
|
]
|
|
361
384
|
|
{factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/optimizer.py
RENAMED
|
@@ -9,7 +9,7 @@ from factorforge.core.interfaces import OptimizationResult, OptimizerEngine
|
|
|
9
9
|
from .exporter import SequenceExporter
|
|
10
10
|
from .rules.reverse_translator import OptimizationProfile, ReverseTranslator
|
|
11
11
|
from .rules.rule_engine import RuleEngine
|
|
12
|
-
from .scoring import calculate_composite_score
|
|
12
|
+
from .scoring import calculate_composite_score, compute_mfe_evidence
|
|
13
13
|
from .validator import InputValidator
|
|
14
14
|
|
|
15
15
|
|
|
@@ -17,7 +17,7 @@ class RuleBasedOptimizer(OptimizerEngine):
|
|
|
17
17
|
"""Profile-based rule optimization engine."""
|
|
18
18
|
|
|
19
19
|
name = "Profile-based"
|
|
20
|
-
version = "3.1.7"
|
|
20
|
+
version = "3.1.9"
|
|
21
21
|
|
|
22
22
|
def __init__(self) -> None:
|
|
23
23
|
self.validator = InputValidator()
|
|
@@ -117,6 +117,10 @@ class RuleBasedOptimizer(OptimizerEngine):
|
|
|
117
117
|
"score": candidates[0]["score"],
|
|
118
118
|
"violations": sum(len(v) for v in scan_results.values()),
|
|
119
119
|
}
|
|
120
|
+
# MFE provenance: expose whether MFE was actually computed so downstream
|
|
121
|
+
# artifacts (API response, Design Package) never report an uncomputed
|
|
122
|
+
# MFE as a misleading 0.0 (016 audit). Score value is unchanged.
|
|
123
|
+
metrics.update(compute_mfe_evidence(optimized_dna, profile=profile_value))
|
|
120
124
|
|
|
121
125
|
return OptimizationResult(
|
|
122
126
|
sequence=optimized_dna,
|
|
@@ -18,9 +18,14 @@ from factorforge.engines.profile.rules.reverse_translator import (
|
|
|
18
18
|
ReverseTranslator,
|
|
19
19
|
)
|
|
20
20
|
from factorforge.engines.profile.rules.rule_engine import RuleEngine
|
|
21
|
-
from factorforge.engines.profile.scoring import calculate_composite_score
|
|
21
|
+
from factorforge.engines.profile.scoring import (
|
|
22
|
+
calculate_composite_score,
|
|
23
|
+
compute_mfe_evidence,
|
|
24
|
+
)
|
|
22
25
|
from factorforge.engines.profile.validator import InputValidator
|
|
26
|
+
from factorforge.analysis.metrics import translate_dna
|
|
23
27
|
from factorforge.utils.construct_id import generate_construct_id
|
|
28
|
+
from factorforge.utils.sequence_validator import validate_cds_output
|
|
24
29
|
|
|
25
30
|
logger = logging.getLogger(__name__)
|
|
26
31
|
|
|
@@ -48,7 +53,15 @@ class PipelineResult:
|
|
|
48
53
|
"optimization_profile": self.metadata.get("profile", ""),
|
|
49
54
|
"cai_score": round(metrics.get("cai", 0.0), 4),
|
|
50
55
|
"gc_content_pct": round(metrics.get("gc", 0.0), 2),
|
|
51
|
-
|
|
56
|
+
# MFE provenance (016 audit): None when not computed (e.g. ViennaRNA
|
|
57
|
+
# unavailable) — never report an uncomputed MFE as a misleading 0.0.
|
|
58
|
+
"mfe_kcal_mol": (
|
|
59
|
+
round(metrics["mfe_kcal_mol"], 2)
|
|
60
|
+
if metrics.get("mfe_kcal_mol") is not None
|
|
61
|
+
else None
|
|
62
|
+
),
|
|
63
|
+
"mfe_status": metrics.get("mfe_status", "not_computed"),
|
|
64
|
+
"mfe_used": metrics.get("mfe_used", False),
|
|
52
65
|
"polya_signal_count": len(scan.get("polya", [])),
|
|
53
66
|
"domestication_edits": len(dom.get("removed_sites", [])),
|
|
54
67
|
"sequence_length_aa": len(self.sequence) // 3,
|
|
@@ -175,6 +188,7 @@ class OptimizationPipeline:
|
|
|
175
188
|
|
|
176
189
|
if seq_type == "dna":
|
|
177
190
|
optimized_dna = processed
|
|
191
|
+
expected_protein = translate_dna(processed).rstrip("*")
|
|
178
192
|
cai = translator.calculate_cai(optimized_dna)
|
|
179
193
|
gc = translator.calculate_gc_content(optimized_dna)
|
|
180
194
|
score = calculate_composite_score(
|
|
@@ -182,6 +196,7 @@ class OptimizationPipeline:
|
|
|
182
196
|
)
|
|
183
197
|
candidate_metrics = {"cai": cai, "gc": gc, "score": score}
|
|
184
198
|
else:
|
|
199
|
+
expected_protein = processed.rstrip("*")
|
|
185
200
|
logger.debug(f"Generating candidates with profile: {opt_profile.value}")
|
|
186
201
|
candidates = translator.generate_candidates(processed, profile=opt_profile, n=1)
|
|
187
202
|
if not candidates:
|
|
@@ -251,7 +266,20 @@ class OptimizationPipeline:
|
|
|
251
266
|
|
|
252
267
|
assembly_standard = kwargs.get("assembly_standard", "golden_gate")
|
|
253
268
|
domestication = domesticator.domesticate(optimized_dna, standard=assembly_standard)
|
|
269
|
+
if not domestication.get("success", False):
|
|
270
|
+
unfixable = domestication.get("unfixable", [])
|
|
271
|
+
error = domestication.get("error")
|
|
272
|
+
detail = error or f"unfixable restriction sites: {unfixable}"
|
|
273
|
+
raise ValueError(f"Domestication failed for {assembly_standard}: {detail}")
|
|
274
|
+
|
|
254
275
|
domesticated_sequence = domestication.get("domesticated_seq", optimized_dna)
|
|
276
|
+
final_validation = validate_cds_output(expected_protein, domesticated_sequence)
|
|
277
|
+
if not final_validation["passed"]:
|
|
278
|
+
raise ValueError(
|
|
279
|
+
"Final CDS validation failed: "
|
|
280
|
+
f"{final_validation['errors']} "
|
|
281
|
+
f"(aa_identity={final_validation['aa_identity']:.4f})"
|
|
282
|
+
)
|
|
255
283
|
|
|
256
284
|
template_name = construct_template or self.construct_template
|
|
257
285
|
if template_name:
|
|
@@ -269,6 +297,13 @@ class OptimizationPipeline:
|
|
|
269
297
|
construct_record = None
|
|
270
298
|
final_sequence = domesticated_sequence
|
|
271
299
|
|
|
300
|
+
# MFE provenance for the final output sequence (016 audit): record
|
|
301
|
+
# whether MFE was computed so export_features / Design Package never
|
|
302
|
+
# report an uncomputed MFE as 0.0.
|
|
303
|
+
candidate_metrics.update(
|
|
304
|
+
compute_mfe_evidence(domesticated_sequence, profile=effective_profile)
|
|
305
|
+
)
|
|
306
|
+
|
|
272
307
|
metadata: dict[str, Any] = {
|
|
273
308
|
"construct_id": generate_construct_id(),
|
|
274
309
|
"profile": effective_profile,
|
|
@@ -278,6 +313,7 @@ class OptimizationPipeline:
|
|
|
278
313
|
"validation": val_result,
|
|
279
314
|
"scan_results": scan_results,
|
|
280
315
|
"domestication": domestication,
|
|
316
|
+
"final_validation": final_validation,
|
|
281
317
|
"metrics": candidate_metrics,
|
|
282
318
|
"scan_mode": scan_mode,
|
|
283
319
|
}
|
|
@@ -15,7 +15,7 @@ from enum import Enum
|
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from typing import Any, cast
|
|
17
17
|
|
|
18
|
-
from factorforge.engines.profile.scoring import calculate_composite_score
|
|
18
|
+
from factorforge.engines.profile.scoring import GC_OPT_MID, calculate_composite_score
|
|
19
19
|
from factorforge.engines.profile.utils import (
|
|
20
20
|
build_aa_to_codons_map,
|
|
21
21
|
calculate_gc,
|
|
@@ -440,14 +440,22 @@ class ReverseTranslator:
|
|
|
440
440
|
return "".join(dna_seq)
|
|
441
441
|
|
|
442
442
|
def _gc_target_translate(self, protein_seq: str, **kwargs: Any) -> str:
|
|
443
|
-
"""
|
|
444
|
-
|
|
443
|
+
"""GC-Target profile: drive global GC toward a configurable target.
|
|
444
|
+
|
|
445
|
+
Targets the caller-supplied ``target_gc`` if provided, otherwise the
|
|
446
|
+
host-profile GC midpoint (GC_OPT_MID = 60% for N. benthamiana). To target
|
|
447
|
+
a lower GC (e.g. for specific vector requirements), pass target_gc explicitly.
|
|
445
448
|
|
|
446
449
|
- GC constraint first
|
|
447
450
|
- CAI may be sacrificed
|
|
448
451
|
- Balance local window GC (50 bp)
|
|
452
|
+
|
|
453
|
+
TODO: GC_OPT_MID is currently a single N. benthamiana-calibrated constant.
|
|
454
|
+
When per-host GC profiles are added, source the default from the active host.
|
|
449
455
|
"""
|
|
450
|
-
target_gc = kwargs.get("target_gc"
|
|
456
|
+
target_gc = kwargs.get("target_gc")
|
|
457
|
+
if target_gc is None:
|
|
458
|
+
target_gc = GC_OPT_MID
|
|
451
459
|
|
|
452
460
|
dna_seq: list[str] = []
|
|
453
461
|
|
|
@@ -477,11 +485,22 @@ class ReverseTranslator:
|
|
|
477
485
|
return "".join(dna_seq)
|
|
478
486
|
|
|
479
487
|
def _assembly_friendly_translate(self, protein_seq: str, **kwargs: Any) -> str:
|
|
480
|
-
"""
|
|
481
|
-
|
|
488
|
+
"""Translate for Golden Gate / MoClo assembly compatibility.
|
|
489
|
+
|
|
490
|
+
Strategy:
|
|
491
|
+
- Starts from balanced codon selection (preferred_ratio=0.6)
|
|
492
|
+
- Retries up to max_attempts times until no BsaI/BpiI Type IIS
|
|
493
|
+
restriction sites remain in the CDS (forward + reverse complement)
|
|
494
|
+
- CAI trade-offs are accepted to achieve site-free sequences
|
|
482
495
|
|
|
483
|
-
|
|
484
|
-
-
|
|
496
|
+
Current scope:
|
|
497
|
+
- Supported: BsaI/BpiI site avoidance via stochastic retry
|
|
498
|
+
- Not yet implemented: local GC window uniformity scoring,
|
|
499
|
+
repeat-pattern penalties, synthesis vendor constraint profiles
|
|
500
|
+
|
|
501
|
+
Args:
|
|
502
|
+
protein_seq: Amino acid sequence.
|
|
503
|
+
max_attempts: Retry limit for site removal (default: 10).
|
|
485
504
|
"""
|
|
486
505
|
max_attempts = kwargs.get("max_attempts", 10)
|
|
487
506
|
if max_attempts < 1:
|
|
@@ -529,12 +548,17 @@ class ReverseTranslator:
|
|
|
529
548
|
return self._apply_nterminal_ramp(dna_seq, protein_seq, ramp_codons=ramp_codons)
|
|
530
549
|
|
|
531
550
|
def _apply_nterminal_ramp(self, dna_seq: str, protein_seq: str, ramp_codons: int = 50) -> str:
|
|
532
|
-
"""
|
|
533
|
-
Apply N-terminal codon ramp for co-translational folding.
|
|
551
|
+
"""Apply N-terminal codon ramp for co-translational folding.
|
|
534
552
|
|
|
535
553
|
Replaces the first `ramp_codons` codons with lower-frequency synonymous
|
|
536
|
-
codons (bottom
|
|
537
|
-
Single-codon amino acids (Met, Trp) are left unchanged.
|
|
554
|
+
codons (bottom 25% by frequency; cutoff = 3*len//4) to slow the ribosome
|
|
555
|
+
at the N-terminus. Single-codon amino acids (Met, Trp) are left unchanged.
|
|
556
|
+
|
|
557
|
+
TODO: ramp profile is currently not in VALID_PROFILES (not publicly accessible).
|
|
558
|
+
Before re-enabling, revisit ramp_codons=50:
|
|
559
|
+
- Literature suggests 10–30 codons (Tuller et al. 2010, Chu et al. 2014).
|
|
560
|
+
- For short proteins, 50 codons can cover the entire CDS.
|
|
561
|
+
- Consider: ramp_len = min(30, max(10, int(protein_length * 0.15)))
|
|
538
562
|
|
|
539
563
|
Args:
|
|
540
564
|
dna_seq: Full-length DNA sequence.
|
{factorforge_cds-3.1.7 → factorforge_cds-3.1.9}/src/factorforge/engines/profile/rules/rule_engine.py
RENAMED
|
@@ -11,6 +11,7 @@ import math
|
|
|
11
11
|
import re
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
from factorforge.analysis.metrics import HOMOPOLYMER_SYNTHESIS_WARN_NT
|
|
14
15
|
from factorforge.engines.profile.rules.reverse_translator import ReverseTranslator
|
|
15
16
|
from factorforge.engines.profile.utils import (
|
|
16
17
|
build_aa_to_codons_map,
|
|
@@ -246,24 +247,27 @@ class RuleEngine:
|
|
|
246
247
|
|
|
247
248
|
return violations
|
|
248
249
|
|
|
249
|
-
def scan_homopolymers(
|
|
250
|
-
|
|
251
|
-
|
|
250
|
+
def scan_homopolymers(
|
|
251
|
+
self, seq: str, min_length: int = HOMOPOLYMER_SYNTHESIS_WARN_NT
|
|
252
|
+
) -> list[dict[str, Any]]:
|
|
253
|
+
"""Detect homopolymer runs for synthesis/manufacturing risk evaluation.
|
|
252
254
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
255
|
+
Uses HOMOPOLYMER_SYNTHESIS_WARN_NT (default 8 nt) — the threshold at
|
|
256
|
+
which gene synthesis vendors flag homopolymers as difficult to synthesize
|
|
257
|
+
with high fidelity.
|
|
256
258
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
+
For expression stability risk (≥6 nt), see
|
|
260
|
+
factorforge.analysis.metrics.detect_homopolymers() which uses
|
|
261
|
+
HOMOPOLYMER_EXPRESSION_WARN_NT.
|
|
259
262
|
|
|
260
|
-
|
|
261
|
-
|
|
263
|
+
Args:
|
|
264
|
+
seq: DNA sequence
|
|
265
|
+
min_length: Minimum run length to flag (default: HOMOPOLYMER_SYNTHESIS_WARN_NT)
|
|
262
266
|
|
|
263
267
|
Examples:
|
|
264
268
|
>>> engine = RuleEngine()
|
|
265
269
|
>>> engine.scan_homopolymers("AAAAAAAA", min_length=8)
|
|
266
|
-
[{'type': 'homopolymer', ...}]
|
|
270
|
+
[{'type': 'homopolymer', 'context': 'synthesis', ...}]
|
|
267
271
|
"""
|
|
268
272
|
violations: list[dict[str, Any]] = []
|
|
269
273
|
|
|
@@ -280,16 +284,16 @@ class RuleEngine:
|
|
|
280
284
|
while idx + actual_length < len(seq) and seq[idx + actual_length] == base:
|
|
281
285
|
actual_length += 1
|
|
282
286
|
|
|
283
|
-
violations.append(
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
)
|
|
287
|
+
violations.append({
|
|
288
|
+
"type": "homopolymer",
|
|
289
|
+
"context": "synthesis",
|
|
290
|
+
"threshold_nt": min_length,
|
|
291
|
+
"base": base,
|
|
292
|
+
"position": idx,
|
|
293
|
+
"length": actual_length,
|
|
294
|
+
"sequence": base * actual_length,
|
|
295
|
+
"severity": "high" if actual_length >= 10 else "medium",
|
|
296
|
+
})
|
|
293
297
|
pos = idx + actual_length
|
|
294
298
|
|
|
295
299
|
return violations
|
|
@@ -350,13 +354,20 @@ class RuleEngine:
|
|
|
350
354
|
max_gc: float = 75,
|
|
351
355
|
) -> list[dict[str, Any]]:
|
|
352
356
|
"""
|
|
353
|
-
Detect extreme GC regions
|
|
357
|
+
Detect extreme GC regions in a sliding local window.
|
|
358
|
+
|
|
359
|
+
This is a LOCAL synthesis/extreme-window guard (default 25-75% over a
|
|
360
|
+
50 bp window), NOT the global GC target. Global GC is governed separately
|
|
361
|
+
by the scoring band (GC_OPT_MIN/MAX, ~55-65%) and the API/DP gc_min/gc_max
|
|
362
|
+
constraints. The wide 25-75% band intentionally flags only synthesis-hostile
|
|
363
|
+
local windows; narrowing it toward the global optimum would raise false
|
|
364
|
+
positives against the engine's own output distribution (analysis 004: 55-71%).
|
|
354
365
|
|
|
355
366
|
Args:
|
|
356
367
|
seq: DNA sequence
|
|
357
368
|
window: Window size (bp)
|
|
358
|
-
min_gc: Minimum GC% threshold
|
|
359
|
-
max_gc: Maximum GC% threshold
|
|
369
|
+
min_gc: Minimum local GC% threshold (synthesis guard, not global target)
|
|
370
|
+
max_gc: Maximum local GC% threshold (synthesis guard, not global target)
|
|
360
371
|
|
|
361
372
|
Returns:
|
|
362
373
|
List of violations
|