factorforge-cds 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factorforge/__init__.py +19 -0
- factorforge/__main__.py +8 -0
- factorforge/cli/__init__.py +5 -0
- factorforge/cli/legacy_cli.py +157 -0
- factorforge/cli/main.py +305 -0
- factorforge/core/interfaces/__init__.py +7 -0
- factorforge/core/interfaces/exporter.py +13 -0
- factorforge/core/interfaces/optimizer.py +85 -0
- factorforge/core/interfaces/validator.py +9 -0
- factorforge/database.py +150 -0
- factorforge/engines/__init__.py +60 -0
- factorforge/engines/ml/__init__.py +0 -0
- factorforge/engines/ml/plant_optimizer.py +325 -0
- factorforge/engines/registry.py +141 -0
- factorforge/engines/v1_archived/__init__.py +15 -0
- factorforge/engines/v2/__init__.py +13 -0
- factorforge/engines/v2/codon_table_builder.py +107 -0
- factorforge/engines/v2/construct_builder.py +403 -0
- factorforge/engines/v2/exporter.py +455 -0
- factorforge/engines/v2/optimizer.py +190 -0
- factorforge/engines/v2/pipeline.py +275 -0
- factorforge/engines/v2/rules/__init__.py +3 -0
- factorforge/engines/v2/rules/domesticator.py +403 -0
- factorforge/engines/v2/rules/reverse_translator.py +765 -0
- factorforge/engines/v2/rules/rule_engine.py +867 -0
- factorforge/engines/v2/scoring.py +232 -0
- factorforge/engines/v2/utils.py +231 -0
- factorforge/engines/v2/validator.py +383 -0
- factorforge/engines/v3/__init__.py +12 -0
- factorforge/engines/v3/explain.py +119 -0
- factorforge/engines/v3/inference/__init__.py +6 -0
- factorforge/engines/v3/inference/constrained_decoder.py +80 -0
- factorforge/engines/v3/inference/v2_adapter.py +72 -0
- factorforge/engines/v3/metrics.py +145 -0
- factorforge/engines/v3/modeling_bart_decoder.py +127 -0
- factorforge/engines/v3/pipeline.py +192 -0
- factorforge/engines/v3/synonym_mask.py +61 -0
- factorforge/engines/v3/tokenizer.py +192 -0
- factorforge/ml/__init__.py +33 -0
- factorforge/ml/feasibility.py +199 -0
- factorforge/ml/metrics.py +295 -0
- factorforge/utils/__init__.py +31 -0
- factorforge/utils/construct_id.py +8 -0
- factorforge/utils/exceptions.py +32 -0
- factorforge/utils/sequence_validator.py +189 -0
- factorforge/utils/validation.py +104 -0
- factorforge_cds-3.0.0.dist-info/METADATA +475 -0
- factorforge_cds-3.0.0.dist-info/RECORD +52 -0
- factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
- factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
- factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
- factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
factorforge/database.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Database models and CRUD operations for FactorForge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import uuid
|
|
7
|
+
from typing import Dict, Optional
|
|
8
|
+
|
|
9
|
+
from sqlalchemy import ARRAY, DECIMAL, TIMESTAMP, Column, ForeignKey, String, Text, create_engine, func
|
|
10
|
+
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
|
11
|
+
from sqlalchemy.orm import DeclarativeBase, sessionmaker
|
|
12
|
+
|
|
13
|
+
# Connection string for the operational database. The DATABASE_URL environment
# variable wins when it is set; the literal below is only the fallback.
# NOTE(review): the fallback embeds what looks like a development credential —
# confirm it is never valid outside a local setup.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://plantform:plantform_dev_2026@localhost:5432/factorforge_operational",
)

# The engine and the session factory are created once, when this module is
# imported. pool_pre_ping=True asks the pool to test a connection before
# handing it out.
engine = create_engine(DATABASE_URL, pool_pre_ping=True)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Base(DeclarativeBase):
    """Declarative base class shared by the ORM models in this module."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Batch(Base):
    """ORM mapping for the ``factorforge.batches`` table.

    A row carries a unique study number together with the organism and the
    target protein it refers to, plus status and audit columns.
    """

    __tablename__ = "batches"
    __table_args__ = {"schema": "factorforge"}

    # Primary key; uuid4 is generated on the Python side when none is given.
    batch_id = Column(UUID(as_uuid=True), default=uuid.uuid4, primary_key=True)
    # Required and unique across the table.
    study_number = Column(String(50), nullable=False, unique=True)
    organism = Column(String(100), nullable=False)
    target_protein = Column(String(255), nullable=False)
    # Both timestamps default to the database's now().
    created_at = Column(TIMESTAMP, server_default=func.now())
    # onupdate re-applies now() when the row is updated.
    updated_at = Column(TIMESTAMP, onupdate=func.now(), server_default=func.now())
    # Python-side default used when no status is supplied.
    status = Column(String(20), default="pending")
    created_by = Column(String(100))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Sequence(Base):
    """ORM mapping for the ``factorforge.sequences`` table.

    Every sequence belongs to exactly one batch; the foreign key is declared
    with ``ondelete="CASCADE"``.
    """

    __tablename__ = "sequences"
    __table_args__ = {"schema": "factorforge"}

    # Primary key; uuid4 is generated on the Python side when none is given.
    sequence_id = Column(UUID(as_uuid=True), default=uuid.uuid4, primary_key=True)
    # Owning batch (required).
    batch_id = Column(
        UUID(as_uuid=True),
        ForeignKey("factorforge.batches.batch_id", ondelete="CASCADE"),
        nullable=False,
    )
    # Free-form tag; save_optimization writes "input" and "optimized".
    sequence_type = Column(String(20), nullable=False)
    sequence_data = Column(Text, nullable=False)
    # NOTE(review): DECIMAL(5, 4) only holds values below 10, so gc_content and
    # cai are presumably stored as 0-1 fractions — confirm against the code
    # that produces the metrics.
    gc_content = Column(DECIMAL(5, 4))
    cai = Column(DECIMAL(5, 4))
    tm = Column(DECIMAL(5, 2))
    created_at = Column(TIMESTAMP, server_default=func.now())
    # The attribute is named metadata_ while the database column is "metadata".
    metadata_ = Column("metadata", JSONB)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class OptimizationResult(Base):
    """ORM mapping for the ``factorforge.optimization_results`` table.

    Links a batch and one of its sequences to the algorithm version that
    produced it, together with run-time details of that optimization.
    """

    __tablename__ = "optimization_results"
    __table_args__ = {"schema": "factorforge"}

    # Primary key; uuid4 is generated on the Python side when none is given.
    result_id = Column(UUID(as_uuid=True), default=uuid.uuid4, primary_key=True)
    # NOTE(review): unlike Sequence.batch_id, these two foreign keys declare no
    # ondelete rule and are nullable — confirm that is intended.
    batch_id = Column(
        UUID(as_uuid=True),
        ForeignKey("factorforge.batches.batch_id"),
    )
    sequence_id = Column(
        UUID(as_uuid=True),
        ForeignKey("factorforge.sequences.sequence_id"),
    )
    algorithm_version = Column(String(20), nullable=False)
    execution_time_sec = Column(DECIMAL(8, 3))
    # Text arrays; save_optimization passes lists here.
    avoided_motifs = Column(ARRAY(Text))
    warnings = Column(ARRAY(Text))
    created_at = Column(TIMESTAMP, server_default=func.now())
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def save_optimization(
    study_number: str,
    protein_name: str,
    input_sequence: str,
    optimized_sequence: str,
    metrics: Dict,
    algorithm_version: str = "2.1.0",
    organism: str = "nicotiana_benthamiana",
) -> str:
    """Persist one optimization run and return the new batch id.

    A ``Batch`` row, two ``Sequence`` rows (input and optimized) and one
    ``OptimizationResult`` row are written in a single transaction.

    Args:
        study_number: Study identifier stored on the batch (unique column).
        protein_name: Stored as the batch's target protein.
        input_sequence: Sequence stored with type ``"input"``.
        optimized_sequence: Sequence stored with type ``"optimized"``.
        metrics: Mapping read with ``.get`` for ``gc_content``, ``cai``,
            ``tm``, ``execution_time``, ``avoided_motifs`` and ``warnings``;
            the whole mapping is also stored as the optimized sequence's
            metadata.
        algorithm_version: Version string stored on the result row.
        organism: Organism stored on the batch.

    Returns:
        The batch id as a string.
    """
    with SessionLocal() as session:
        batch = Batch(
            study_number=study_number,
            organism=organism,
            target_protein=protein_name,
            status="completed",
        )
        session.add(batch)
        # Flush so the batch row is inserted and its primary key is populated.
        session.flush()
        # Capture the key now: commit() expires loaded attributes by default,
        # so reading it afterwards would need another round trip (or fail on a
        # detached instance).
        batch_id = str(batch.batch_id)

        input_seq = Sequence(
            batch_id=batch.batch_id,
            sequence_type="input",
            sequence_data=input_sequence,
        )
        output_seq = Sequence(
            batch_id=batch.batch_id,
            sequence_type="optimized",
            sequence_data=optimized_sequence,
            gc_content=metrics.get("gc_content"),
            cai=metrics.get("cai"),
            tm=metrics.get("tm"),
            metadata_=metrics,
        )
        session.add_all([input_seq, output_seq])
        # Flush again so output_seq.sequence_id exists for the result row.
        session.flush()

        result = OptimizationResult(
            batch_id=batch.batch_id,
            sequence_id=output_seq.sequence_id,
            algorithm_version=algorithm_version,
            execution_time_sec=metrics.get("execution_time"),
            avoided_motifs=metrics.get("avoided_motifs", []),
            warnings=metrics.get("warnings", []),
        )
        session.add(result)
        session.commit()

    return batch_id
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def get_batch(study_number: str) -> Optional[Dict]:
    """Retrieve a batch and a summary of its sequences by study number.

    Args:
        study_number: Value matched against ``Batch.study_number``.

    Returns:
        ``None`` when no batch matches. Otherwise a dict with ``batch_id``,
        ``study_number``, ``protein``, ``status`` and ``sequences``; each
        sequence entry holds its ``type``, a ``data`` preview of at most 50
        characters (followed by ``"..."`` only when the stored sequence is
        longer than that), and ``gc`` / ``cai`` as floats or ``None``.
    """
    preview_length = 50

    def _preview(data: str) -> str:
        # Mark truncation only when something was actually cut off.
        if len(data) > preview_length:
            return f"{data[:preview_length]}..."
        return data

    with SessionLocal() as session:
        batch = (
            session.query(Batch)
            .filter(Batch.study_number == study_number)
            .first()
        )
        if not batch:
            return None

        sequences = session.query(Sequence).filter(Sequence.batch_id == batch.batch_id).all()

        return {
            "batch_id": str(batch.batch_id),
            "study_number": batch.study_number,
            "protein": batch.target_protein,
            "status": batch.status,
            "sequences": [
                {
                    "type": seq.sequence_type,
                    "data": _preview(seq.sequence_data),
                    # DECIMAL columns are converted to plain floats.
                    "gc": float(seq.gc_content) if seq.gc_content is not None else None,
                    "cai": float(seq.cai) if seq.cai is not None else None,
                }
                for seq in sequences
            ],
        }
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Optimization Engines"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Type
|
|
6
|
+
|
|
7
|
+
from factorforge.core.interfaces import OptimizerEngine
|
|
8
|
+
|
|
9
|
+
from .registry import EngineRegistry
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _load_v1() -> Type[OptimizerEngine]:
|
|
13
|
+
raise ImportError(
|
|
14
|
+
"FactorForge v1 is archived. Install with: pip install factorforge[v1]"
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _load_v3() -> Type[OptimizerEngine]:
    """Lazy loader for the v3 engine: import it on demand and return the class."""
    # The import lives inside the function so the v3 subpackage is only loaded
    # when this loader is actually called.
    from .v3 import V3Optimizer as engine_class

    return engine_class  # type: ignore[return-value]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def register_builtin_engines() -> None:
    """Register the bundled engines: v2 eagerly, v1 and v3 through lazy loaders."""
    from .v2 import RuleBasedOptimizer

    v2_metadata = {
        "version": "3.0.0",
        "engine_type": "rule_based",
        "role": "legacy_fallback",
        "stable": True,
    }
    EngineRegistry.register("v2", RuleBasedOptimizer, metadata=v2_metadata)

    # (name, loader, metadata) for every engine that is only loaded on demand;
    # the registration order matches the tuple order.
    lazy_engines = (
        (
            "v1",
            _load_v1,
            {
                "version": "archived",
                "engine_type": "rule_based",
                "role": "archived",
                "stable": False,
            },
        ),
        (
            "v3",
            _load_v3,
            {
                "version": "alpha",
                "engine_type": "ml",
                "role": "experimental",
                "stable": False,
            },
        ),
    )
    for engine_name, loader, engine_metadata in lazy_engines:
        EngineRegistry.register_lazy(engine_name, loader, metadata=engine_metadata)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
__all__ = ["EngineRegistry", "register_builtin_engines"]
|
|
File without changes
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Machine-learning-based codon optimization for plants.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import math
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, List, Tuple
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import torch
|
|
14
|
+
from transformers import BartForConditionalGeneration
|
|
15
|
+
|
|
16
|
+
# DNA codon -> one-letter amino acid, with "*" marking stop codons.
# Codons are enumerated with every base position cycling through T, C, A, G
# (third position fastest); the string supplies the amino acids in that same
# order, 16 codons per line.
CODON_TO_AA = dict(
    zip(
        (
            first + second + third
            for first in "TCAG"
            for second in "TCAG"
            for third in "TCAG"
        ),
        "FFLLSSSSYY**CC*W"
        "LLLLPPPPHHQQRRRR"
        "IIIMTTTTNNKKSSRR"
        "VVVVAAAADDEEGGGG",
    )
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class CodonTokenizer:
    """Translate between DNA codons and integer token ids using a fixed vocabulary."""

    def __init__(self, token_map: Dict[str, int]):
        """Store the vocabulary and look up the five special token ids.

        Args:
            token_map: Token string -> id. It must contain ``[PAD]``,
                ``[UNK]``, ``[MASK]``, ``[START]`` and ``[END]``; a missing
                one raises ``KeyError``.
        """
        self.token_to_id = token_map
        # Reverse lookup used when decoding.
        self.id_to_token = dict(zip(token_map.values(), token_map.keys()))
        self.pad_token_id = token_map["[PAD]"]
        self.unk_token_id = token_map["[UNK]"]
        self.mask_token_id = token_map["[MASK]"]
        self.start_token_id = token_map["[START]"]
        self.end_token_id = token_map["[END]"]

    @classmethod
    def from_json(cls, path: Path) -> "CodonTokenizer":
        """Build a tokenizer from a JSON file holding the token -> id mapping.

        Raises:
            ValueError: If any of the five special tokens is absent.
        """
        with path.open("r", encoding="utf-8") as handle:
            vocabulary = json.load(handle)
        missing = [
            special
            for special in ("[PAD]", "[UNK]", "[MASK]", "[START]", "[END]")
            if special not in vocabulary
        ]
        if missing:
            raise ValueError(f"Tokenizer missing special tokens: {missing}")
        return cls(vocabulary)

    def encode_dna(self, dna_seq: str) -> List[int]:
        """Return ``[START, codon ids..., END]`` for a DNA string.

        The input is upper-cased and read three bases at a time. Codons that
        are not in the vocabulary map to the ``[UNK]`` id, and a trailing
        fragment shorter than three bases is dropped.
        """
        bases = dna_seq.upper()
        whole = len(bases) - len(bases) % 3
        unknown = self.unk_token_id
        codon_ids = [
            self.token_to_id.get(bases[start : start + 3], unknown)
            for start in range(0, whole, 3)
        ]
        return [self.start_token_id, *codon_ids, self.end_token_id]

    def decode_dna(self, ids: List[int]) -> str:
        """Join the codons behind *ids* into one DNA string.

        Ids that are unknown, or whose token is not exactly three characters
        drawn from A/C/G/T (for example the special tokens), are skipped.
        """
        pieces: List[str] = []
        for raw_id in ids:
            token = self.id_to_token.get(int(raw_id))
            if not token or len(token) != 3:
                continue
            if any(base not in "ACGT" for base in token):
                continue
            pieces.append(token)
        return "".join(pieces)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class PlantCodonOptimizer:
    """
    Optimize codon usage using a trained BART model and codon frequency table.

    The codon table supplies, per amino acid, the most frequent codon (used to
    build the baseline DNA handed to the model) and the relative codon weights
    used by :meth:`calculate_cai`.
    """

    # Stop codons carry no weight (see _build_codon_weights), so the CAI
    # calculation skips them. These are the codons mapped to "*" in this
    # module's CODON_TO_AA table.
    _STOP_CODONS = frozenset(("TAA", "TAG", "TGA"))

    def __init__(
        self,
        model_path: str,
        codon_table_path: str,
        tokenizer_path: str,
        organism: str = "N.benthamiana",
    ) -> None:
        """Load the model, the codon table and the tokenizer.

        Args:
            model_path: Passed to ``BartForConditionalGeneration.from_pretrained``.
            codon_table_path: CSV file with ``Codon`` and ``Frequency`` columns.
            tokenizer_path: JSON vocabulary for :class:`CodonTokenizer`.
            organism: Label written into generated reports.
        """
        self.organism = organism
        # Use the GPU when torch reports one, otherwise the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = BartForConditionalGeneration.from_pretrained(model_path).to(self.device)
        self.model.eval()

        self.codon_table = self._load_codon_table(Path(codon_table_path))
        self.tokenizer = CodonTokenizer.from_json(Path(tokenizer_path))
        self.codon_weights = self._build_codon_weights(self.codon_table)
        self.best_codon_for_aa = self._best_codon_map(self.codon_table)

    def optimize(self, protein_sequence: str, beam_size: int = 5) -> str:
        """
        Generate an optimized DNA sequence for a protein input.

        The protein is validated, reverse-translated with the most frequent
        codon per amino acid, tokenized and passed to the model's beam search.

        Args:
            protein_sequence: Amino-acid string; whitespace and case are ignored.
            beam_size: Number of beams passed to ``generate``.

        Returns:
            The DNA string decoded from the first generated sequence.

        Raises:
            ValueError: If the protein is empty, contains an invalid letter,
                or has an amino acid without a codon in the codon table.
        """
        protein_seq = self._normalize_protein(protein_sequence)
        baseline_dna = self._reverse_translate(protein_seq)
        input_ids = torch.tensor(
            [self.tokenizer.encode_dna(baseline_dna)],
            dtype=torch.long,
            device=self.device,
        )
        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
        # Generation is pinned to exactly the input's token length.
        max_length = input_ids.shape[1]

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=beam_size,
                max_length=max_length,
                min_length=max_length,
                early_stopping=True,
            )
        return self.tokenizer.decode_dna(outputs[0].tolist())

    def calculate_cai(self, dna_sequence: str) -> float:
        """
        Calculate Codon Adaptation Index using codon frequencies.

        The result is the geometric mean of the relative weights of the
        sequence's codons. Stop codons are skipped because they have no
        weight; without that, any coding sequence that ends in a stop codon
        would score 0.0. A trailing fragment shorter than three bases is
        ignored.

        Returns:
            The CAI, or 0.0 when there is no scorable codon or when any
            non-stop codon has no positive weight (unknown codon or zero
            frequency).
        """
        seq = dna_sequence.upper()
        log_total = 0.0
        scored = 0
        for start in range(0, len(seq) - len(seq) % 3, 3):
            codon = seq[start : start + 3]
            if codon in self._STOP_CODONS:
                continue
            weight = self.codon_weights.get(codon, 0.0)
            if weight <= 0:
                return 0.0
            log_total += math.log(weight)
            scored += 1
        if scored == 0:
            return 0.0
        return math.exp(log_total / scored)

    def compare_sequences(self, original_dna: str, optimized_dna: str) -> Dict[str, float]:
        """
        Compare codon-by-codon and return metrics.

        Only the codons both sequences have in common (by position) are
        compared. CAI and GC content are computed on each full sequence.

        Returns:
            Dict with ``total_codons``, ``changed_codons``, ``change_rate``
            (percent), ``original_cai``, ``optimized_cai``,
            ``cai_improvement``, ``original_gc`` and ``optimized_gc``
            (GC values in percent).
        """
        original = original_dna.upper()
        optimized = optimized_dna.upper()
        total_codons = min(len(original), len(optimized)) // 3
        changed = sum(
            1
            for start in range(0, total_codons * 3, 3)
            if original[start : start + 3] != optimized[start : start + 3]
        )

        original_cai = self.calculate_cai(original)
        optimized_cai = self.calculate_cai(optimized)
        return {
            "total_codons": total_codons,
            "changed_codons": changed,
            "change_rate": (changed / total_codons * 100) if total_codons else 0.0,
            "original_cai": original_cai,
            "optimized_cai": optimized_cai,
            "cai_improvement": optimized_cai - original_cai,
            "original_gc": self._gc_content(original),
            "optimized_gc": self._gc_content(optimized),
        }

    def generate_report(
        self,
        protein_name: str,
        protein_seq: str,
        original_dna: str,
        optimized_dna: str,
        output_path: str,
    ) -> None:
        """
        Generate a formatted text report for the optimization.

        Missing parent directories of *output_path* are created and the file
        is written as UTF-8, replacing any existing file.
        """
        metrics = self.compare_sequences(original_dna, optimized_dna)
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)
        lines = [
            "Codon Optimization Report\n",
            "=" * 60 + "\n",
            f"Organism: {self.organism}\n",
            f"Protein: {protein_name}\n",
            f"Protein length: {len(protein_seq)} aa\n\n",
            "Metrics\n",
            "-" * 60 + "\n",
            f"Total codons: {metrics['total_codons']}\n",
            f"Changed codons: {metrics['changed_codons']}\n",
            f"Change rate: {metrics['change_rate']:.2f}%\n",
            f"Original CAI: {metrics['original_cai']:.4f}\n",
            f"Optimized CAI: {metrics['optimized_cai']:.4f}\n",
            f"CAI improvement: {metrics['cai_improvement']:.4f}\n",
            f"Original GC: {metrics['original_gc']:.2f}%\n",
            f"Optimized GC: {metrics['optimized_gc']:.2f}%\n\n",
            "Original DNA\n",
            "-" * 60 + "\n",
            original_dna + "\n\n",
            "Optimized DNA\n",
            "-" * 60 + "\n",
            optimized_dna + "\n",
        ]
        with output.open("w", encoding="utf-8") as handle:
            handle.writelines(lines)

    def _load_codon_table(self, path: Path) -> Dict[str, float]:
        """Read the codon frequency CSV into a ``codon -> frequency`` dict.

        Column names are matched case-insensitively. Codons are stripped and
        upper-cased, and entries that are not three characters long are
        dropped. Frequencies that cannot be parsed, or that are NaN or
        infinite (for example an empty cell), are stored as 0.0 so they cannot
        poison the max/ratio arithmetic downstream.

        Raises:
            FileNotFoundError: If *path* does not exist.
            ValueError: If the ``Codon`` or ``Frequency`` column is missing.
        """
        if not path.exists():
            raise FileNotFoundError(f"Codon table not found: {path}")
        df = pd.read_csv(path)
        columns = {col.lower(): col for col in df.columns}
        if "codon" not in columns or "frequency" not in columns:
            raise ValueError("Codon table must have columns: Codon, Frequency")
        codon_col = columns["codon"]
        freq_col = columns["frequency"]
        codon_freq: Dict[str, float] = {}
        for raw_codon, raw_freq in zip(df[codon_col], df[freq_col]):
            codon = str(raw_codon).strip().upper()
            try:
                freq = float(raw_freq)
            except (TypeError, ValueError):
                freq = 0.0
            if not math.isfinite(freq):
                freq = 0.0
            if len(codon) == 3:
                codon_freq[codon] = freq
        return codon_freq

    def _build_codon_weights(self, codon_freq: Dict[str, float]) -> Dict[str, float]:
        """Return each codon's frequency relative to its amino acid's best codon.

        Stop codons and codons missing from ``CODON_TO_AA`` get no weight.
        When every codon of an amino acid has frequency 0, all of its weights
        are 0.0.
        """
        by_aa: Dict[str, List[Tuple[str, float]]] = {}
        for codon, freq in codon_freq.items():
            aa = CODON_TO_AA.get(codon, "*")
            if aa == "*":
                continue
            by_aa.setdefault(aa, []).append((codon, freq))

        weights: Dict[str, float] = {}
        for codons in by_aa.values():
            max_freq = max(freq for _, freq in codons)
            for codon, freq in codons:
                weights[codon] = freq / max_freq if max_freq > 0 else 0.0
        return weights

    def _best_codon_map(self, codon_freq: Dict[str, float]) -> Dict[str, str]:
        """Return ``amino acid -> most frequent codon``.

        Stop codons and unknown codons are skipped; on a tie the codon seen
        first in *codon_freq* wins.
        """
        best: Dict[str, Tuple[str, float]] = {}
        for codon, freq in codon_freq.items():
            aa = CODON_TO_AA.get(codon, "*")
            if aa == "*":
                continue
            current = best.get(aa)
            if current is None or freq > current[1]:
                best[aa] = (codon, freq)
        return {aa: codon for aa, (codon, _) in best.items()}

    def _reverse_translate(self, protein_seq: str) -> str:
        """Map every amino acid to its most frequent codon and join the result.

        Raises:
            ValueError: If an amino acid has no codon in ``best_codon_for_aa``.
        """
        codons = []
        for aa in protein_seq:
            codon = self.best_codon_for_aa.get(aa)
            if codon is None:
                raise ValueError(f"No codon mapping for amino acid: {aa}")
            codons.append(codon)
        return "".join(codons)

    def _normalize_protein(self, protein_sequence: str) -> str:
        """Strip all whitespace, upper-case and validate a protein sequence.

        Every kind of whitespace is removed (spaces, tabs, ``\\n``, ``\\r``),
        so multi-line input with Windows line endings is accepted.

        Raises:
            ValueError: If nothing is left, or if a character is not one of
                the 20 amino-acid letters.
        """
        seq = "".join(protein_sequence.split()).upper()
        valid = set("ACDEFGHIKLMNPQRSTVWY")
        if not seq:
            raise ValueError("Protein sequence is empty.")
        invalid = {ch for ch in seq if ch not in valid}
        if invalid:
            raise ValueError(f"Invalid amino acids found: {''.join(sorted(invalid))}")
        return seq

    @staticmethod
    def _gc_content(dna_sequence: str) -> float:
        """Return the share of G and C bases in percent (0.0 for an empty string)."""
        seq = dna_sequence.upper()
        if not seq:
            return 0.0
        gc = seq.count("G") + seq.count("C")
        return (gc / len(seq)) * 100.0
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Engine Registry
|
|
3
|
+
|
|
4
|
+
Registry that dynamically registers and manages engines
|
|
5
|
+
No changes needed to existing code when adding new engines (v3, v4, etc.)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Callable, Type
|
|
12
|
+
|
|
13
|
+
from factorforge.core.interfaces import OptimizerEngine
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class EngineRegistry:
    """Optimization engine registry.

    Engines are registered under a name, either as a class or as a lazy
    loader that returns the class on first use. ``get`` instantiates a class
    once and then keeps returning that instance. All state lives in
    class-level dictionaries, so it is shared by every user of the class.
    """

    _engines: dict[str, Type[OptimizerEngine]] = {}
    _instances: dict[str, OptimizerEngine] = {}
    _lazy_loaders: dict[str, Callable[[], Type[OptimizerEngine]]] = {}
    _metadata: dict[str, dict[str, object]] = {}

    @classmethod
    def _store_metadata(cls, name: str, metadata: dict[str, object] | None) -> None:
        """Save a copy of *metadata*; with ``None`` keep whatever is already stored."""
        if metadata is not None:
            cls._metadata[name] = dict(metadata)
        else:
            cls._metadata.setdefault(name, {})

    @classmethod
    def register(
        cls,
        name: str,
        engine_class: Type[OptimizerEngine],
        metadata: dict[str, object] | None = None,
    ) -> None:
        """
        Register an engine

        Registering a different class under an existing name discards the
        cached instance of the old class, so the next ``get`` builds the new
        one. Registering the same class again keeps the cached instance.

        Args:
            name: Engine identifier (e.g., "v1", "v2", "v3")
            engine_class: Class implementing OptimizerEngine
            metadata: Optional engine metadata.
        """
        if cls._engines.get(name) is not engine_class:
            # A singleton built from the previous class would otherwise keep
            # being returned by get().
            cls._instances.pop(name, None)
        cls._engines[name] = engine_class
        cls._store_metadata(name, metadata)
        logger.debug("Registered engine: %s (%s)", name, engine_class.__name__)

    @classmethod
    def register_lazy(
        cls,
        name: str,
        loader: Callable[[], Type[OptimizerEngine]],
        metadata: dict[str, object] | None = None,
    ) -> None:
        """
        Register a lazy engine loader.

        Args:
            name: Engine identifier (e.g., "v1", "v3")
            loader: Callable that returns the engine class on demand
            metadata: Optional engine metadata.
        """
        cls._lazy_loaders[name] = loader
        cls._store_metadata(name, metadata)

    @classmethod
    def get(cls, name: str) -> OptimizerEngine:
        """
        Get engine instance (singleton)

        A lazily registered engine is loaded and registered on first access;
        any exception raised by its loader propagates to the caller.

        Args:
            name: Engine identifier

        Returns:
            OptimizerEngine instance

        Raises:
            ValueError: If *name* is neither registered nor lazily loadable.
        """
        if name not in cls._engines and name in cls._lazy_loaders:
            engine_class = cls._lazy_loaders[name]()
            cls.register(name, engine_class, metadata=cls._metadata.get(name))

        if name not in cls._engines:
            # List lazy engines as well: they are available, just not loaded.
            known = list(cls._engines)
            known.extend(lazy for lazy in cls._lazy_loaders if lazy not in cls._engines)
            available = ", ".join(known)
            raise ValueError(f"❌ Engine '{name}' not found. Available: {available}")

        # Singleton pattern
        if name not in cls._instances:
            cls._instances[name] = cls._engines[name]()

        return cls._instances[name]

    @classmethod
    def list_engines(cls) -> dict[str, dict[str, str]]:
        """
        List available engines

        Loaded engines are instantiated (through ``get``) to read their
        ``version`` and ``name``; lazy engines that are not loaded yet are
        reported with placeholders and are not loaded.

        Returns:
            dict: {name: {version, name}}
        """
        result: dict[str, dict[str, str]] = {}
        # Iterate over a snapshot so the registry dict is never walked while
        # get() runs.
        for name in list(cls._engines):
            instance = cls.get(name)
            result[name] = {
                "version": instance.version,
                "name": instance.name,
            }
        for name in cls._lazy_loaders:
            result.setdefault(
                name,
                {
                    "version": "lazy",
                    "name": "lazy (not loaded)",
                },
            )
        return result

    @classmethod
    def list_with_metadata(cls) -> dict[str, dict[str, object]]:
        """List registered and lazy engines with metadata.

        Stored metadata wins; ``version`` and ``name`` are only filled in
        when the metadata does not already define them. Names are returned in
        sorted order.
        """
        result: dict[str, dict[str, object]] = {}
        all_names = set(cls._engines) | set(cls._lazy_loaders) | set(cls._metadata)
        for name in sorted(all_names):
            metadata = dict(cls._metadata.get(name, {}))
            if name in cls._engines:
                instance = cls.get(name)
                metadata.setdefault("version", instance.version)
                metadata.setdefault("name", instance.name)
            elif name in cls._lazy_loaders:
                metadata.setdefault("version", "lazy")
                metadata.setdefault("name", "lazy (not loaded)")
            result[name] = metadata
        return result

    @classmethod
    def clear(cls) -> None:
        """Reset registry (for tests)"""
        cls._engines.clear()
        cls._instances.clear()
        cls._lazy_loaders.clear()
        cls._metadata.clear()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
FactorForge v1 — Archived / Frozen
|
|
3
|
+
|
|
4
|
+
v1 (BPE Tokenizer) is preserved for research reference only.
|
|
5
|
+
It is not maintained and not recommended for new work.
|
|
6
|
+
|
|
7
|
+
Use v2 (rule-based, production) instead:
|
|
8
|
+
from factorforge.engines.v2 import RuleBasedOptimizer
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
# Importing this package is refused on purpose: fail immediately and tell the
# caller what to use instead.
_ARCHIVED_MESSAGE = (
    "FactorForge v1 is archived and not available as an installed package. "
    "Install with optional v1 extras to use: pip install factorforge[v1]. "
    "For production use, switch to v2: from factorforge.engines.v2 import RuleBasedOptimizer"
)
raise ImportError(_ARCHIVED_MESSAGE)
|