factorforge-cds 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factorforge/__init__.py +19 -0
- factorforge/__main__.py +8 -0
- factorforge/cli/__init__.py +5 -0
- factorforge/cli/legacy_cli.py +157 -0
- factorforge/cli/main.py +305 -0
- factorforge/core/interfaces/__init__.py +7 -0
- factorforge/core/interfaces/exporter.py +13 -0
- factorforge/core/interfaces/optimizer.py +85 -0
- factorforge/core/interfaces/validator.py +9 -0
- factorforge/database.py +150 -0
- factorforge/engines/__init__.py +60 -0
- factorforge/engines/ml/__init__.py +0 -0
- factorforge/engines/ml/plant_optimizer.py +325 -0
- factorforge/engines/registry.py +141 -0
- factorforge/engines/v1_archived/__init__.py +15 -0
- factorforge/engines/v2/__init__.py +13 -0
- factorforge/engines/v2/codon_table_builder.py +107 -0
- factorforge/engines/v2/construct_builder.py +403 -0
- factorforge/engines/v2/exporter.py +455 -0
- factorforge/engines/v2/optimizer.py +190 -0
- factorforge/engines/v2/pipeline.py +275 -0
- factorforge/engines/v2/rules/__init__.py +3 -0
- factorforge/engines/v2/rules/domesticator.py +403 -0
- factorforge/engines/v2/rules/reverse_translator.py +765 -0
- factorforge/engines/v2/rules/rule_engine.py +867 -0
- factorforge/engines/v2/scoring.py +232 -0
- factorforge/engines/v2/utils.py +231 -0
- factorforge/engines/v2/validator.py +383 -0
- factorforge/engines/v3/__init__.py +12 -0
- factorforge/engines/v3/explain.py +119 -0
- factorforge/engines/v3/inference/__init__.py +6 -0
- factorforge/engines/v3/inference/constrained_decoder.py +80 -0
- factorforge/engines/v3/inference/v2_adapter.py +72 -0
- factorforge/engines/v3/metrics.py +145 -0
- factorforge/engines/v3/modeling_bart_decoder.py +127 -0
- factorforge/engines/v3/pipeline.py +192 -0
- factorforge/engines/v3/synonym_mask.py +61 -0
- factorforge/engines/v3/tokenizer.py +192 -0
- factorforge/ml/__init__.py +33 -0
- factorforge/ml/feasibility.py +199 -0
- factorforge/ml/metrics.py +295 -0
- factorforge/utils/__init__.py +31 -0
- factorforge/utils/construct_id.py +8 -0
- factorforge/utils/exceptions.py +32 -0
- factorforge/utils/sequence_validator.py +189 -0
- factorforge/utils/validation.py +104 -0
- factorforge_cds-3.0.0.dist-info/METADATA +475 -0
- factorforge_cds-3.0.0.dist-info/RECORD +52 -0
- factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
- factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
- factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
- factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,455 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Exporter for FactorForge v2
|
|
3
|
+
GenBank and FASTA export module (P0-5)
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from io import StringIO
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SequenceExporter:
    """
    Export optimized sequences in GenBank and FASTA formats

    Features:
    - GenBank format (with metadata)
    - FASTA format (key info in header)
    - Plain-text optimization report
    - Reproducibility via run ID
    """

    def __init__(self) -> None:
        """Initialize (the exporter keeps no state)."""

    def _resolve_run_id(self, sequence: str, metadata: dict[str, Any]) -> str:
        """Return ``metadata["run_id"]`` when present, else derive one.

        Equivalent to ``metadata.get("run_id", <generated>)`` but only hashes
        the sequence when the caller did not supply an ID.
        """
        if "run_id" in metadata:
            return metadata["run_id"]
        return self.generate_run_id(sequence, metadata)

    def generate_run_id(self, sequence: str, params: dict[str, Any]) -> str:
        """
        Generate a reproducible run_id

        The ID is derived from the sequence plus the ``profile`` and
        ``assembly_standard`` entries of ``params`` (defaults: "balanced" and
        "none"); other parameters do not influence it.

        Args:
            sequence: DNA sequence
            params: Optimization parameters

        Returns:
            8-character hash string

        Raises:
            None.

        Examples:
            >>> exporter = SequenceExporter()
            >>> run_id = exporter.generate_run_id("ATG", {"profile": "balanced"})
            >>> len(run_id) == 8
            True
        """
        # MD5 is used purely as a short content fingerprint, not for security.
        # The digest algorithm must stay fixed so existing run IDs remain stable.
        content = f"{sequence}_{params.get('profile', 'balanced')}_{params.get('assembly_standard', 'none')}"
        hash_obj = hashlib.md5(content.encode())
        return hash_obj.hexdigest()[:8]

    def export_genbank(
        self,
        sequence: str,
        metadata: dict[str, Any],
        output_file: str | None = None,
    ) -> str:
        """
        Export in GenBank format

        Args:
            sequence: Optimized DNA sequence
            metadata: {
                "protein_seq": "MAKLFG...",
                "profile": "Balanced",
                "cai": 0.87,
                "gc": 51.2,
                "run_id": "abc12345",
                "timestamp": "2026-01-22T12:00:00",
                "organism": "Nicotiana benthamiana",
                "gene_name": "GFP",
                "violations_fixed": [...],
                "warnings": [...]
            }
            output_file: Output file path (returns string if None)

        Returns:
            GenBank-formatted string, or a confirmation message when
            ``output_file`` is given.

        Raises:
            ImportError: If Biopython is not installed.

        Examples:
            >>> exporter = SequenceExporter()
            >>> gb = exporter.export_genbank("ATG", {"protein_seq": "M"})
            >>> "LOCUS" in gb
            True
        """
        # Biopython is an optional dependency, so it is imported lazily.
        try:
            from Bio import SeqIO
            from Bio.Seq import Seq
            from Bio.SeqFeature import FeatureLocation, SeqFeature
            from Bio.SeqRecord import SeqRecord
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError("Biopython is required: pip install biopython") from exc

        # Set defaults
        run_id = self._resolve_run_id(sequence, metadata)
        timestamp = metadata.get("timestamp", datetime.now().strftime("%Y%m%d"))
        gene_name = metadata.get("gene_name", "optimized_gene")
        organism = metadata.get("organism", "Nicotiana benthamiana")

        # Build locus ID
        locus_id = f"PFORM_{run_id}_{timestamp}"

        # Build SeqRecord
        record = SeqRecord(
            Seq(sequence),
            id=locus_id,
            name=gene_name[:16],  # GenBank name is limited to 16 chars
            description=f"Codon-optimized for {organism}",
        )

        # Add annotations
        record.annotations["molecule_type"] = "DNA"
        record.annotations["topology"] = "linear"
        record.annotations["date"] = datetime.now().strftime("%d-%b-%Y").upper()
        record.annotations["organism"] = organism

        # Build COMMENT section
        comment_lines = [
            "FactorForge v2.0 - Plant Codon Optimization Tool",
            f"Run ID: {run_id}",
            f"Timestamp: {metadata.get('timestamp', datetime.now().isoformat())}",
            f"Profile: {metadata.get('profile', 'N/A')}",
            f"CAI: {metadata.get('cai', 0.0):.3f}",
            f"GC%: {metadata.get('gc', 0.0):.1f}",
        ]

        # Assembly standard info
        if metadata.get("assembly_standard"):
            comment_lines.append(f"Assembly Standard: {metadata['assembly_standard']}")

        # Fixed violations
        violations_fixed = metadata.get("violations_fixed", [])
        if violations_fixed:
            comment_lines.append(f"Violations Fixed: {len(violations_fixed)}")
            comment_lines.extend(
                f"  - {v.get('type', 'unknown')} at position {v.get('position', 'N/A')}"
                for v in violations_fixed[:5]  # Show at most 5
            )

        # Warnings
        warnings = metadata.get("warnings", [])
        if warnings:
            comment_lines.append(f"Warnings: {len(warnings)}")
            comment_lines.extend(
                f"  - {w.get('message', 'N/A')}" for w in warnings[:3]  # Show at most 3
            )

        record.annotations["comment"] = "\n".join(comment_lines)

        # Add CDS feature (spans the whole sequence)
        if metadata.get("protein_seq"):
            cds_feature = SeqFeature(  # type: ignore[no-untyped-call]
                FeatureLocation(0, len(sequence)),  # type: ignore[no-untyped-call]
                type="CDS",
                qualifiers={
                    # Report the organism actually targeted rather than a
                    # hard-coded name; the default is unchanged.
                    "codon_opt": [organism],
                    "translation": [metadata["protein_seq"]],
                    "note": [
                        f"CAI={metadata.get('cai', 0.0):.3f}, GC={metadata.get('gc', 0.0):.1f}%"
                    ],
                    "gene": [gene_name],
                },
            )
            record.features.append(cds_feature)

        # Additional feature annotations (promoter, terminator, etc.)
        if metadata.get("features"):
            for feat in metadata["features"]:
                feature = SeqFeature(  # type: ignore[no-untyped-call]
                    FeatureLocation(feat["start"], feat["end"]),  # type: ignore[no-untyped-call]
                    type=feat["type"],
                    qualifiers=feat.get("qualifiers", {}),
                )
                record.features.append(feature)

        # Write file or return string
        if output_file:
            SeqIO.write(record, output_file, "genbank")
            return f"GenBank file written to {output_file}"

        output = StringIO()
        SeqIO.write(record, output, "genbank")
        return output.getvalue()

    def export_fasta(
        self,
        sequence: str,
        metadata: dict[str, Any],
        output_file: str | None = None,
        line_width: int = 80,
    ) -> str:
        """
        Export in FASTA format

        Args:
            sequence: Optimized DNA sequence
            metadata: Metadata (same as export_genbank)
            output_file: Output file path (returns string if None)
            line_width: Line wrap width (0 or negative for no wrapping)

        Returns:
            FASTA-formatted string, or a confirmation message when
            ``output_file`` is given.

        Raises:
            None.

        Examples:
            >>> exporter = SequenceExporter()
            >>> fasta = exporter.export_fasta("ATG", {"gene_name": "x"})
            >>> fasta.startswith(">")
            True
        """
        # Set defaults
        run_id = self._resolve_run_id(sequence, metadata)
        gene_name = metadata.get("gene_name", "optimized_gene")

        # Build header: pipe-separated key info after the record ID
        header_parts = [
            f"PFORM_{run_id}",
            f"gene={gene_name}",
            f"CAI={metadata.get('cai', 0.0):.3f}",
            f"GC={metadata.get('gc', 0.0):.1f}",
            f"profile={metadata.get('profile', 'N/A')}",
        ]

        if metadata.get("assembly_standard"):
            header_parts.append(f"assembly={metadata['assembly_standard']}")

        header = ">{}".format("|".join(header_parts))

        # Wrap sequence
        if line_width > 0:
            seq_formatted = "\n".join(
                sequence[i : i + line_width] for i in range(0, len(sequence), line_width)
            )
        else:
            seq_formatted = sequence

        fasta_content = f"{header}\n{seq_formatted}\n"

        # Write file or return string
        if output_file:
            # Explicit encoding keeps output independent of the platform
            # locale, consistent with export_report.
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(fasta_content)
            return f"FASTA file written to {output_file}"

        return fasta_content

    def export_batch(
        self,
        sequences: list[dict[str, Any]],
        output_format: str = "fasta",
        output_file: str | None = None,
    ) -> str:
        """
        Export batch sequences

        FASTA batches are combined into one multi-record document. GenBank
        batches are written as one file per sequence, named
        ``<base>_<NNN><ext>`` from ``output_file``.

        Args:
            sequences: [{"sequence": "ATG...", "metadata": {...}}, ...]
            output_format: "fasta" or "genbank" (case-insensitive)
            output_file: Output file path

        Returns:
            Combined FASTA text when ``output_format`` is "fasta" and no
            ``output_file`` is given; otherwise an output message.

        Raises:
            ValueError: Unsupported format or missing output_file for GenBank batch.

        Examples:
            >>> exporter = SequenceExporter()
            >>> msg = exporter.export_batch([{"sequence": "ATG", "metadata": {}}])
            >>> msg.startswith(">")
            True
        """
        fmt = output_format.lower()

        if fmt == "fasta":
            # FASTA allows multiple sequences in one file
            all_fasta = [
                self.export_fasta(seq_data["sequence"], seq_data.get("metadata", {})).strip()
                for seq_data in sequences
            ]

            # One trailing newline per record; an empty batch yields empty
            # output rather than a stray blank line.
            combined = "".join(f"{entry}\n" for entry in all_fasta)

            if output_file:
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(combined)
                return f"Batch FASTA written to {output_file} ({len(sequences)} sequences)"

            return combined

        if fmt == "genbank":
            # GenBank stores each sequence in a separate file
            if not output_file:
                raise ValueError("GenBank batch export requires output_file")

            import os

            base_name, ext = os.path.splitext(output_file)

            for i, seq_data in enumerate(sequences, start=1):
                file_name = f"{base_name}_{i:03d}{ext}"
                self.export_genbank(
                    seq_data["sequence"], seq_data.get("metadata", {}), output_file=file_name
                )

            return f"Batch GenBank written ({len(sequences)} files)"

        raise ValueError(f"Unsupported format: {output_format}")

    def export_report(
        self,
        sequence: str,
        metadata: dict[str, Any],
        output_file: str | None = None,
    ) -> str:
        """
        Create a human-readable report

        Args:
            sequence: Optimized DNA sequence
            metadata: Metadata
            output_file: Output file path

        Returns:
            Report string, or a confirmation message when ``output_file``
            is given.

        Raises:
            None.

        Examples:
            >>> exporter = SequenceExporter()
            >>> report = exporter.export_report("ATG", {"gene_name": "x"})
            >>> "Optimization Report" in report
            True
        """
        run_id = self._resolve_run_id(sequence, metadata)

        report_lines = [
            "=" * 70,
            "FactorForge v2.0 - Optimization Report",
            "=" * 70,
            "",
            f"Run ID: {run_id}",
            f"Timestamp: {metadata.get('timestamp', datetime.now().isoformat())}",
            f"Gene: {metadata.get('gene_name', 'N/A')}",
            "",
            "--- Sequence Information ---",
            f"Length: {len(sequence)} bp",
            f"GC Content: {metadata.get('gc', 0.0):.1f}%",
            f"CAI: {metadata.get('cai', 0.0):.3f}",
            "",
            "--- Optimization Settings ---",
            f"Profile: {metadata.get('profile', 'N/A')}",
            f"Assembly Standard: {metadata.get('assembly_standard', 'None')}",
            f"Organism: {metadata.get('organism', 'Nicotiana benthamiana')}",
            "",
        ]

        # Violations fixed
        violations_fixed = metadata.get("violations_fixed", [])
        if violations_fixed:
            report_lines.append("--- Violations Fixed ---")
            for v in violations_fixed:
                report_lines.append(
                    f"  • {v.get('type', 'Unknown')} at position {v.get('position', 'N/A')}"
                )
                if v.get("fix_description"):
                    report_lines.append(f"    → {v['fix_description']}")
            report_lines.append("")

        # Warnings
        warnings = metadata.get("warnings", [])
        if warnings:
            report_lines.append("--- Warnings ---")
            for w in warnings:
                report_lines.append(f"  ⚠ {w.get('message', 'N/A')}")
                if w.get("suggestion"):
                    report_lines.append(f"    → {w['suggestion']}")
            report_lines.append("")

        # Quality score: test for presence, not truthiness, so a score of 0
        # is still reported.
        score = metadata.get("quality_score")
        if score is not None:
            report_lines.append("--- Quality Assessment ---")
            stars = "⭐" * int(score)
            report_lines.append(f"Overall Quality: {stars} ({score}/5)")
            report_lines.append("")

        # Sequence preview (first 120 bp at most)
        report_lines.append("--- Sequence Preview ---")
        preview_len = min(120, len(sequence))
        report_lines.append(sequence[:preview_len])
        if len(sequence) > preview_len:
            report_lines.append(f"... ({len(sequence) - preview_len} more bp)")
        report_lines.append("")

        report_lines.append("=" * 70)

        report_content = "\n".join(report_lines)

        if output_file:
            # The report contains non-ASCII symbols, so UTF-8 is required.
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(report_content)
            return f"Report written to {output_file}"

        return report_content
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
# --- Usage example ---
|
|
413
|
+
if __name__ == "__main__":
    # Demo run: exercise each export path once with a GFP example and print
    # the result (or a preview of it) to stdout.
    demo_exporter = SequenceExporter()

    # Example coding sequence and its translation
    demo_dna = "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA"
    demo_protein = "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHKVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK*"

    # One fixed violation and one warning, so every report section is shown
    demo_violation = {
        "type": "BsaI site",
        "position": 147,
        "fix_description": "Synonymous substitution R→R (CGT→AGA)",
    }
    demo_warning = {
        "message": "High local GC content at position 450-500",
        "suggestion": "Consider manual review",
    }

    demo_metadata = {
        "gene_name": "GFP",
        "protein_seq": demo_protein,
        "profile": "Balanced",
        "cai": 0.87,
        "gc": 51.2,
        "assembly_standard": "Golden Gate (BsaI)",
        "violations_fixed": [demo_violation],
        "warnings": [demo_warning],
        "quality_score": 5,
    }

    print("=== FASTA Export ===")
    fasta_text = demo_exporter.export_fasta(demo_dna, demo_metadata)
    print(fasta_text[:200])

    print("\n=== Report Export ===")
    report_text = demo_exporter.export_report(demo_dna, demo_metadata)
    print(report_text)

    print("\n=== GenBank Export ===")
    # GenBank export needs Biopython; report its absence instead of crashing.
    try:
        genbank_text = demo_exporter.export_genbank(demo_dna, demo_metadata)
    except ImportError as e:
        print(f"Biopython not installed: {e}")
    else:
        print(genbank_text[:500])
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""v2 Rule-based Optimizer Implementation"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from factorforge.core.interfaces import OptimizationResult, OptimizerEngine
|
|
8
|
+
|
|
9
|
+
from .exporter import SequenceExporter
|
|
10
|
+
from .rules.reverse_translator import OptimizationProfile, ReverseTranslator
|
|
11
|
+
from .rules.rule_engine import RuleEngine
|
|
12
|
+
from .scoring import calculate_composite_score
|
|
13
|
+
from .validator import InputValidator
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RuleBasedOptimizer(OptimizerEngine):
    """Optimizer engine built from the v2 validator, reverse translator and rule scanner."""

    name = "Rule-based"
    version = "3.0.0"

    def __init__(self) -> None:
        """Create the collaborating components with their default settings."""
        self.validator = InputValidator()
        self.translator = ReverseTranslator()  # Data files use default path
        self.rule_engine = RuleEngine()
        self.exporter = SequenceExporter()

    def optimize(
        self,
        sequence: str,
        profile: str | None = "balanced",
        **kwargs: Any,
    ) -> OptimizationResult:
        """
        Run the rule-based optimization for one sequence.

        DNA input is scored as given. Any other input type is passed to the
        reverse translator and the first candidate is used. The resulting
        DNA is then scanned by the rule engine.

        Args:
            sequence: Protein sequence or DNA sequence
            profile: Optimization profile (None falls back to "balanced")
            **kwargs: Additional settings; ``scan_mode``, ``scan_include``
                and ``scan_exclude`` are forwarded to the rule scan.

        Returns:
            OptimizationResult object

        Raises:
            ValueError: If the input sequence is invalid, the profile is
                unknown, or no candidate could be generated.

        Examples:
            >>> optimizer = RuleBasedOptimizer()
            >>> result = optimizer.optimize("MA", profile="balanced")
            >>> len(result.sequence) == 6
            True
        """
        # Step 1: input validation
        validation = self.validator.validate(sequence)
        if not validation["valid"]:
            raise ValueError(f"Invalid input sequence: {validation['errors']}")

        cleaned = validation["processed_sequence"]
        kind = validation["type"]
        if kind == "fasta":
            # FASTA is only a container; classify the payload itself.
            kind = self.validator.detect_sequence_type(cleaned).value

        # Step 2: profile normalization
        profile_value = (profile or "balanced").lower()
        try:
            opt_profile = OptimizationProfile(profile_value)
        except ValueError as exc:
            supported = ", ".join(member.value for member in OptimizationProfile)
            message = f"Unknown profile: {profile_value}. Supported profiles: {supported}"
            raise ValueError(message) from exc

        # Step 3: obtain the DNA plus its metrics
        if kind != "dna":
            candidates = self.translator.generate_candidates(
                cleaned, profile=opt_profile, n=1
            )
            if not candidates:
                raise ValueError("No candidates generated for input sequence.")
            optimized_dna = candidates[0]["sequence"]
        else:
            optimized_dna = cleaned
            cai = self.translator.calculate_cai(optimized_dna)
            gc = self.translator.calculate_gc_content(optimized_dna)
            score = calculate_composite_score(
                cai=cai, gc=gc, sequence=optimized_dna, profile=profile_value
            )
            candidates = [{"sequence": optimized_dna, "cai": cai, "gc": gc, "score": score}]

        # Step 4: rule scan (PolyA, etc.)
        scan_mode = str(kwargs.get("scan_mode", "full"))
        scan_results = self.rule_engine.scan_all(
            optimized_dna,
            mode=scan_mode,
            include=kwargs.get("scan_include"),
            exclude=kwargs.get("scan_exclude"),
        )

        # Step 5: assemble the result
        best = candidates[0]
        violation_total = sum(len(hits) for hits in scan_results.values())
        metrics = {
            "cai": best["cai"],
            # Keep both names for compatibility across v2 tests/callers.
            "gc_content": best["gc"],
            "gc_percent": best["gc"],
            "score": best["score"],
            "violations": violation_total,
        }
        result_metadata = {
            "engine": "v2",
            "profile": profile_value,
            "scan_mode": scan_mode,
            "scan_results": scan_results,
        }

        return OptimizationResult(
            sequence=optimized_dna,
            metrics=metrics,
            metadata=result_metadata,
        )

    def optimize_batch(
        self,
        sequences: list[dict[str, str]] | list[str],
        profile: str | None = "balanced",
        **kwargs: Any,
    ) -> list[OptimizationResult]:
        """Optimize several sequences one after another.

        Each result's metadata gets an ``input_id``: the entry's ``id`` when
        given, otherwise ``seq<N>`` with N counting from 1.

        Args:
            sequences: Either list[str] or list[{"id": str, "sequence": str}].
            profile: Optimization profile.
            **kwargs: Additional optimize options.

        Returns:
            List of OptimizationResult entries in input order.
        """
        collected: list[OptimizationResult] = []

        for position, item in enumerate(sequences, start=1):
            fallback_id = f"seq{position}"
            if isinstance(item, dict):
                raw = item.get("sequence", "")
                label = item.get("id", fallback_id)
            else:
                raw = item
                label = fallback_id

            outcome = self.optimize(raw, profile=profile, **kwargs)
            outcome.metadata["input_id"] = label
            collected.append(outcome)

        return collected

    def validate(self, sequence: str) -> bool:
        """
        Check whether the input passes validation.

        Args:
            sequence: Input sequence

        Returns:
            The validator's ``valid`` entry coerced to bool

        Raises:
            None.

        Examples:
            >>> optimizer = RuleBasedOptimizer()
            >>> optimizer.validate("MA")
            True
        """
        outcome = self.validator.validate(sequence)
        return bool(outcome["valid"])

    def get_supported_profiles(self) -> list[str]:
        """
        List the values of every OptimizationProfile member.

        Returns:
            List of profile strings

        Raises:
            None.

        Examples:
            >>> optimizer = RuleBasedOptimizer()
            >>> "balanced" in optimizer.get_supported_profiles()
            True
        """
        return [member.value for member in OptimizationProfile]
|