factorforge-cds 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. factorforge/__init__.py +19 -0
  2. factorforge/__main__.py +8 -0
  3. factorforge/cli/__init__.py +5 -0
  4. factorforge/cli/legacy_cli.py +157 -0
  5. factorforge/cli/main.py +305 -0
  6. factorforge/core/interfaces/__init__.py +7 -0
  7. factorforge/core/interfaces/exporter.py +13 -0
  8. factorforge/core/interfaces/optimizer.py +85 -0
  9. factorforge/core/interfaces/validator.py +9 -0
  10. factorforge/database.py +150 -0
  11. factorforge/engines/__init__.py +60 -0
  12. factorforge/engines/ml/__init__.py +0 -0
  13. factorforge/engines/ml/plant_optimizer.py +325 -0
  14. factorforge/engines/registry.py +141 -0
  15. factorforge/engines/v1_archived/__init__.py +15 -0
  16. factorforge/engines/v2/__init__.py +13 -0
  17. factorforge/engines/v2/codon_table_builder.py +107 -0
  18. factorforge/engines/v2/construct_builder.py +403 -0
  19. factorforge/engines/v2/exporter.py +455 -0
  20. factorforge/engines/v2/optimizer.py +190 -0
  21. factorforge/engines/v2/pipeline.py +275 -0
  22. factorforge/engines/v2/rules/__init__.py +3 -0
  23. factorforge/engines/v2/rules/domesticator.py +403 -0
  24. factorforge/engines/v2/rules/reverse_translator.py +765 -0
  25. factorforge/engines/v2/rules/rule_engine.py +867 -0
  26. factorforge/engines/v2/scoring.py +232 -0
  27. factorforge/engines/v2/utils.py +231 -0
  28. factorforge/engines/v2/validator.py +383 -0
  29. factorforge/engines/v3/__init__.py +12 -0
  30. factorforge/engines/v3/explain.py +119 -0
  31. factorforge/engines/v3/inference/__init__.py +6 -0
  32. factorforge/engines/v3/inference/constrained_decoder.py +80 -0
  33. factorforge/engines/v3/inference/v2_adapter.py +72 -0
  34. factorforge/engines/v3/metrics.py +145 -0
  35. factorforge/engines/v3/modeling_bart_decoder.py +127 -0
  36. factorforge/engines/v3/pipeline.py +192 -0
  37. factorforge/engines/v3/synonym_mask.py +61 -0
  38. factorforge/engines/v3/tokenizer.py +192 -0
  39. factorforge/ml/__init__.py +33 -0
  40. factorforge/ml/feasibility.py +199 -0
  41. factorforge/ml/metrics.py +295 -0
  42. factorforge/utils/__init__.py +31 -0
  43. factorforge/utils/construct_id.py +8 -0
  44. factorforge/utils/exceptions.py +32 -0
  45. factorforge/utils/sequence_validator.py +189 -0
  46. factorforge/utils/validation.py +104 -0
  47. factorforge_cds-3.0.0.dist-info/METADATA +475 -0
  48. factorforge_cds-3.0.0.dist-info/RECORD +52 -0
  49. factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
  50. factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
  51. factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
  52. factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,455 @@
1
+ """
2
+ Exporter for FactorForge v2
3
+ GenBank and FASTA export module (P0-5)
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ from datetime import datetime
10
+ from io import StringIO
11
+ from typing import Any
12
+
13
+
14
class SequenceExporter:
    """
    Export optimized sequences in GenBank, FASTA and plain-text report formats.

    Features:
    - GenBank format (with metadata)
    - FASTA format (key info in header)
    - Reproducibility via run ID

    Every ``export_*`` method returns the rendered text when ``output_file``
    is not given, and a short confirmation message when it writes a file.
    """

    def __init__(self) -> None:
        """Initialize the exporter (it keeps no state)."""

    def generate_run_id(self, sequence: str, params: dict[str, Any]) -> str:
        """
        Generate a reproducible run_id.

        The ID is derived from the sequence plus the ``profile`` and
        ``assembly_standard`` entries of ``params`` (defaulting to
        ``"balanced"`` and ``"none"``), so identical inputs always map to the
        same ID. MD5 is used purely as a fingerprint, not for security.

        Args:
            sequence: DNA sequence
            params: Optimization parameters

        Returns:
            8-character hexadecimal hash string

        Examples:
            >>> exporter = SequenceExporter()
            >>> run_id = exporter.generate_run_id("ATG", {"profile": "balanced"})
            >>> len(run_id) == 8
            True
        """
        content = (
            f"{sequence}_{params.get('profile', 'balanced')}"
            f"_{params.get('assembly_standard', 'none')}"
        )
        return hashlib.md5(content.encode()).hexdigest()[:8]

    def export_genbank(
        self,
        sequence: str,
        metadata: dict[str, Any],
        output_file: str | None = None,
    ) -> str:
        """
        Export in GenBank format.

        Args:
            sequence: Optimized DNA sequence
            metadata: {
                "protein_seq": "MAKLFG...",
                "profile": "Balanced",
                "cai": 0.87,
                "gc": 51.2,
                "run_id": "abc12345",
                "timestamp": "2026-01-22T12:00:00",
                "organism": "Nicotiana benthamiana",
                "gene_name": "GFP",
                "violations_fixed": [...],
                "warnings": [...]
            }
            output_file: Output file path (returns string if None)

        Returns:
            GenBank-formatted string, or a confirmation message when
            ``output_file`` is given.

        Raises:
            ImportError: If Biopython is not installed.

        Examples:
            >>> exporter = SequenceExporter()
            >>> gb = exporter.export_genbank("ATG", {"protein_seq": "M"})
            >>> "LOCUS" in gb
            True
        """
        # Biopython is imported lazily so the FASTA/report exports keep
        # working when it is not installed.
        try:
            from Bio import SeqIO
            from Bio.Seq import Seq
            from Bio.SeqFeature import FeatureLocation, SeqFeature
            from Bio.SeqRecord import SeqRecord
        except ImportError as exc:
            raise ImportError("Biopython is required: pip install biopython") from exc

        # Set defaults
        run_id = metadata.get("run_id", self.generate_run_id(sequence, metadata))
        timestamp = metadata.get("timestamp", datetime.now().strftime("%Y%m%d"))
        gene_name = metadata.get("gene_name", "optimized_gene")
        organism = metadata.get("organism", "Nicotiana benthamiana")

        # Build locus ID
        locus_id = f"PFORM_{run_id}_{timestamp}"

        # Build SeqRecord
        record = SeqRecord(
            Seq(sequence),
            id=locus_id,
            name=gene_name[:16],  # GenBank name is limited to 16 chars
            description=f"Codon-optimized for {organism}",
        )

        # Add annotations
        record.annotations["molecule_type"] = "DNA"
        record.annotations["topology"] = "linear"
        record.annotations["date"] = datetime.now().strftime("%d-%b-%Y").upper()
        record.annotations["organism"] = organism

        # Build COMMENT section
        comment_lines = [
            "FactorForge v2.0 - Plant Codon Optimization Tool",
            f"Run ID: {run_id}",
            f"Timestamp: {metadata.get('timestamp', datetime.now().isoformat())}",
            f"Profile: {metadata.get('profile', 'N/A')}",
            f"CAI: {metadata.get('cai', 0.0):.3f}",
            f"GC%: {metadata.get('gc', 0.0):.1f}",
        ]

        # Assembly standard info
        if metadata.get("assembly_standard"):
            comment_lines.append(f"Assembly Standard: {metadata['assembly_standard']}")

        # Fixed violations
        violations_fixed = metadata.get("violations_fixed", [])
        if violations_fixed:
            comment_lines.append(f"Violations Fixed: {len(violations_fixed)}")
            comment_lines.extend(
                f" - {v.get('type', 'unknown')} at position {v.get('position', 'N/A')}"
                for v in violations_fixed[:5]  # Show at most 5
            )

        # Warnings
        warnings = metadata.get("warnings", [])
        if warnings:
            comment_lines.append(f"Warnings: {len(warnings)}")
            comment_lines.extend(
                f" - {w.get('message', 'N/A')}" for w in warnings[:3]  # Show at most 3
            )

        record.annotations["comment"] = "\n".join(comment_lines)

        # Add CDS feature
        if metadata.get("protein_seq"):
            cds_feature = SeqFeature(  # type: ignore[no-untyped-call]
                FeatureLocation(0, len(sequence)),  # type: ignore[no-untyped-call]
                type="CDS",
                qualifiers={
                    # Use the same organism as the record description and
                    # annotations instead of a hard-coded species.
                    "codon_opt": [organism],
                    "translation": [metadata["protein_seq"]],
                    "note": [
                        f"CAI={metadata.get('cai', 0.0):.3f}, GC={metadata.get('gc', 0.0):.1f}%"
                    ],
                    "gene": [gene_name],
                },
            )
            record.features.append(cds_feature)

        # Additional feature annotations (promoter, terminator, etc.)
        if metadata.get("features"):
            for feat in metadata["features"]:
                feature = SeqFeature(  # type: ignore[no-untyped-call]
                    FeatureLocation(feat["start"], feat["end"]),  # type: ignore[no-untyped-call]
                    type=feat["type"],
                    qualifiers=feat.get("qualifiers", {}),
                )
                record.features.append(feature)

        # Write file or return string
        if output_file:
            SeqIO.write(record, output_file, "genbank")
            return f"GenBank file written to {output_file}"

        output = StringIO()
        SeqIO.write(record, output, "genbank")
        return output.getvalue()

    def export_fasta(
        self,
        sequence: str,
        metadata: dict[str, Any],
        output_file: str | None = None,
        line_width: int = 80,
    ) -> str:
        """
        Export in FASTA format.

        The header is ``>`` followed by ``|``-separated fields: the
        ``PFORM_<run_id>`` identifier, gene name, CAI, GC, profile and, when
        present, the assembly standard.

        Args:
            sequence: Optimized DNA sequence
            metadata: Metadata (same as export_genbank)
            output_file: Output file path (returns string if None)
            line_width: Line wrap width (0 or negative for no wrapping)

        Returns:
            FASTA-formatted string, or a confirmation message when
            ``output_file`` is given.

        Examples:
            >>> exporter = SequenceExporter()
            >>> fasta = exporter.export_fasta("ATG", {"gene_name": "x"})
            >>> fasta.startswith(">")
            True
        """
        # Set defaults
        run_id = metadata.get("run_id", self.generate_run_id(sequence, metadata))
        gene_name = metadata.get("gene_name", "optimized_gene")

        # Build header
        header_parts = [
            f"PFORM_{run_id}",
            f"gene={gene_name}",
            f"CAI={metadata.get('cai', 0.0):.3f}",
            f"GC={metadata.get('gc', 0.0):.1f}",
            f"profile={metadata.get('profile', 'N/A')}",
        ]
        if metadata.get("assembly_standard"):
            header_parts.append(f"assembly={metadata['assembly_standard']}")
        header = ">{}".format("|".join(header_parts))

        # Wrap sequence
        if line_width > 0:
            seq_formatted = "\n".join(
                sequence[i : i + line_width] for i in range(0, len(sequence), line_width)
            )
        else:
            seq_formatted = sequence

        fasta_content = f"{header}\n{seq_formatted}\n"

        # Write file or return string
        if output_file:
            # Explicit UTF-8 keeps non-ASCII gene names portable and matches
            # the encoding used by export_report.
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(fasta_content)
            return f"FASTA file written to {output_file}"
        return fasta_content

    def export_batch(
        self,
        sequences: list[dict[str, Any]],
        output_format: str = "fasta",
        output_file: str | None = None,
    ) -> str:
        """
        Export batch sequences.

        FASTA batches are combined into one multi-record document. GenBank
        batches are written one file per sequence, named
        ``<base>_<NNN><ext>`` after ``output_file``.

        Args:
            sequences: [{"sequence": "ATG...", "metadata": {...}}, ...]
            output_format: "fasta" or "genbank" (case-insensitive)
            output_file: Output file path

        Returns:
            Combined FASTA text when exporting FASTA without ``output_file``
            (an empty string for an empty batch); otherwise an output message.

        Raises:
            ValueError: Unsupported format or missing output_file for GenBank batch.

        Examples:
            >>> exporter = SequenceExporter()
            >>> msg = exporter.export_batch([{"sequence": "ATG", "metadata": {}}])
            >>> msg.startswith(">PFORM_")
            True
        """
        fmt = output_format.lower()

        if fmt == "fasta":
            # FASTA allows multiple sequences in one file
            records = [
                self.export_fasta(seq_data["sequence"], seq_data.get("metadata", {})).strip()
                for seq_data in sequences
            ]
            # One trailing newline per record; an empty batch yields "" rather
            # than a stray blank line.
            combined = "".join(f"{entry}\n" for entry in records)

            if output_file:
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(combined)
                return f"Batch FASTA written to {output_file} ({len(sequences)} sequences)"
            return combined

        if fmt == "genbank":
            # GenBank stores each sequence in a separate file
            if not output_file:
                raise ValueError("GenBank batch export requires output_file")

            import os

            base_name, ext = os.path.splitext(output_file)
            for i, seq_data in enumerate(sequences, start=1):
                self.export_genbank(
                    seq_data["sequence"],
                    seq_data.get("metadata", {}),
                    output_file=f"{base_name}_{i:03d}{ext}",
                )
            return f"Batch GenBank written ({len(sequences)} files)"

        raise ValueError(f"Unsupported format: {output_format}")

    def export_report(
        self,
        sequence: str,
        metadata: dict[str, Any],
        output_file: str | None = None,
    ) -> str:
        """
        Create a human-readable report.

        Args:
            sequence: Optimized DNA sequence
            metadata: Metadata
            output_file: Output file path

        Returns:
            Report string, or a confirmation message when ``output_file`` is
            given.

        Examples:
            >>> exporter = SequenceExporter()
            >>> report = exporter.export_report("ATG", {"gene_name": "x"})
            >>> "Optimization Report" in report
            True
        """
        run_id = metadata.get("run_id", self.generate_run_id(sequence, metadata))

        report_lines = [
            "=" * 70,
            "FactorForge v2.0 - Optimization Report",
            "=" * 70,
            "",
            f"Run ID: {run_id}",
            f"Timestamp: {metadata.get('timestamp', datetime.now().isoformat())}",
            f"Gene: {metadata.get('gene_name', 'N/A')}",
            "",
            "--- Sequence Information ---",
            f"Length: {len(sequence)} bp",
            f"GC Content: {metadata.get('gc', 0.0):.1f}%",
            f"CAI: {metadata.get('cai', 0.0):.3f}",
            "",
            "--- Optimization Settings ---",
            f"Profile: {metadata.get('profile', 'N/A')}",
            f"Assembly Standard: {metadata.get('assembly_standard', 'None')}",
            f"Organism: {metadata.get('organism', 'Nicotiana benthamiana')}",
            "",
        ]

        # Violations fixed
        violations_fixed = metadata.get("violations_fixed", [])
        if violations_fixed:
            report_lines.append("--- Violations Fixed ---")
            for v in violations_fixed:
                report_lines.append(
                    f" • {v.get('type', 'Unknown')} at position {v.get('position', 'N/A')}"
                )
                if v.get("fix_description"):
                    report_lines.append(f" → {v['fix_description']}")
            report_lines.append("")

        # Warnings
        warnings = metadata.get("warnings", [])
        if warnings:
            report_lines.append("--- Warnings ---")
            for w in warnings:
                report_lines.append(f" ⚠ {w.get('message', 'N/A')}")
                if w.get("suggestion"):
                    report_lines.append(f" → {w['suggestion']}")
            report_lines.append("")

        # Quality score: compare against None so a legitimate score of 0 is
        # still reported instead of being dropped by a truthiness check.
        score = metadata.get("quality_score")
        if score is not None:
            report_lines.append("--- Quality Assessment ---")
            stars = "⭐" * int(score)
            report_lines.append(f"Overall Quality: {stars} ({score}/5)")
            report_lines.append("")

        # Sequence preview (first 120 bp at most)
        report_lines.append("--- Sequence Preview ---")
        preview_len = min(120, len(sequence))
        report_lines.append(sequence[:preview_len])
        if len(sequence) > preview_len:
            report_lines.append(f"... ({len(sequence) - preview_len} more bp)")
        report_lines.append("")

        report_lines.append("=" * 70)

        report_content = "\n".join(report_lines)

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(report_content)
            return f"Report written to {output_file}"
        return report_content
410
+
411
+
412
+ # --- Usage example ---
413
if __name__ == "__main__":
    # Demo input: a GFP coding sequence and the metadata an optimization run
    # would normally attach to it.
    demo_sequence = "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA"

    fixed_violation = {
        "type": "BsaI site",
        "position": 147,
        "fix_description": "Synonymous substitution R→R (CGT→AGA)",
    }
    gc_warning = {
        "message": "High local GC content at position 450-500",
        "suggestion": "Consider manual review",
    }
    demo_metadata = {
        "gene_name": "GFP",
        "protein_seq": "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHKVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK*",
        "profile": "Balanced",
        "cai": 0.87,
        "gc": 51.2,
        "assembly_standard": "Golden Gate (BsaI)",
        "violations_fixed": [fixed_violation],
        "warnings": [gc_warning],
        "quality_score": 5,
    }

    demo_exporter = SequenceExporter()

    # FASTA: only the first 200 characters are shown.
    print("=== FASTA Export ===")
    print(demo_exporter.export_fasta(demo_sequence, demo_metadata)[:200])

    # Report: printed in full.
    print("\n=== Report Export ===")
    print(demo_exporter.export_report(demo_sequence, demo_metadata))

    # GenBank needs Biopython; report its absence instead of crashing.
    print("\n=== GenBank Export ===")
    try:
        genbank_text = demo_exporter.export_genbank(demo_sequence, demo_metadata)
        print(genbank_text[:500])
    except ImportError as e:
        print(f"Biopython not installed: {e}")
@@ -0,0 +1,190 @@
1
+ """v2 Rule-based Optimizer Implementation"""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from factorforge.core.interfaces import OptimizationResult, OptimizerEngine
8
+
9
+ from .exporter import SequenceExporter
10
+ from .rules.reverse_translator import OptimizationProfile, ReverseTranslator
11
+ from .rules.rule_engine import RuleEngine
12
+ from .scoring import calculate_composite_score
13
+ from .validator import InputValidator
14
+
15
+
16
class RuleBasedOptimizer(OptimizerEngine):
    """Rule-based optimization engine (v2)."""

    name = "Rule-based"
    version = "3.0.0"

    def __init__(self) -> None:
        """Instantiate the validator, translator, rule engine and exporter."""
        self.validator = InputValidator()
        self.translator = ReverseTranslator()  # Data files use default path
        self.rule_engine = RuleEngine()
        self.exporter = SequenceExporter()

    def optimize(
        self,
        sequence: str,
        profile: str | None = "balanced",
        **kwargs: Any,
    ) -> OptimizationResult:
        """
        Run the rule-based optimization for one sequence.

        The input is validated first. A DNA input is scored as-is; any other
        input is reverse-translated and the first generated candidate is
        used. The resulting DNA is then scanned by the rule engine.

        Args:
            sequence: Protein sequence or DNA sequence
            profile: Optimization profile (None or empty means "balanced")
            **kwargs: Additional settings; ``scan_mode`` (default "full"),
                ``scan_include`` and ``scan_exclude`` are forwarded to the
                rule scan.

        Returns:
            OptimizationResult object

        Raises:
            ValueError: If the input sequence is invalid, the profile is
                unknown, or no candidate could be generated.

        Examples:
            >>> optimizer = RuleBasedOptimizer()
            >>> result = optimizer.optimize("MA", profile="balanced")
            >>> len(result.sequence) == 6
            True
        """
        # Step 1: input validation
        checked = self.validator.validate(sequence)
        if not checked["valid"]:
            raise ValueError(f"Invalid input sequence: {checked['errors']}")

        cleaned = checked["processed_sequence"]
        kind = checked["type"]
        if kind == "fasta":
            # FASTA is only a container; classify the payload it carried.
            kind = self.validator.detect_sequence_type(cleaned).value

        # Step 2: profile normalization
        profile_value = profile.lower() if profile else "balanced"
        try:
            opt_profile = OptimizationProfile(profile_value)
        except ValueError as exc:
            supported = ", ".join(member.value for member in OptimizationProfile)
            message = f"Unknown profile: {profile_value}. Supported profiles: {supported}"
            raise ValueError(message) from exc

        # Step 3: pick the candidate (DNA passes through, protein is translated)
        if kind == "dna":
            cai = self.translator.calculate_cai(cleaned)
            gc = self.translator.calculate_gc_content(cleaned)
            score = calculate_composite_score(
                cai=cai, gc=gc, sequence=cleaned, profile=profile_value
            )
            best = {"sequence": cleaned, "cai": cai, "gc": gc, "score": score}
        else:
            generated = self.translator.generate_candidates(
                cleaned, profile=opt_profile, n=1
            )
            if not generated:
                raise ValueError("No candidates generated for input sequence.")
            best = generated[0]
        optimized_dna = best["sequence"]

        # Step 4: rule checks (PolyA, etc.)
        scan_mode = str(kwargs.get("scan_mode", "full"))
        scan_results = self.rule_engine.scan_all(
            optimized_dna,
            mode=scan_mode,
            include=kwargs.get("scan_include"),
            exclude=kwargs.get("scan_exclude"),
        )

        # Step 5: assemble the result
        metrics = {
            "cai": best["cai"],
            # Keep both names for compatibility across v2 tests/callers.
            "gc_content": best["gc"],
            "gc_percent": best["gc"],
            "score": best["score"],
            "violations": sum(len(hits) for hits in scan_results.values()),
        }
        result_metadata = {
            "engine": "v2",
            "profile": profile_value,
            "scan_mode": scan_mode,
            "scan_results": scan_results,
        }
        return OptimizationResult(
            sequence=optimized_dna,
            metrics=metrics,
            metadata=result_metadata,
        )

    def optimize_batch(
        self,
        sequences: list[dict[str, str]] | list[str],
        profile: str | None = "balanced",
        **kwargs: Any,
    ) -> list[OptimizationResult]:
        """Optimize a batch of sequences.

        Each result's metadata gets an ``input_id``: the entry's own ``id``
        when given, otherwise ``seq<N>`` with N counted from 1.

        Args:
            sequences: Either list[str] or list[{"id": str, "sequence": str}].
            profile: Optimization profile.
            **kwargs: Additional optimize options.

        Returns:
            List of OptimizationResult entries in input order.
        """
        results: list[OptimizationResult] = []

        for position, item in enumerate(sequences, start=1):
            fallback_id = f"seq{position}"
            if isinstance(item, dict):
                raw_sequence = item.get("sequence", "")
                label = item.get("id", fallback_id)
            else:
                raw_sequence = item
                label = fallback_id

            outcome = self.optimize(raw_sequence, profile=profile, **kwargs)
            outcome.metadata["input_id"] = label
            results.append(outcome)

        return results

    def validate(self, sequence: str) -> bool:
        """
        Report whether the validator accepts the input.

        Args:
            sequence: Input sequence

        Returns:
            Validity flag

        Examples:
            >>> optimizer = RuleBasedOptimizer()
            >>> optimizer.validate("MA")
            True
        """
        verdict = self.validator.validate(sequence)
        return bool(verdict["valid"])

    def get_supported_profiles(self) -> list[str]:
        """
        Return the values of every OptimizationProfile member.

        Returns:
            List of profile strings

        Examples:
            >>> optimizer = RuleBasedOptimizer()
            >>> "balanced" in optimizer.get_supported_profiles()
            True
        """
        return [member.value for member in OptimizationProfile]