factorforge-cds 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. factorforge/__init__.py +19 -0
  2. factorforge/__main__.py +8 -0
  3. factorforge/cli/__init__.py +5 -0
  4. factorforge/cli/legacy_cli.py +157 -0
  5. factorforge/cli/main.py +305 -0
  6. factorforge/core/interfaces/__init__.py +7 -0
  7. factorforge/core/interfaces/exporter.py +13 -0
  8. factorforge/core/interfaces/optimizer.py +85 -0
  9. factorforge/core/interfaces/validator.py +9 -0
  10. factorforge/database.py +150 -0
  11. factorforge/engines/__init__.py +60 -0
  12. factorforge/engines/ml/__init__.py +0 -0
  13. factorforge/engines/ml/plant_optimizer.py +325 -0
  14. factorforge/engines/registry.py +141 -0
  15. factorforge/engines/v1_archived/__init__.py +15 -0
  16. factorforge/engines/v2/__init__.py +13 -0
  17. factorforge/engines/v2/codon_table_builder.py +107 -0
  18. factorforge/engines/v2/construct_builder.py +403 -0
  19. factorforge/engines/v2/exporter.py +455 -0
  20. factorforge/engines/v2/optimizer.py +190 -0
  21. factorforge/engines/v2/pipeline.py +275 -0
  22. factorforge/engines/v2/rules/__init__.py +3 -0
  23. factorforge/engines/v2/rules/domesticator.py +403 -0
  24. factorforge/engines/v2/rules/reverse_translator.py +765 -0
  25. factorforge/engines/v2/rules/rule_engine.py +867 -0
  26. factorforge/engines/v2/scoring.py +232 -0
  27. factorforge/engines/v2/utils.py +231 -0
  28. factorforge/engines/v2/validator.py +383 -0
  29. factorforge/engines/v3/__init__.py +12 -0
  30. factorforge/engines/v3/explain.py +119 -0
  31. factorforge/engines/v3/inference/__init__.py +6 -0
  32. factorforge/engines/v3/inference/constrained_decoder.py +80 -0
  33. factorforge/engines/v3/inference/v2_adapter.py +72 -0
  34. factorforge/engines/v3/metrics.py +145 -0
  35. factorforge/engines/v3/modeling_bart_decoder.py +127 -0
  36. factorforge/engines/v3/pipeline.py +192 -0
  37. factorforge/engines/v3/synonym_mask.py +61 -0
  38. factorforge/engines/v3/tokenizer.py +192 -0
  39. factorforge/ml/__init__.py +33 -0
  40. factorforge/ml/feasibility.py +199 -0
  41. factorforge/ml/metrics.py +295 -0
  42. factorforge/utils/__init__.py +31 -0
  43. factorforge/utils/construct_id.py +8 -0
  44. factorforge/utils/exceptions.py +32 -0
  45. factorforge/utils/sequence_validator.py +189 -0
  46. factorforge/utils/validation.py +104 -0
  47. factorforge_cds-3.0.0.dist-info/METADATA +475 -0
  48. factorforge_cds-3.0.0.dist-info/RECORD +52 -0
  49. factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
  50. factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
  51. factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
  52. factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,13 @@
1
+ """
2
+ FactorForge v2 - Rule-based Engine
3
+
4
+ Production system (2026)
5
+ Plant-specific rule-based optimization
6
+ """
7
+
8
+ __version__ = "3.0.0"
9
+
10
+ from .optimizer import RuleBasedOptimizer
11
+ from .pipeline import OptimizationPipeline
12
+
13
+ __all__ = ["OptimizationPipeline", "RuleBasedOptimizer"]
@@ -0,0 +1,107 @@
1
+ """
2
+ Codon Table Builder for FactorForge v2.
3
+ Build blended codon usage tables from multiple data sources for optimized CAI calculation.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def build_golden_set(
    high_expression_path: str | Path,
    empirical_path: str | Path,
    blend_ratio: float = 0.7,
    output_path: str | Path | None = None,
) -> dict[str, Any]:
    """
    Build a 'golden set' codon table blending high-expression and empirical data.

    For every codon listed in the high-expression source, the blended frequency
    is ``blend_ratio * high + (1 - blend_ratio) * empirical``. Blended values
    are then re-normalized per amino acid and rounded to 4 decimals.

    Args:
        high_expression_path: Path to high-expression reference frequencies JSON.
            Read via its "codon_usage" key (amino acid -> {codon: frequency});
            a missing key yields an empty table. Its optional "species" key
            supplies the organism name.
        empirical_path: Path to empirical codon table JSON. Read via its
            "codons" key (codon -> {"frequency": ...}); the optional
            "organism" and "gc_content" keys are copied into the result.
        blend_ratio: Weight for high-expression data (0.0-1.0). Default 0.7.
        output_path: Optional path to write the blended table as JSON.

    Returns:
        Blended codon table dict with "organism", "source", "blend_ratio",
        "codons", "amino_acids" and "gc_content" keys.

    Raises:
        ValueError: If blend_ratio is outside [0.0, 1.0] (NaN included).
    """
    if not 0.0 <= blend_ratio <= 1.0:
        raise ValueError(f"blend_ratio must be between 0.0 and 1.0, got {blend_ratio}")

    with open(high_expression_path, "r", encoding="utf-8") as f:
        high_expr = json.load(f)

    with open(empirical_path, "r", encoding="utf-8") as f:
        empirical = json.load(f)

    high_usage = high_expr.get("codon_usage", {})
    empirical_codons = empirical.get("codons", {})

    blended_codons: dict[str, dict[str, Any]] = {}
    blended_amino_acids: dict[str, dict[str, Any]] = {}

    for aa, high_codons in high_usage.items():
        aa_freqs: dict[str, float] = {}

        for codon, high_freq in high_codons.items():
            # A codon absent from the empirical table (or lacking a
            # "frequency" entry) falls back to the high-expression value,
            # so the blend leaves that codon's frequency unchanged.
            emp_info = empirical_codons.get(codon, {})
            emp_freq = emp_info.get("frequency", high_freq) if emp_info else high_freq
            aa_freqs[codon] = blend_ratio * high_freq + (1 - blend_ratio) * emp_freq

        # Normalize per amino acid so the synonymous codons sum to ~1.0.
        # An all-zero group is left as-is to avoid dividing by zero.
        total = sum(aa_freqs.values())
        if total > 0:
            aa_freqs = {c: round(f / total, 4) for c, f in aa_freqs.items()}

        codon_count = len(aa_freqs)
        for codon, freq in aa_freqs.items():
            blended_codons[codon] = {
                "aa": aa,
                "frequency": freq,
                "per_thousand": round(freq * 1000 / codon_count, 1),
            }

        # Most frequent codon first; the head of the list is the preferred one.
        sorted_codons = sorted(aa_freqs, key=lambda c: aa_freqs[c], reverse=True)
        blended_amino_acids[aa] = {
            "codons": sorted_codons,
            "preferred": sorted_codons[0] if sorted_codons else "",
        }

    # round() rather than int(): int() truncates float noise, e.g.
    # 0.29 * 100 == 28.999999999999996 would be reported as 28%.
    # Deriving the second share from the first keeps the label summing to 100.
    high_pct = round(blend_ratio * 100)
    empirical_pct = 100 - high_pct

    result: dict[str, Any] = {
        "organism": high_expr.get("species", empirical.get("organism", "Unknown")),
        "source": (
            f"Golden Set ({high_pct}% high-expression "
            f"+ {empirical_pct}% empirical)"
        ),
        "blend_ratio": blend_ratio,
        "codons": blended_codons,
        "amino_acids": blended_amino_acids,
        "gc_content": empirical.get("gc_content", {"overall": 0.44}),
    }

    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4, ensure_ascii=False)
        logger.info("Golden set written to %s", output_path)

    return result
@@ -0,0 +1,403 @@
1
+ """
2
+ Construct Builder for FactorForge v2.
3
+ Builds Golden Gate-compatible expression constructs from templates.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, Any, cast
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ if TYPE_CHECKING:
16
+ from Bio.SeqRecord import SeqRecord
17
+
18
+
19
class ConstructBuilder:
    """Assemble constructs from JSON templates.

    A template is a dictionary with a "components" list. Each component may
    carry "type", "name" and "sequence" keys; a component of type "cds" whose
    sequence is the literal "USER_INPUT" is the slot filled with the caller's
    gene sequence.
    """

    # MoClo Level 0 standard overhangs for CDS parts
    MOCLO_LEVEL0_OVERHANGS: dict[str, str] = {
        "cds_5prime": "AATG",
        "cds_3prime": "GCTT",
    }

    def __init__(self, template_dir: str | Path) -> None:
        """
        Args:
            template_dir: Directory containing construct templates. Strings
                are accepted and converted to ``Path``.
        """
        # Coerce so that the "/" joins in load_template work for str input too.
        self.template_dir = Path(template_dir)

    def load_template(self, name: str) -> dict[str, Any]:
        """
        Load a construct template by name.

        Args:
            name: Template name or filename (without extension).

        Returns:
            Template payload as a dictionary.

        Raises:
            FileNotFoundError: If the template file does not exist.
            json.JSONDecodeError: If the template file is invalid JSON.
            ValueError: If the template path is invalid (path traversal attempt).
        """
        filename = name if name.endswith(".json") else f"{name}.json"
        template_path = self.template_dir / filename

        # Security: Prevent path traversal attacks.
        # Only path resolution is guarded by the try; the containment check
        # sits outside it so its descriptive error is not caught and replaced.
        try:
            resolved_path = template_path.resolve()
            resolved_dir = self.template_dir.resolve()
        except (ValueError, OSError) as exc:
            raise ValueError(f"Invalid template path: {name}") from exc

        if not resolved_path.is_relative_to(resolved_dir):
            raise ValueError(
                f"Invalid template path: {name}. "
                "Template must be within the template directory."
            )

        # Open the path that was validated rather than the unresolved one.
        with open(resolved_path, "r", encoding="utf-8") as handle:
            return cast(dict[str, Any], json.load(handle))

    def assemble_parts(self, gene_sequence: str, template: dict[str, Any]) -> str:
        """
        Assemble a construct sequence from template components.

        Args:
            gene_sequence: Optimized CDS sequence.
            template: Template dictionary from load_template().

        Returns:
            Assembled construct DNA sequence (component sequences concatenated
            in template order).

        Raises:
            ValueError: If template components are missing.
        """
        components = template.get("components", [])
        if not components:
            raise ValueError("Template has no components to assemble.")

        parts: list[str] = []
        for component in components:
            sequence = component.get("sequence", "")
            # Only a "cds" component with the USER_INPUT placeholder is replaced.
            if component.get("type") == "cds" and sequence == "USER_INPUT":
                sequence = gene_sequence
            parts.append(sequence)

        return "".join(parts)

    def add_features(self, construct_seq: str, template: dict[str, Any]) -> "SeqRecord":
        """
        Create a SeqRecord with component features.

        Each template component becomes one feature; feature coordinates are
        laid out back-to-back in template order. The length of the single
        USER_INPUT CDS slot (if any) is inferred as the construct length minus
        the summed lengths of all other components.

        Args:
            construct_seq: Assembled construct sequence.
            template: Template dictionary.

        Returns:
            SeqRecord with component features added.

        Raises:
            ImportError: If Biopython is not installed.
            ValueError: If component lengths cannot be resolved.
        """
        try:
            from Bio.Seq import Seq
            from Bio.SeqFeature import FeatureLocation, SeqFeature
            from Bio.SeqRecord import SeqRecord
        except ImportError as exc:
            raise ImportError("Biopython is required: pip install biopython") from exc

        components: list[dict[str, Any]] = template.get("components", [])
        template_name = template.get("name", "Construct")
        template_desc = template.get("description", "")
        record_id = template_name.replace(" ", "_")
        # The record name is the id truncated to 16 characters.
        record_name = record_id[:16]

        record = SeqRecord(
            Seq(construct_seq),
            id=record_id,
            name=record_name,
            description=template_desc,
        )
        record.annotations["molecule_type"] = "DNA"

        # None marks the placeholder slot whose length is not yet known.
        lengths: list[int | None] = []
        unknown_indices: list[int] = []
        for idx, component in enumerate(components):
            sequence = component.get("sequence", "")
            if component.get("type") == "cds" and sequence == "USER_INPUT":
                lengths.append(None)
                unknown_indices.append(idx)
            else:
                lengths.append(len(sequence))

        if len(unknown_indices) > 1:
            raise ValueError("Multiple USER_INPUT components are not supported.")

        if unknown_indices:
            known_total = sum(length for length in lengths if length is not None)
            unknown_length = len(construct_seq) - known_total
            if unknown_length < 0:
                raise ValueError("Construct sequence shorter than template components.")
            lengths[unknown_indices[0]] = unknown_length

        # Template component types -> feature type strings; types not listed
        # here are used verbatim.
        feature_type_map = {
            "promoter": "promoter",
            "5utr": "5'UTR",
            "cds": "CDS",
            "terminator": "terminator",
        }

        cursor = 0
        for component, length in zip(components, lengths):
            if length is None:
                raise ValueError("Component length could not be resolved.")
            start = cursor
            end = cursor + length
            cursor = end

            comp_type = component.get("type", "misc_feature")
            feature_type = feature_type_map.get(comp_type, comp_type)
            label = component.get("name", comp_type)

            feature = SeqFeature(  # type: ignore[no-untyped-call]
                FeatureLocation(start, end),  # type: ignore[no-untyped-call]
                type=feature_type,
                qualifiers={
                    "label": [label],
                    "note": [comp_type],
                },
            )
            record.features.append(feature)

        return record

    def validate_construct(
        self, construct: "SeqRecord", template: dict[str, Any]
    ) -> tuple[bool, list[str]]:
        """
        Validate an assembled construct.

        Only the length check and the feature-count check can mark the
        construct invalid; restriction-site, PolyA and overhang findings are
        reported as warnings without changing the validity flag. Motif
        matching is case-insensitive.

        Args:
            construct: SeqRecord with assembled sequence.
            template: Template dictionary.

        Returns:
            Tuple of (valid, warnings).
        """
        warnings: list[str] = []
        valid = True

        # Upper-case once so lowercase sequences are scanned the same way.
        seq_str = str(construct.seq).upper()
        seq_len = len(seq_str)

        if seq_len < 500 or seq_len > 20000:
            warnings.append(f"Construct length {seq_len} bp is outside expected range (500-20000).")
            valid = False

        expected_features = len(template.get("components", []))
        actual_features = len(construct.features)
        if actual_features != expected_features:
            warnings.append(
                f"Feature count {actual_features} does not match template ({expected_features})."
            )
            valid = False

        # Each enzyme is listed with two motifs (recognition site in either
        # orientation); at most one warning is emitted per enzyme.
        restriction_sites = {
            "BsaI": ["GGTCTC", "GAGACC"],
            "BpiI": ["GAAGAC", "GTCTTC"],
            "BsmBI": ["CGTCTC", "GAGACG"],
        }
        for enzyme, motifs in restriction_sites.items():
            for motif in motifs:
                if motif in seq_str:
                    warnings.append(f"{enzyme} site detected: {motif}")
                    break

        polya_patterns = ["AATAAA", "ATTAAA", "AGTAAA"]
        for feature in construct.features:
            if feature.type != "CDS":
                continue
            if feature.location is None:
                warnings.append("CDS feature has no location defined.")
                continue
            start = int(feature.location.start)
            end = int(feature.location.end)
            cds_seq = seq_str[start:end]
            # Negative check: report the first PolyA pattern found in the CDS.
            for pattern in polya_patterns:
                if pattern in cds_seq:
                    warnings.append(f"PolyA signal {pattern} detected in CDS.")
                    break

            # Check internal overhang collisions within CDS
            collisions = self.check_internal_overhang_collisions(cds_seq)
            for collision in collisions:
                warnings.append(
                    f"MoClo overhang '{collision['overhang']}' found internally in CDS "
                    f"at position {collision['position']} ({collision['strand']})."
                )

        # Positive PolyA check: terminator/3'UTR must contain a PolyA signal
        for feature in construct.features:
            if feature.type != "terminator":
                continue
            if feature.location is None:
                continue
            start = int(feature.location.start)
            end = int(feature.location.end)
            term_seq = seq_str[start:end]
            has_polya = any(pattern in term_seq for pattern in polya_patterns)
            if not has_polya:
                warnings.append(
                    "No PolyA signal found in terminator region. "
                    "This may impair mRNA polyadenylation."
                )

        return valid, warnings

    def generate_construct(self, gene_sequence: str, template_name: str) -> "SeqRecord":
        """
        Generate a construct from a template name and gene sequence.

        Loads the template, assembles and annotates the construct, then runs
        validate_construct() and logs any warnings. The construct is returned
        even when validation marks it invalid.

        Args:
            gene_sequence: Optimized CDS sequence.
            template_name: Template name (e.g., "standard_expression").

        Returns:
            SeqRecord with features.
        """
        template = self.load_template(template_name)
        construct_seq = self.assemble_parts(gene_sequence, template)
        construct = self.add_features(construct_seq, template)
        valid, warnings = self.validate_construct(construct, template)

        if warnings:
            status = "VALID" if valid else "INVALID"
            # Invalid constructs are logged at WARNING level, valid ones at INFO.
            log_func = logger.info if valid else logger.warning
            log_func("Construct %s: %d warning(s)", status, len(warnings))
            for warning in warnings:
                log_func(" - %s", warning)

        return construct

    def validate_overhangs(
        self,
        parts: list[dict[str, Any]],
        standard: str = "moclo_level0",
    ) -> tuple[bool, list[str]]:
        """
        Validate Golden Gate overhang consistency for ordered parts.

        For MoClo Level 0 CDS standard:
        - 5' overhang must be AATG
        - 3' overhang must be GCTT
        - Adjacent parts: 3' overhang of part N must match 5' overhang of part N+1

        Missing or empty overhang values are skipped rather than reported.
        The adjacency check runs for every value of ``standard``; the
        first/last overhang checks run only for "moclo_level0".

        Args:
            parts: Ordered list of part dictionaries with 'overhang_5' and 'overhang_3' keys.
            standard: Assembly standard to validate against.

        Returns:
            Tuple of (valid, warnings).
        """
        warnings: list[str] = []

        if not parts:
            warnings.append("No parts provided for overhang validation.")
            return False, warnings

        if standard == "moclo_level0":
            expected_5 = self.MOCLO_LEVEL0_OVERHANGS["cds_5prime"]
            expected_3 = self.MOCLO_LEVEL0_OVERHANGS["cds_3prime"]

            # Check first part 5' overhang
            first_oh5 = parts[0].get("overhang_5", "")
            if first_oh5 and first_oh5 != expected_5:
                warnings.append(
                    f"First part 5' overhang '{first_oh5}' does not match "
                    f"MoClo Level 0 expected '{expected_5}'."
                )

            # Check last part 3' overhang
            last_oh3 = parts[-1].get("overhang_3", "")
            if last_oh3 and last_oh3 != expected_3:
                warnings.append(
                    f"Last part 3' overhang '{last_oh3}' does not match "
                    f"MoClo Level 0 expected '{expected_3}'."
                )

        # Check chain consistency: part N 3' overhang == part N+1 5' overhang
        for i, (current, following) in enumerate(zip(parts, parts[1:])):
            oh3 = current.get("overhang_3", "")
            oh5_next = following.get("overhang_5", "")
            if oh3 and oh5_next and oh3 != oh5_next:
                warnings.append(
                    f"Overhang mismatch between part {i} (3'={oh3}) "
                    f"and part {i + 1} (5'={oh5_next})."
                )

        return not warnings, warnings

    def check_internal_overhang_collisions(
        self,
        cds_seq: str,
        overhangs: list[str] | None = None,
    ) -> list[dict[str, Any]]:
        """
        Check for internal occurrences of MoClo overhang sequences within CDS.

        Scans for both forward and reverse complement of each overhang to prevent
        assembly artifacts during Golden Gate cloning. Comparison is
        case-insensitive; empty overhang strings are ignored.

        Args:
            cds_seq: Coding DNA sequence to scan.
            overhangs: List of overhang sequences to check.
                Defaults to MoClo Level 0 CDS overhangs [AATG, GCTT].

        Returns:
            List of collision dicts with 'overhang', 'position', 'strand' keys.
            'position' is the 0-based offset in cds_seq; 'overhang' is the
            value as supplied by the caller.
        """
        if overhangs is None:
            overhangs = list(self.MOCLO_LEVEL0_OVERHANGS.values())

        # Build reverse complement lookup
        complement = str.maketrans("ATGC", "TACG")
        sequence = cds_seq.upper()
        collisions: list[dict[str, Any]] = []

        for overhang in overhangs:
            if not overhang:
                # An empty pattern would "match" at every position.
                continue
            query = overhang.upper()
            rc = query.translate(complement)[::-1]
            width = len(query)

            for i in range(len(sequence) - width + 1):
                fragment = sequence[i : i + width]
                if fragment == query:
                    collisions.append(
                        {"overhang": overhang, "position": i, "strand": "forward"}
                    )
                elif fragment == rc:
                    collisions.append(
                        {"overhang": overhang, "position": i, "strand": "reverse_complement"}
                    )

        return collisions

    def assemble_construct(self, gene: str, template: dict[str, Any]) -> "SeqRecord":
        """
        Assemble a construct from a gene sequence and template.

        Unlike generate_construct(), this takes an already-loaded template and
        does not run validation.

        Args:
            gene: Optimized CDS sequence to insert.
            template: Loaded template dictionary.

        Returns:
            SeqRecord for the assembled construct.
        """
        construct_seq = self.assemble_parts(gene, template)
        return self.add_features(construct_seq, template)