factorforge-cds 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. factorforge/__init__.py +19 -0
  2. factorforge/__main__.py +8 -0
  3. factorforge/cli/__init__.py +5 -0
  4. factorforge/cli/legacy_cli.py +157 -0
  5. factorforge/cli/main.py +305 -0
  6. factorforge/core/interfaces/__init__.py +7 -0
  7. factorforge/core/interfaces/exporter.py +13 -0
  8. factorforge/core/interfaces/optimizer.py +85 -0
  9. factorforge/core/interfaces/validator.py +9 -0
  10. factorforge/database.py +150 -0
  11. factorforge/engines/__init__.py +60 -0
  12. factorforge/engines/ml/__init__.py +0 -0
  13. factorforge/engines/ml/plant_optimizer.py +325 -0
  14. factorforge/engines/registry.py +141 -0
  15. factorforge/engines/v1_archived/__init__.py +15 -0
  16. factorforge/engines/v2/__init__.py +13 -0
  17. factorforge/engines/v2/codon_table_builder.py +107 -0
  18. factorforge/engines/v2/construct_builder.py +403 -0
  19. factorforge/engines/v2/exporter.py +455 -0
  20. factorforge/engines/v2/optimizer.py +190 -0
  21. factorforge/engines/v2/pipeline.py +275 -0
  22. factorforge/engines/v2/rules/__init__.py +3 -0
  23. factorforge/engines/v2/rules/domesticator.py +403 -0
  24. factorforge/engines/v2/rules/reverse_translator.py +765 -0
  25. factorforge/engines/v2/rules/rule_engine.py +867 -0
  26. factorforge/engines/v2/scoring.py +232 -0
  27. factorforge/engines/v2/utils.py +231 -0
  28. factorforge/engines/v2/validator.py +383 -0
  29. factorforge/engines/v3/__init__.py +12 -0
  30. factorforge/engines/v3/explain.py +119 -0
  31. factorforge/engines/v3/inference/__init__.py +6 -0
  32. factorforge/engines/v3/inference/constrained_decoder.py +80 -0
  33. factorforge/engines/v3/inference/v2_adapter.py +72 -0
  34. factorforge/engines/v3/metrics.py +145 -0
  35. factorforge/engines/v3/modeling_bart_decoder.py +127 -0
  36. factorforge/engines/v3/pipeline.py +192 -0
  37. factorforge/engines/v3/synonym_mask.py +61 -0
  38. factorforge/engines/v3/tokenizer.py +192 -0
  39. factorforge/ml/__init__.py +33 -0
  40. factorforge/ml/feasibility.py +199 -0
  41. factorforge/ml/metrics.py +295 -0
  42. factorforge/utils/__init__.py +31 -0
  43. factorforge/utils/construct_id.py +8 -0
  44. factorforge/utils/exceptions.py +32 -0
  45. factorforge/utils/sequence_validator.py +189 -0
  46. factorforge/utils/validation.py +104 -0
  47. factorforge_cds-3.0.0.dist-info/METADATA +475 -0
  48. factorforge_cds-3.0.0.dist-info/RECORD +52 -0
  49. factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
  50. factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
  51. factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
  52. factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,232 @@
1
+ """
2
+ Multidimensional Scoring for FactorForge v2.
3
+ Composite scoring function with optional ViennaRNA MFE integration.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from typing import Any
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
# GC-content window (percent) treated as optimal for N. benthamiana.
GC_OPT_MIN = 41.0
GC_OPT_MAX = 44.0
# Midpoint of the window above; evaluates to exactly 42.5.
GC_OPT_MID = (GC_OPT_MIN + GC_OPT_MAX) / 2

# Cached outcome of the ViennaRNA import probe; None until the first check.
_vienna_available: bool | None = None
21
+
22
+
23
@dataclass
class ScoringConfig:
    """Scoring weight configuration for composite optimization score.

    The weights are rescaled on construction so that the active ones sum to
    1.0. The MFE weight only counts as active when ``use_mfe`` is true;
    otherwise it is forced to 0.0.

    Attributes:
        w_cai: Weight of the CAI component.
        w_gc: Weight of the GC-proximity component.
        w_mfe: Weight of the MFE component (ignored when ``use_mfe`` is false).
        w_dinuc: Weight of the CpG/TpA dinucleotide component.
        gc_opt: Target GC content in percent.
        use_mfe: Whether the MFE component takes part in scoring.
    """

    w_cai: float = 0.5
    w_gc: float = 0.3
    w_mfe: float = 0.2
    w_dinuc: float = 0.0  # CpG/TpA dinucleotide penalty (opt-in, default off)
    gc_opt: float = GC_OPT_MID
    use_mfe: bool = True

    def __post_init__(self) -> None:
        """Normalize weights to sum to 1.0."""
        self._normalize()

    def _normalize(self) -> None:
        """Rescale the active weights in place so they sum to 1.0.

        When the active weights do not add up to a positive number the
        remaining weights are left untouched, since there is nothing
        meaningful to scale by.
        """
        if not self.use_mfe:
            # Zero the inactive MFE weight unconditionally so the config never
            # carries a stale non-zero value, even when the other weights sum
            # to zero and the rescaling below is skipped.
            self.w_mfe = 0.0

        total = self.w_cai + self.w_gc + self.w_mfe + self.w_dinuc
        if total > 0:
            self.w_cai /= total
            self.w_gc /= total
            self.w_mfe /= total
            self.w_dinuc /= total
54
+
55
+
56
# Preset scoring configurations, keyed by optimization profile name.
PROFILE_SCORING_CONFIGS: dict[str, ScoringConfig] = {
    "balanced": ScoringConfig(
        w_cai=0.5,
        w_gc=0.3,
        w_mfe=0.2,
        gc_opt=GC_OPT_MID,
    ),
    "high_cai": ScoringConfig(
        w_cai=0.8,
        w_gc=0.1,
        w_mfe=0.1,
        gc_opt=GC_OPT_MID,
    ),
    "gc_target": ScoringConfig(
        w_cai=0.1,
        w_gc=0.7,
        w_mfe=0.2,
        gc_opt=GC_OPT_MID,
    ),
    "assembly_friendly": ScoringConfig(
        w_cai=0.5,
        w_gc=0.3,
        w_mfe=0.2,
        gc_opt=GC_OPT_MID,
    ),
    "ramp": ScoringConfig(
        w_cai=0.4,
        w_gc=0.3,
        w_mfe=0.3,
        gc_opt=GC_OPT_MID,
    ),
    # TRV viral-delivery profile — Li et al. (2026): prioritize MFE and
    # viral-context GC target.
    "viral_delivery": ScoringConfig(
        w_cai=0.35,
        w_gc=0.25,
        w_mfe=0.40,
        gc_opt=47.5,
        use_mfe=True,
    ),
}
66
+
67
+
68
def _check_vienna_available() -> bool:
    """Report whether the ViennaRNA ``RNA`` module can be imported.

    The import is attempted only once; the outcome is stored in the
    module-level ``_vienna_available`` flag and reused on later calls.
    """
    global _vienna_available
    if _vienna_available is not None:
        return _vienna_available

    try:
        import RNA  # noqa: F401
    except ImportError:
        _vienna_available = False
        logger.debug("ViennaRNA not available; MFE scoring disabled")
    else:
        _vienna_available = True
        logger.debug("ViennaRNA Python bindings available")
    return _vienna_available
81
+
82
+
83
def calculate_mfe(sequence: str) -> float | None:
    """
    Compute the minimum free energy (MFE) of a sequence with ViennaRNA.

    The input is upper-cased and every ``T`` is replaced by ``U`` before it
    is handed to ``RNA.fold``.

    Args:
        sequence: DNA or RNA sequence.

    Returns:
        The MFE reported by ``RNA.fold`` as a float, or None when ViennaRNA
        is not available or the fold call fails.
    """
    if _check_vienna_available():
        try:
            import RNA

            transcript = sequence.upper().replace("T", "U")
            _structure, energy = RNA.fold(transcript)
            return float(energy)
        except Exception as exc:
            # Best effort: a failed fold only disables the MFE component.
            logger.debug(f"MFE calculation failed: {exc}")
    return None
106
+
107
+
108
def normalize_mfe(mfe: float, seq_length: int) -> float:
    """
    Map an MFE value onto a 0-1 score where 1 means no structure.

    The energy is divided by the sequence length, limited to the interval
    [-0.5, 0.0] kcal/mol/nt, and then rescaled linearly so that 0.0 maps to
    1.0 and -0.5 maps to 0.0.

    Args:
        mfe: Minimum free energy in kcal/mol.
        seq_length: Sequence length in nucleotides.

    Returns:
        Normalized MFE score (0-1, higher = less structured). A zero-length
        sequence yields the neutral value 0.5.
    """
    if seq_length == 0:
        return 0.5

    lower_bound = -0.5
    per_nucleotide = mfe / seq_length
    # Cap at 0.0 first, then raise anything below the lower bound.
    bounded = min(0.0, per_nucleotide)
    bounded = max(lower_bound, bounded)
    return 1.0 + (bounded / 0.5)
130
+
131
+
132
def calculate_dinucleotide_score(sequence: str) -> float:
    """Score CpG/TpA avoidance on a 0-1 scale (higher = fewer CpG/TpA).

    For each of the dinucleotides ``CG`` and ``TA`` the observed/expected
    ratio is turned into a score that is 1.0 at ratio 0 and 0.0 at ratio 2.0
    or above; the result is the mean of the two scores.

    Args:
        sequence: DNA sequence.

    Returns:
        Dinucleotide avoidance score (0-1). Sequences shorter than six
        characters score 1.0.
    """
    from factorforge.engines.v2.utils import calculate_dinucleotide_ratio

    if len(sequence) < 6:
        return 1.0

    component_scores = [
        max(0.0, 1.0 - calculate_dinucleotide_ratio(sequence, pair) / 2.0)
        for pair in ("CG", "TA")
    ]
    return (component_scores[0] + component_scores[1]) / 2.0
157
+
158
+
159
def calculate_composite_score(
    cai: float,
    gc: float,
    sequence: str | None = None,
    config: ScoringConfig | None = None,
    profile: str | None = None,
    **kwargs: Any,
) -> float:
    """
    Calculate multidimensional composite score.

    S = w1*CAI + w2*(1 - |GC - GC_opt|/50) + w3*MFE_norm + w4*dinuc_score

    Components that cannot be evaluated (MFE disabled or not computable,
    dinucleotide weight set but no sequence given) are dropped and the
    remaining weights are re-normalized before the sum is taken.

    Args:
        cai: Codon Adaptation Index (0-1); values outside are clamped.
        gc: GC content percentage (0-100).
        sequence: DNA sequence for the optional MFE and dinucleotide terms.
        config: Explicit ScoringConfig. Overrides profile.
        profile: Profile name for preset config lookup (case-insensitive);
            unknown or missing names fall back to "balanced".
        **kwargs: Additional parameters. ``target_gc`` overrides the
            config's GC optimum; a missing or None value keeps the config's.

    Returns:
        Composite score rounded to three decimals, or 0.0 when every
        effective weight is zero.
    """
    # Resolve config
    if config is None:
        profile_name = (profile or "balanced").lower()
        config = PROFILE_SCORING_CONFIGS.get(profile_name)
        if config is None:
            config = PROFILE_SCORING_CONFIGS["balanced"]

    # Allow target_gc override. Callers that forward an optional value may
    # pass target_gc=None explicitly; treat that the same as "not given"
    # instead of failing inside float(None).
    target_gc = kwargs.get("target_gc")
    gc_opt = float(config.gc_opt if target_gc is None else target_gc)

    # Component 1: CAI (already 0-1)
    cai_score = max(0.0, min(1.0, cai))

    # Component 2: GC proximity to optimum
    gc_score = max(0.0, 1.0 - abs(gc - gc_opt) / 50.0)

    # Component 3: MFE (optional). calculate_mfe returns None when the
    # ViennaRNA bindings are unavailable or folding fails, in which case the
    # MFE weight is dropped.
    mfe_score = 0.5  # neutral default
    actual_w_mfe = 0.0
    if config.use_mfe and sequence is not None:
        mfe = calculate_mfe(sequence)
        if mfe is not None:
            mfe_score = normalize_mfe(mfe, len(sequence))
            actual_w_mfe = config.w_mfe

    # Component 4: Dinucleotide avoidance (opt-in, default weight 0.0)
    dinuc_score = 0.5  # neutral default
    actual_w_dinuc = config.w_dinuc
    if actual_w_dinuc > 0:
        if sequence is not None:
            dinuc_score = calculate_dinucleotide_score(sequence)
        else:
            actual_w_dinuc = 0.0  # Cannot compute without sequence

    # Compute weighted score (re-normalize if MFE/dinuc disabled)
    w_total = config.w_cai + config.w_gc + actual_w_mfe + actual_w_dinuc
    if w_total == 0:
        return 0.0

    score = (
        (config.w_cai / w_total) * cai_score
        + (config.w_gc / w_total) * gc_score
        + (actual_w_mfe / w_total) * mfe_score
        + (actual_w_dinuc / w_total) * dinuc_score
    )

    return round(score, 3)
@@ -0,0 +1,231 @@
1
+ """
2
+ Utility helpers for FactorForge v2 engines.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import importlib.resources
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Any, cast
12
+
13
+
14
def get_data_path() -> Path:
    """Resolve the directory that holds FactorForge data files.

    A non-empty FACTORFORGE_DATA_DIR environment variable wins. Otherwise the
    ``data`` entry of the installed ``factorforge`` package is located through
    ``importlib.resources``; if that lookup raises TypeError or
    ModuleNotFoundError, the path is derived from this file's location.

    Returns:
        Path to data directory.

    Examples:
        >>> data_dir = get_data_path()
        >>> isinstance(data_dir, Path)
        True
    """
    override = os.getenv("FACTORFORGE_DATA_DIR")
    if not override:
        try:
            bundled = importlib.resources.files("factorforge") / "data"
            return Path(str(bundled))
        except (TypeError, ModuleNotFoundError):
            # Development fallback: two directories above this file's folder.
            return Path(__file__).resolve().parents[2] / "data"
    return Path(override)
40
+
41
+
42
def calculate_gc(sequence: str) -> float:
    """Return the share of G and C characters as a percentage.

    Counting is case-insensitive.

    Args:
        sequence: DNA sequence string.

    Returns:
        GC content as percentage (0-100); 0.0 for an empty sequence.

    Examples:
        >>> calculate_gc("ATGC")
        50.0
    """
    if not sequence:
        return 0.0

    upper = sequence.upper()
    gc_total = sum(upper.count(base) for base in "GC")
    return (gc_total / len(sequence)) * 100
61
+
62
+
63
def count_dinucleotides(sequence: str, dinucleotide: str = "CG") -> int:
    """Count how often a dinucleotide occurs in a DNA sequence.

    Every pair of adjacent positions is inspected, so overlapping
    occurrences are all counted. Matching is case-insensitive.

    Args:
        sequence: DNA sequence string (case-insensitive).
        dinucleotide: Two-character dinucleotide to count (e.g., "CG", "TA").

    Returns:
        Count of dinucleotide occurrences.

    Examples:
        >>> count_dinucleotides("ACGACG", "CG")
        2
    """
    target = dinucleotide.upper()
    bases = sequence.upper()
    hits = 0
    # Walk adjacent character pairs: (b0, b1), (b1, b2), ...
    for first, second in zip(bases, bases[1:]):
        if first + second == target:
            hits += 1
    return hits
80
+
81
+
82
def calculate_dinucleotide_ratio(sequence: str, dinucleotide: str = "CG") -> float:
    """Calculate observed/expected ratio of a dinucleotide.

    Compares actual dinucleotide frequency to what would be expected
    from mononucleotide composition. Ratio < 1.0 means suppressed,
    > 1.0 means enriched.

    The expected count is ``count(first) * count(second) / len(sequence)``
    and the observed count includes overlapping occurrences.

    Args:
        sequence: DNA sequence string (case-insensitive).
        dinucleotide: Two-character dinucleotide (e.g., "CG", "TA").

    Returns:
        Observed/expected ratio (0.0 if sequence too short or denominator zero).

    Examples:
        >>> calculate_dinucleotide_ratio("ACGTACGT", "CG")
        4.0
    """
    seq = sequence.upper()
    n = len(seq)
    if n < 2:
        return 0.0

    dn = dinucleotide.upper()
    observed = sum(1 for i in range(n - 1) if seq[i : i + 2] == dn)

    # n >= 2 here, so the division is always defined.
    expected = (seq.count(dn[0]) * seq.count(dn[1])) / n
    if expected == 0.0:
        return 0.0

    return observed / expected
116
+
117
+
118
def load_codon_table(organism: str, codon_tables_dir: Path) -> dict[str, Any]:
    """Read the codon usage table for an organism from a JSON file.

    When ``organism`` already ends in ``.json`` it is used as the file name
    as-is; otherwise ``<organism>_codons.json`` is looked up.

    Args:
        organism: Organism name (e.g., "human", "ecoli") or a JSON file name.
        codon_tables_dir: Directory containing codon table files.

    Returns:
        Codon table payload parsed from JSON.

    Raises:
        FileNotFoundError: If codon table file not found.
    """
    if organism.endswith(".json"):
        table_file = organism
    else:
        table_file = f"{organism}_codons.json"

    with open(codon_tables_dir / table_file, "r", encoding="utf-8") as stream:
        payload = json.load(stream)
    return cast(dict[str, Any], payload)
136
+
137
+
138
def build_aa_to_codons_map(codon_table: dict[str, Any]) -> dict[str, list[str]]:
    """Build amino-acid-to-codons map from a codon table payload.

    Reads the ``amino_acids`` entry of the payload and, for each amino acid,
    copies its ``codons`` list (empty when absent). A payload whose
    ``amino_acids`` entry is not a dict yields an empty map.
    """
    entries = codon_table.get("amino_acids", {})
    mapping: dict[str, list[str]] = {}
    if isinstance(entries, dict):
        for residue, details in entries.items():
            mapping[residue] = list(details.get("codons", []))
    return mapping
144
+
145
+
146
def load_golden_set(data_dir: Path | None = None) -> dict[str, Any]:
    """Load the golden set codon table used for CAI reference weights.

    ``nbenthamiana_golden_set.json`` is preferred; when that file does not
    exist in the data directory, ``nbenthamiana_codons.json`` is read instead.

    Args:
        data_dir: Data directory path. Defaults to get_data_path().

    Returns:
        Golden set codon table dict.
    """
    base_dir = get_data_path() if data_dir is None else data_dir

    chosen = base_dir / "nbenthamiana_golden_set.json"
    if not chosen.exists():
        # Fallback to standard table
        chosen = base_dir / "nbenthamiana_codons.json"

    with open(chosen, "r", encoding="utf-8") as handle:
        return cast(dict[str, Any], json.load(handle))
167
+
168
+
169
def translate_codon(codon: str, codon_table: dict[str, str]) -> str:
    """Look up the amino acid for a DNA codon.

    The codon is upper-cased before the lookup, so the table is expected to
    use upper-case keys.

    Args:
        codon: 3-letter DNA codon.
        codon_table: Codon to amino acid mapping.

    Returns:
        Single letter amino acid code.

    Raises:
        KeyError: If codon not in table.
    """
    normalized = codon.upper()
    return codon_table[normalized]
183
+
184
+
185
def parse_fasta_records(content: str) -> list[tuple[str, str]]:
    """Split FASTA text into (record_id, sequence) tuples.

    Blank lines are skipped and surrounding whitespace is stripped from every
    line. The record id is the first whitespace-separated token of the header;
    a header with no text gets the id ``seq<N>``, where N is one more than the
    number of records completed so far. Sequences are upper-cased.

    Args:
        content: FASTA text content.

    Returns:
        List of (record_id, sequence) tuples.

    Raises:
        ValueError: If sequence data precedes the first header, a record has
            no sequence, or no records are present.
    """
    parsed: list[tuple[str, str]] = []
    current_id: str | None = None
    chunks: list[str] = []

    def _close_record() -> None:
        # Finalize the record being accumulated; an empty body is an error.
        residues = "".join(chunks).upper()
        if not residues:
            raise ValueError(f"Empty FASTA record: {current_id}")
        parsed.append((current_id, residues))

    for stripped in map(str.strip, content.splitlines()):
        if not stripped:
            continue

        if not stripped.startswith(">"):
            if current_id is None:
                raise ValueError("Invalid FASTA: sequence data found before header")
            chunks.append(stripped)
            continue

        # A new header closes the previous record, if any, before starting
        # the next one.
        if current_id is not None:
            _close_record()
        title = stripped[1:].strip()
        current_id = title.split()[0] if title else f"seq{len(parsed) + 1}"
        chunks = []

    if current_id is not None:
        _close_record()

    if not parsed:
        raise ValueError("No FASTA records found")

    return parsed