factorforge-cds 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. factorforge/__init__.py +19 -0
  2. factorforge/__main__.py +8 -0
  3. factorforge/cli/__init__.py +5 -0
  4. factorforge/cli/legacy_cli.py +157 -0
  5. factorforge/cli/main.py +305 -0
  6. factorforge/core/interfaces/__init__.py +7 -0
  7. factorforge/core/interfaces/exporter.py +13 -0
  8. factorforge/core/interfaces/optimizer.py +85 -0
  9. factorforge/core/interfaces/validator.py +9 -0
  10. factorforge/database.py +150 -0
  11. factorforge/engines/__init__.py +60 -0
  12. factorforge/engines/ml/__init__.py +0 -0
  13. factorforge/engines/ml/plant_optimizer.py +325 -0
  14. factorforge/engines/registry.py +141 -0
  15. factorforge/engines/v1_archived/__init__.py +15 -0
  16. factorforge/engines/v2/__init__.py +13 -0
  17. factorforge/engines/v2/codon_table_builder.py +107 -0
  18. factorforge/engines/v2/construct_builder.py +403 -0
  19. factorforge/engines/v2/exporter.py +455 -0
  20. factorforge/engines/v2/optimizer.py +190 -0
  21. factorforge/engines/v2/pipeline.py +275 -0
  22. factorforge/engines/v2/rules/__init__.py +3 -0
  23. factorforge/engines/v2/rules/domesticator.py +403 -0
  24. factorforge/engines/v2/rules/reverse_translator.py +765 -0
  25. factorforge/engines/v2/rules/rule_engine.py +867 -0
  26. factorforge/engines/v2/scoring.py +232 -0
  27. factorforge/engines/v2/utils.py +231 -0
  28. factorforge/engines/v2/validator.py +383 -0
  29. factorforge/engines/v3/__init__.py +12 -0
  30. factorforge/engines/v3/explain.py +119 -0
  31. factorforge/engines/v3/inference/__init__.py +6 -0
  32. factorforge/engines/v3/inference/constrained_decoder.py +80 -0
  33. factorforge/engines/v3/inference/v2_adapter.py +72 -0
  34. factorforge/engines/v3/metrics.py +145 -0
  35. factorforge/engines/v3/modeling_bart_decoder.py +127 -0
  36. factorforge/engines/v3/pipeline.py +192 -0
  37. factorforge/engines/v3/synonym_mask.py +61 -0
  38. factorforge/engines/v3/tokenizer.py +192 -0
  39. factorforge/ml/__init__.py +33 -0
  40. factorforge/ml/feasibility.py +199 -0
  41. factorforge/ml/metrics.py +295 -0
  42. factorforge/utils/__init__.py +31 -0
  43. factorforge/utils/construct_id.py +8 -0
  44. factorforge/utils/exceptions.py +32 -0
  45. factorforge/utils/sequence_validator.py +189 -0
  46. factorforge/utils/validation.py +104 -0
  47. factorforge_cds-3.0.0.dist-info/METADATA +475 -0
  48. factorforge_cds-3.0.0.dist-info/RECORD +52 -0
  49. factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
  50. factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
  51. factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
  52. factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,295 @@
1
+ """Shared sequence metrics for v3-alpha evaluation and validation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from collections import Counter
7
+ from typing import Any
8
+
9
+
10
# Amino acids of the standard genetic code, listed for codons enumerated with
# bases in T, C, A, G order at every position (first base varies slowest).
_CODON_BASE_ORDER = "TCAG"
_AMINO_ACIDS_BY_CODON = (
    "FFLLSSSSYY**CC*W"  # codons starting with T
    "LLLLPPPPHHQQRRRR"  # codons starting with C
    "IIIMTTTTNNKKSSRR"  # codons starting with A
    "VVVVAAAADDEEGGGG"  # codons starting with G
)

# Maps each of the 64 DNA codons to its one-letter amino acid; "*" marks a stop.
STANDARD_GENETIC_CODE: dict[str, str] = {
    first + second + third: _AMINO_ACIDS_BY_CODON[16 * i + 4 * j + k]
    for i, first in enumerate(_CODON_BASE_ORDER)
    for j, second in enumerate(_CODON_BASE_ORDER)
    for k, third in enumerate(_CODON_BASE_ORDER)
}

# Codons that translate to "*".
STOP_CODONS = {
    triplet for triplet, residue in STANDARD_GENETIC_CODE.items() if residue == "*"
}
# The only bases accepted inside a codon.
VALID_BASES = {"A", "T", "G", "C"}
79
+
80
+
81
def _normalize_dna(sequence: str) -> str:
    """Upper-case the input, rewrite U as T, and remove all whitespace."""
    as_dna = sequence.upper().replace("U", "T")
    return "".join(as_dna.split())
83
+
84
+
85
def _codons(sequence: str, include_partial: bool = False) -> list[str]:
    """Split the normalized sequence into consecutive three-base chunks.

    A trailing fragment shorter than three bases is dropped unless
    ``include_partial`` is true, in which case it is returned as the last item.
    """
    normalized = _normalize_dna(sequence)
    stop = len(normalized)
    if not include_partial:
        # Trim down to the last complete triplet.
        stop -= stop % 3
    return [normalized[offset : offset + 3] for offset in range(0, stop, 3)]
89
+
90
+
91
def calculate_gc(sequence: str) -> float:
    """Return the GC content of ``sequence`` as a percentage (0-100).

    The sequence is normalized first; an empty sequence yields 0.0.
    """
    normalized = _normalize_dna(sequence)
    total = len(normalized)
    if total == 0:
        return 0.0
    gc_bases = normalized.count("G") + normalized.count("C")
    return (gc_bases / total) * 100.0
97
+
98
+
99
def calculate_gc_windows(
    sequence: str,
    window_size: int = 60,
    step: int = 30,
) -> list[dict[str, float | int]]:
    """Compute GC percentages over sliding windows of the normalized sequence.

    Args:
        sequence: Input sequence.
        window_size: Number of bases per window.
        step: Distance between consecutive window starts.

    Returns:
        A list of ``{"start", "end", "gc"}`` dicts. An empty sequence yields an
        empty list; a sequence no longer than ``window_size`` yields a single
        window spanning all of it. When the stride leaves bases uncovered at
        the 3' end, one extra window aligned to the end is appended.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be > 0")
    if step <= 0:
        raise ValueError("step must be > 0")

    seq = _normalize_dna(sequence)
    total = len(seq)
    if total == 0:
        return []
    if total <= window_size:
        return [{"start": 0, "end": total, "gc": calculate_gc(seq)}]

    last_start = total - window_size
    windows: list[dict[str, float | int]] = [
        {
            "start": begin,
            "end": begin + window_size,
            "gc": calculate_gc(seq[begin : begin + window_size]),
        }
        for begin in range(0, last_start + 1, step)
    ]

    # Cover the tail if the regular stride stopped short of the sequence end.
    if windows:
        final = windows[-1]
        if int(final["end"]) < total and last_start != int(final["start"]):
            windows.append(
                {"start": last_start, "end": total, "gc": calculate_gc(seq[last_start:])}
            )
    return windows
125
+
126
+
127
def calculate_first_region_gc(
    sequence: str,
    region_sizes: list[int] | None = None,
) -> dict[str, float]:
    """Compute GC percentages for prefixes of the normalized sequence.

    Args:
        sequence: Input sequence.
        region_sizes: Prefix lengths to evaluate; 30, 60 and 90 are used when
            this is ``None`` or empty.

    Returns:
        A dict keyed ``first_<size>nt_gc``. A prefix longer than the sequence
        is evaluated over the whole sequence.

    Raises:
        ValueError: If any region size is not positive.
    """
    seq = _normalize_dna(sequence)
    gc_by_region: dict[str, float] = {}
    for size in region_sizes or [30, 60, 90]:
        if size <= 0:
            raise ValueError("region sizes must be > 0")
        # Slicing already clamps to the sequence length for positive sizes.
        gc_by_region[f"first_{size}nt_gc"] = calculate_gc(seq[:size])
    return gc_by_region
141
+
142
+
143
def translate_dna(sequence: str) -> str:
    """Translate complete codons to one-letter amino acids.

    Codons missing from ``STANDARD_GENETIC_CODE`` become ``"X"``; a trailing
    partial codon is ignored.
    """
    return "".join(
        STANDARD_GENETIC_CODE.get(triplet, "X") for triplet in _codons(sequence)
    )
149
+
150
+
151
def amino_acid_identity(input_protein: str, dna_sequence: str) -> float:
    """Return the exact-position amino acid identity of translated DNA.

    Args:
        input_protein: Expected protein; case and whitespace are ignored.
        dna_sequence: Candidate coding sequence, translated with
            ``translate_dna``.

    Returns:
        The fraction of matching positions in the range 0.0-1.0, or 0.0 when
        the expected protein is empty. A single trailing stop in the
        translation is ignored when the expected protein does not itself end
        with ``"*"``.

    The score is divided by the longer of the two sequences. Position-wise
    comparison stops at the shorter one, so dividing by the expected length
    alone would report 1.0 for a DNA that encodes extra trailing residues.
    """
    expected = "".join(input_protein.upper().split())
    observed = translate_dna(dna_sequence)
    if observed.endswith("*") and not expected.endswith("*"):
        observed = observed[:-1]
    if not expected:
        return 0.0
    matches = sum(1 for exp, obs in zip(expected, observed) if exp == obs)
    # Residues present on only one side count as mismatches.
    return matches / max(len(expected), len(observed))
161
+
162
+
163
def count_internal_stops(dna_sequence: str) -> int:
    """Count stop codons among all complete codons except the last one."""
    leading_codons = _codons(dna_sequence)[:-1]
    return len([triplet for triplet in leading_codons if triplet in STOP_CODONS])
167
+
168
+
169
def calculate_cai(sequence: str, codon_weights: dict[str, float]) -> float:
    """Compute CAI as the geometric mean of the supplied codon weights.

    Args:
        sequence: Coding sequence; normalized before use.
        codon_weights: Weight for each codon.

    Returns:
        The geometric mean of the weights of all non-stop codons. Returns 0.0
        when the sequence is empty, its length is not a multiple of three, any
        non-stop codon has a missing or non-positive weight, or there are no
        non-stop codons.
    """
    seq = _normalize_dna(sequence)
    if not seq or len(seq) % 3 != 0:
        return 0.0

    # Stop codons do not contribute to the index.
    coding = [triplet for triplet in _codons(seq) if triplet not in STOP_CODONS]

    total_log = 0.0
    for triplet in coding:
        weight = codon_weights.get(triplet)
        if weight is None or weight <= 0:
            return 0.0
        total_log += math.log(weight)

    if not coding:
        return 0.0
    return math.exp(total_log / len(coding))
186
+
187
+
188
def codon_usage_profile(sequence: str) -> dict[str, dict[str, float | int | str]]:
    """Summarize codon usage for the complete codons of ``sequence``.

    Returns:
        A dict keyed by codon in sorted order. Each value holds the codon's
        ``"count"``, its ``"frequency"`` among all codons, and its ``"aa"``
        translation (``"X"`` when the codon is not in the genetic code).
    """
    tally = Counter(_codons(sequence))
    codon_total = sum(tally.values())
    return {
        triplet: {
            "count": tally[triplet],
            "frequency": tally[triplet] / codon_total if codon_total else 0.0,
            "aa": STANDARD_GENETIC_CODE.get(triplet, "X"),
        }
        for triplet in sorted(tally)
    }
201
+
202
+
203
def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]:
    """Report single-base runs whose length is at least ``max_run``.

    Args:
        sequence: Input sequence; normalized before scanning.
        max_run: Shortest run length that is reported.

    Returns:
        One ``{"start", "end", "base", "length"}`` dict per qualifying run, in
        order of position; ``end`` is exclusive.

    Raises:
        ValueError: If ``max_run`` is not greater than 1.
    """
    if max_run <= 1:
        raise ValueError("max_run must be > 1")

    seq = _normalize_dna(sequence)
    total = len(seq)
    findings: list[dict[str, Any]] = []

    run_start = 0
    while run_start < total:
        run_base = seq[run_start]
        # Advance to the first position that breaks the current run.
        run_end = run_start + 1
        while run_end < total and seq[run_end] == run_base:
            run_end += 1

        run_length = run_end - run_start
        if run_length >= max_run:
            findings.append(
                {"start": run_start, "end": run_end, "base": run_base, "length": run_length}
            )
        run_start = run_end
    return findings
232
+
233
+
234
def detect_repeats(sequence: str) -> list[dict[str, Any]]:
    """Find tandem repeats of 2-6 base motifs repeated at least three times.

    Motif lengths are scanned from shortest to longest. A span already
    reported for a shorter motif is not reported again for a longer one.

    Returns:
        One ``{"start", "end", "motif", "repeat_count"}`` dict per repeat;
        ``end`` is exclusive.
    """
    seq = _normalize_dna(sequence)
    total = len(seq)
    findings: list[dict[str, Any]] = []
    seen_spans: set[tuple[int, int]] = set()

    for unit in range(2, 7):
        position = 0
        # A repeat needs room for at least three copies of the motif.
        last_position = total - unit * 3
        while position <= last_position:
            motif = seq[position : position + unit]
            copies = 1
            stop = position + unit
            while seq[stop : stop + unit] == motif:
                copies += 1
                stop += unit

            if copies < 3:
                position += 1
                continue

            if (position, stop) not in seen_spans:
                findings.append(
                    {
                        "start": position,
                        "end": stop,
                        "motif": motif,
                        "repeat_count": copies,
                    }
                )
                seen_spans.add((position, stop))
            # Resume scanning after the repeat just consumed.
            position = stop
    return findings
265
+
266
+
267
def detect_forbidden_motifs(sequence: str, motifs: list[str]) -> list[dict[str, Any]]:
    """Locate every exact occurrence of each motif, including overlapping ones.

    Both the sequence and the motifs are normalized before matching; motifs
    that normalize to an empty string are skipped.

    Returns:
        One ``{"start", "end", "motif"}`` dict per hit, grouped by motif in
        the order given; ``end`` is exclusive.
    """
    seq = _normalize_dna(sequence)
    findings: list[dict[str, Any]] = []
    for raw_motif in motifs:
        needle = _normalize_dna(raw_motif)
        if not needle:
            continue
        width = len(needle)
        hit = seq.find(needle)
        while hit != -1:
            findings.append({"start": hit, "end": hit + width, "motif": needle})
            # Step one base forward so overlapping hits are found too.
            hit = seq.find(needle, hit + 1)
    return findings
280
+
281
+
282
def detect_invalid_codons(sequence: str) -> list[dict[str, Any]]:
    """Report codons that are partial, contain non-ATGC bases, or are unknown.

    Returns:
        One ``{"start", "end", "codon", "reason"}`` dict per problem codon,
        where ``reason`` is ``"partial"``, ``"invalid_base"`` or ``"unknown"``.
    """
    seq = _normalize_dna(sequence)
    findings: list[dict[str, Any]] = []
    for position, triplet in enumerate(_codons(seq, include_partial=True)):
        offset = position * 3
        if len(triplet) != 3:
            # Only the trailing fragment can be short; it ends with the sequence.
            reason, stop = "partial", len(seq)
        elif set(triplet) - VALID_BASES:
            reason, stop = "invalid_base", offset + 3
        elif triplet not in STANDARD_GENETIC_CODE:
            reason, stop = "unknown", offset + 3
        else:
            continue
        findings.append({"start": offset, "end": stop, "codon": triplet, "reason": reason})
    return findings
295
+
@@ -0,0 +1,31 @@
1
+ """Utility helpers for FactorForge."""
2
+
3
+ from .exceptions import (
4
+ FactorForgeError,
5
+ CodonTableError,
6
+ EmptyCandidateError,
7
+ FileFormatError,
8
+ OptimizationError,
9
+ SequenceValidationError,
10
+ )
11
+ from .sequence_validator import (
12
+ detect_sequence_type,
13
+ validate_cds_output,
14
+ validate_and_normalize,
15
+ validate_dna_sequence,
16
+ validate_protein_sequence,
17
+ )
18
+
19
+ __all__ = [
20
+ "FactorForgeError",
21
+ "SequenceValidationError",
22
+ "OptimizationError",
23
+ "EmptyCandidateError",
24
+ "FileFormatError",
25
+ "CodonTableError",
26
+ "detect_sequence_type",
27
+ "validate_cds_output",
28
+ "validate_and_normalize",
29
+ "validate_dna_sequence",
30
+ "validate_protein_sequence",
31
+ ]
@@ -0,0 +1,8 @@
1
+ """Utilities for generating construct IDs."""
2
+ from datetime import datetime
3
+
4
+
5
def generate_construct_id() -> str:
    """Return a construct ID of the form ``CF-YYYYMMDD-HHMMSS``.

    The ID is built from the current local time at one-second resolution, so
    two calls within the same second return the same value.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return "CF-" + timestamp
@@ -0,0 +1,32 @@
1
+ """Custom exceptions for FactorForge."""
2
+
3
+
4
class FactorForgeError(Exception):
    """Root of the FactorForge exception hierarchy."""


class SequenceValidationError(FactorForgeError):
    """Signals that a sequence failed validation."""


class OptimizationError(FactorForgeError):
    """Signals that optimization failed."""


class EmptyCandidateError(OptimizationError):
    """Signals that no valid codon candidates were generated.

    Attributes:
        amino_acid: The amino acid for which no candidate was available.
    """

    def __init__(self, amino_acid: str, reason: str = "") -> None:
        """Build the message, appending ``reason`` when one is given."""
        message = f"No valid codon candidates for amino acid '{amino_acid}'"
        super().__init__(f"{message}: {reason}" if reason else message)
        self.amino_acid = amino_acid


class FileFormatError(FactorForgeError):
    """Signals an invalid file format."""


class CodonTableError(FactorForgeError):
    """Signals an invalid or missing codon table."""
@@ -0,0 +1,189 @@
1
+ """Sequence validation and type detection utilities."""
2
+
3
+ from typing import Literal, Tuple
4
+
5
+ from factorforge.ml.metrics import detect_invalid_codons, translate_dna
6
+
7
+ from .exceptions import SequenceValidationError
8
+
9
# Letters accepted as DNA input: the four bases plus "N".
DNA_CHARS = {"A", "T", "G", "C", "N"}
# Letters that leave the DNA-vs-protein question open: the four bases plus "M".
AMBIGUOUS_DNA_CHARS = {"A", "T", "G", "C", "M"}
# Every letter accepted by validate_and_normalize: the 20 amino-acid codes and "*".
VALID_CHARS = set("ACDEFGHIKLMNPQRSTVWY") | {"*"}
# Amino-acid letters that belong to neither DNA alphabet above.
PROTEIN_ONLY_CHARS = VALID_CHARS - DNA_CHARS - AMBIGUOUS_DNA_CHARS - {"*"}
# Shortest sequence that is classified as DNA.
MIN_DNA_LEN = 6
14
+
15
+
16
def _clean_sequence(seq: str) -> str:
    """Upper-case ``seq`` and remove every whitespace character."""
    pieces = seq.upper().split()
    return "".join(pieces)
18
+
19
+
20
def detect_sequence_type(seq: str) -> Literal["dna", "protein", "ambiguous"]:
    """
    Classify an input sequence as DNA, protein, or ambiguous.

    Case and whitespace are ignored.

    Args:
        seq: Input sequence string

    Returns:
        "dna": Only A/T/G/C/N, at least MIN_DNA_LEN long, and a multiple of 3
        "protein": Contains a protein-only letter, or matches neither
            DNA alphabet
        "ambiguous": Empty, DNA letters with an unsuitable length, or only
            A/T/G/C/M

    Examples:
        >>> detect_sequence_type("ATGGCC")
        'dna'
        >>> detect_sequence_type("MKKGEL")
        'protein'
        >>> detect_sequence_type("MA")
        'ambiguous'
    """
    cleaned = _clean_sequence(seq)
    if not cleaned:
        return "ambiguous"

    letters = set(cleaned)

    # Any letter that cannot be a base settles the question.
    if not letters.isdisjoint(PROTEIN_ONLY_CHARS):
        return "protein"

    if letters.issubset(DNA_CHARS):
        # DNA letters alone are not enough; the length must also fit whole codons.
        codon_shaped = len(cleaned) >= MIN_DNA_LEN and len(cleaned) % 3 == 0
        return "dna" if codon_shaped else "ambiguous"

    if letters.issubset(AMBIGUOUS_DNA_CHARS):
        return "ambiguous"

    # Everything else is treated as protein.
    return "protein"
62
+
63
+
64
def validate_and_normalize(
    seq: str,
    expected_type: Literal["dna", "protein", "auto"] = "auto",
) -> Tuple[str, Literal["dna", "protein"]]:
    """
    Normalize a sequence and settle on its type.

    With "auto", an ambiguous sequence is reported as DNA when it is at least
    MIN_DNA_LEN long and a multiple of 3, and as protein otherwise. With an
    explicit type, an ambiguous sequence is accepted as that type.

    Args:
        seq: Input sequence
        expected_type: Expected type or "auto" for auto-detection

    Returns:
        (normalized_sequence, sequence_type)

    Raises:
        SequenceValidationError: If the sequence is empty, contains characters
            outside VALID_CHARS, or is detected as a different type than expected

    Examples:
        >>> validate_and_normalize("atggcc", "auto")
        ('ATGGCC', 'dna')
        >>> validate_and_normalize("MKKGEL", "protein")
        ('MKKGEL', 'protein')
    """
    seq_clean = _clean_sequence(seq)
    if not seq_clean:
        raise SequenceValidationError("Empty sequence provided")

    invalid_chars = set(seq_clean).difference(VALID_CHARS)
    if invalid_chars:
        raise SequenceValidationError(
            f"Invalid characters in sequence: {', '.join(sorted(invalid_chars))}"
        )

    detected = detect_sequence_type(seq_clean)

    if expected_type != "auto":
        # An ambiguous detection never contradicts the caller's expectation.
        if detected not in (expected_type, "ambiguous"):
            raise SequenceValidationError(
                f"Expected {expected_type} sequence but detected {detected}. "
                f"Sequence: {seq_clean[:20]}{'...' if len(seq_clean) > 20 else ''}"
            )
        return seq_clean, expected_type

    if detected != "ambiguous":
        return seq_clean, detected

    # Auto mode, ambiguous input: fall back on the length heuristic.
    codon_shaped = len(seq_clean) >= MIN_DNA_LEN and len(seq_clean) % 3 == 0
    return seq_clean, ("dna" if codon_shaped else "protein")
114
+
115
+
116
def validate_dna_sequence(seq: str) -> str:
    """
    Check that a sequence uses only the letters in DNA_CHARS.

    Args:
        seq: DNA sequence

    Returns:
        The upper-cased sequence with whitespace removed

    Raises:
        SequenceValidationError: If any other character is present
    """
    seq_clean = _clean_sequence(seq)
    invalid = {letter for letter in seq_clean if letter not in DNA_CHARS}
    if not invalid:
        return seq_clean
    raise SequenceValidationError(f"Invalid DNA characters: {', '.join(sorted(invalid))}")
135
+
136
+
137
def validate_protein_sequence(seq: str) -> str:
    """
    Check that a sequence uses only the 20 amino-acid letters and "*".

    Args:
        seq: Protein sequence

    Returns:
        The upper-cased sequence with whitespace removed

    Raises:
        SequenceValidationError: If any other character is present
    """
    allowed = set("ACDEFGHIKLMNPQRSTVWY*")
    seq_clean = _clean_sequence(seq)
    invalid = set(seq_clean).difference(allowed)
    if not invalid:
        return seq_clean
    raise SequenceValidationError(
        f"Invalid amino acid characters: {', '.join(sorted(invalid))}"
    )
159
+
160
+
161
def validate_cds_output(input_protein: str, dna_sequence: str) -> dict[str, object]:
    """Run the hard-fail checks on a generated CDS against its input protein.

    Only errors that should block a generated CDS from being recommended are
    reported. Trailing ``*`` characters are ignored on both the expected and
    the translated protein before they are compared.

    Returns:
        ``{"passed": bool, "errors": list[str]}``; ``passed`` is true exactly
        when no error was recorded.
    """
    expected = _clean_sequence(input_protein).rstrip("*")
    seq = _clean_sequence(dna_sequence).replace("U", "T")

    invalid = detect_invalid_codons(seq)
    translated = translate_dna(seq)
    observed = translated.rstrip("*")

    # Each entry pairs a failure condition with the error it produces.
    checks = [
        (len(seq) % 3 != 0, "length_not_divisible_by_3"),
        (bool(invalid), f"invalid_codons: {invalid[:3]}"),
        ("*" in translated[:-1], "internal_stop_codon"),
        (
            expected != observed,
            f"aa_mismatch: expected_len={len(expected)} observed_len={len(observed)}",
        ),
    ]
    errors: list[str] = [message for failed, message in checks if failed]

    return {"passed": not errors, "errors": errors}
@@ -0,0 +1,104 @@
1
+ """Structured candidate DNA validation for v3-alpha."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from factorforge.ml.metrics import (
8
+ amino_acid_identity,
9
+ calculate_first_region_gc,
10
+ calculate_gc,
11
+ calculate_gc_windows,
12
+ count_internal_stops,
13
+ detect_forbidden_motifs,
14
+ detect_homopolymers,
15
+ detect_invalid_codons,
16
+ detect_repeats,
17
+ )
18
+
19
+
20
# Defaults for validate_candidate_sequence; caller-supplied keys override these.
DEFAULT_CONFIG: dict[str, Any] = dict(
    # Local GC percentages outside [low, high] count as outliers.
    gc_window_low=30.0,
    gc_window_high=70.0,
    # Sliding-window geometry, in bases.
    gc_window_size=60,
    gc_window_step=30,
    # Motifs to search for, and whether a hit is an error instead of a warning.
    forbidden_motifs=[],
    fail_forbidden_motifs=False,
    # Passed to detect_homopolymers as max_run.
    homopolymer_max_run=6,
)
29
+
30
+
31
def validate_candidate_sequence(
    input_protein: str,
    dna_sequence: str,
    config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Validate a candidate CDS and return a machine-readable report.

    Args:
        input_protein: Protein the candidate is expected to encode.
        dna_sequence: Candidate sequence; upper-cased, U mapped to T, and
            stripped of whitespace before analysis.
        config: Overrides for ``DEFAULT_CONFIG``.

    Returns:
        A dict with the computed metrics, a ``"warnings"`` list, an
        ``"errors"`` list, and ``"passed"``, which is true exactly when no
        error was recorded. Forbidden motifs are an error only when
        ``fail_forbidden_motifs`` is set; otherwise they are a warning.
    """
    cfg = {**DEFAULT_CONFIG, **(config or {})}
    seq = "".join(dna_sequence.upper().replace("U", "T").split())

    # Measurements. The order is kept so helper exceptions surface as before.
    invalid_codons = detect_invalid_codons(seq)
    internal_stop_count = count_internal_stops(seq)
    identity = amino_acid_identity(input_protein, seq) if seq else 0.0
    gc = calculate_gc(seq)
    window_values = [
        float(window["gc"])
        for window in calculate_gc_windows(
            seq,
            window_size=int(cfg["gc_window_size"]),
            step=int(cfg["gc_window_step"]),
        )
    ]
    gc_window_min = min(window_values, default=0.0)
    gc_window_max = max(window_values, default=0.0)
    low = float(cfg["gc_window_low"])
    high = float(cfg["gc_window_high"])
    gc_window_outlier_count = len(
        [value for value in window_values if value < low or value > high]
    )

    first_region = calculate_first_region_gc(seq)
    forbidden = detect_forbidden_motifs(seq, list(cfg.get("forbidden_motifs", [])))
    homopolymers = detect_homopolymers(seq, max_run=int(cfg["homopolymer_max_run"]))
    repeats = detect_repeats(seq)
    motifs_are_fatal = bool(cfg.get("fail_forbidden_motifs"))

    # Hard failures.
    errors: list[str] = []
    if not seq:
        errors.append("empty sequence")
    if len(seq) % 3 != 0:
        errors.append("sequence length is not divisible by 3")
    if identity < 1.0:
        errors.append("amino acid identity is below 1.0")
    if invalid_codons:
        errors.append("invalid codons detected")
    if internal_stop_count:
        errors.append("internal stop codons detected")
    if forbidden and motifs_are_fatal:
        errors.append("forbidden motifs detected")

    # Soft findings.
    warnings: list[str] = []
    if gc_window_outlier_count:
        warnings.append("local GC window outliers detected")
    if forbidden and not motifs_are_fatal:
        warnings.append("forbidden motifs detected")
    if homopolymers:
        warnings.append("homopolymers detected")
    if repeats:
        warnings.append("simple repeats detected")

    report: dict[str, Any] = {
        "passed": not errors,
        "amino_acid_identity": identity,
        "gc": gc,
        "gc_window_min": gc_window_min,
        "gc_window_max": gc_window_max,
        "gc_window_outlier_count": gc_window_outlier_count,
    }
    for size in (30, 60, 90):
        report[f"first_{size}nt_gc"] = first_region[f"first_{size}nt_gc"]
    report.update(
        {
            "internal_stop_count": internal_stop_count,
            "invalid_codon_count": len(invalid_codons),
            "forbidden_motif_count": len(forbidden),
            "homopolymer_count": len(homopolymers),
            "repeat_count": len(repeats),
            "warnings": warnings,
            "errors": errors,
        }
    )
    return report
104
+