factorforge-cds 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factorforge/__init__.py +19 -0
- factorforge/__main__.py +8 -0
- factorforge/cli/__init__.py +5 -0
- factorforge/cli/legacy_cli.py +157 -0
- factorforge/cli/main.py +305 -0
- factorforge/core/interfaces/__init__.py +7 -0
- factorforge/core/interfaces/exporter.py +13 -0
- factorforge/core/interfaces/optimizer.py +85 -0
- factorforge/core/interfaces/validator.py +9 -0
- factorforge/database.py +150 -0
- factorforge/engines/__init__.py +60 -0
- factorforge/engines/ml/__init__.py +0 -0
- factorforge/engines/ml/plant_optimizer.py +325 -0
- factorforge/engines/registry.py +141 -0
- factorforge/engines/v1_archived/__init__.py +15 -0
- factorforge/engines/v2/__init__.py +13 -0
- factorforge/engines/v2/codon_table_builder.py +107 -0
- factorforge/engines/v2/construct_builder.py +403 -0
- factorforge/engines/v2/exporter.py +455 -0
- factorforge/engines/v2/optimizer.py +190 -0
- factorforge/engines/v2/pipeline.py +275 -0
- factorforge/engines/v2/rules/__init__.py +3 -0
- factorforge/engines/v2/rules/domesticator.py +403 -0
- factorforge/engines/v2/rules/reverse_translator.py +765 -0
- factorforge/engines/v2/rules/rule_engine.py +867 -0
- factorforge/engines/v2/scoring.py +232 -0
- factorforge/engines/v2/utils.py +231 -0
- factorforge/engines/v2/validator.py +383 -0
- factorforge/engines/v3/__init__.py +12 -0
- factorforge/engines/v3/explain.py +119 -0
- factorforge/engines/v3/inference/__init__.py +6 -0
- factorforge/engines/v3/inference/constrained_decoder.py +80 -0
- factorforge/engines/v3/inference/v2_adapter.py +72 -0
- factorforge/engines/v3/metrics.py +145 -0
- factorforge/engines/v3/modeling_bart_decoder.py +127 -0
- factorforge/engines/v3/pipeline.py +192 -0
- factorforge/engines/v3/synonym_mask.py +61 -0
- factorforge/engines/v3/tokenizer.py +192 -0
- factorforge/ml/__init__.py +33 -0
- factorforge/ml/feasibility.py +199 -0
- factorforge/ml/metrics.py +295 -0
- factorforge/utils/__init__.py +31 -0
- factorforge/utils/construct_id.py +8 -0
- factorforge/utils/exceptions.py +32 -0
- factorforge/utils/sequence_validator.py +189 -0
- factorforge/utils/validation.py +104 -0
- factorforge_cds-3.0.0.dist-info/METADATA +475 -0
- factorforge_cds-3.0.0.dist-info/RECORD +52 -0
- factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
- factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
- factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
- factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""Shared sequence metrics for v3-alpha evaluation and validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Base order used to enumerate codons; matches the row order of the table below.
_CODON_BASE_ORDER = "TCAG"

# One amino-acid letter per codon, enumerated first base -> second base ->
# third base in ``_CODON_BASE_ORDER``. "*" marks a stop codon.
_AMINO_ACIDS_BY_CODON = (
    "FFLLSSSSYY**CC*W"  # T__
    "LLLLPPPPHHQQRRRR"  # C__
    "IIIMTTTTNNKKSSRR"  # A__
    "VVVVAAAADDEEGGGG"  # G__
)

# Codon -> one-letter amino acid (or "*" for stop) for all 64 DNA codons.
STANDARD_GENETIC_CODE: dict[str, str] = dict(
    zip(
        (
            first + second + third
            for first in _CODON_BASE_ORDER
            for second in _CODON_BASE_ORDER
            for third in _CODON_BASE_ORDER
        ),
        _AMINO_ACIDS_BY_CODON,
    )
)

# Codons that translate to "*" in the table above.
STOP_CODONS = {
    codon for codon in STANDARD_GENETIC_CODE if STANDARD_GENETIC_CODE[codon] == "*"
}
# The four unambiguous DNA bases.
VALID_BASES = set(_CODON_BASE_ORDER)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _normalize_dna(sequence: str) -> str:
    """Return *sequence* upper-cased, with all whitespace removed and U mapped to T."""
    compact = "".join(sequence.split())
    return compact.upper().replace("U", "T")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _codons(sequence: str, include_partial: bool = False) -> list[str]:
    """Split the normalized sequence into consecutive 3-base chunks.

    When ``include_partial`` is false, trailing bases that do not fill a
    whole codon are dropped; otherwise the final chunk may hold 1-2 bases.
    """
    seq = _normalize_dna(sequence)
    stop = len(seq)
    if not include_partial:
        # Trim back to the last complete codon boundary.
        stop -= stop % 3
    return [seq[offset : offset + 3] for offset in range(0, stop, 3)]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def calculate_gc(sequence: str) -> float:
    """Return the GC content of *sequence* as a percentage (0-100).

    The sequence is normalized first; an empty sequence yields ``0.0``.
    """
    seq = _normalize_dna(sequence)
    length = len(seq)
    if length == 0:
        return 0.0
    gc_bases = seq.count("G") + seq.count("C")
    return (gc_bases / length) * 100.0
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def calculate_gc_windows(
    sequence: str,
    window_size: int = 60,
    step: int = 30,
) -> list[dict[str, float | int]]:
    """Return GC percentages for sliding windows over the sequence.

    Each entry is ``{"start", "end", "gc"}`` with a half-open ``[start, end)``
    span. A sequence no longer than ``window_size`` produces a single window
    covering all of it. When the stepped windows do not reach the end of the
    sequence, one extra full-size window aligned to the end is appended.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be > 0")
    if step <= 0:
        raise ValueError("step must be > 0")

    seq = _normalize_dna(sequence)
    total = len(seq)
    if total == 0:
        return []
    if total <= window_size:
        return [{"start": 0, "end": total, "gc": calculate_gc(seq)}]

    final_start = total - window_size
    starts = list(range(0, final_start + 1, step))
    # The stepped windows stopped short of the end: add an end-aligned window
    # so the tail of the sequence is always covered.
    if starts[-1] != final_start:
        starts.append(final_start)

    return [
        {
            "start": begin,
            "end": begin + window_size,
            "gc": calculate_gc(seq[begin : begin + window_size]),
        }
        for begin in starts
    ]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def calculate_first_region_gc(
    sequence: str,
    region_sizes: list[int] | None = None,
) -> dict[str, float]:
    """Return GC percentages for prefixes of the sequence.

    Args:
        sequence: DNA sequence; normalized before measuring.
        region_sizes: Prefix lengths in nucleotides. ``None`` or an empty
            list falls back to ``[30, 60, 90]``.

    Returns:
        Mapping ``"first_{size}nt_gc"`` -> GC percentage of the first
        ``size`` bases (or of the whole sequence when it is shorter).

    Raises:
        ValueError: If any region size is not positive.
    """
    seq = _normalize_dna(sequence)
    available = len(seq)
    gc_by_region: dict[str, float] = {}
    for size in region_sizes or [30, 60, 90]:
        if size <= 0:
            raise ValueError("region sizes must be > 0")
        prefix = seq[: min(size, available)]
        gc_by_region[f"first_{size}nt_gc"] = calculate_gc(prefix)
    return gc_by_region
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def translate_dna(sequence: str) -> str:
    """Translate DNA into one-letter amino acids.

    Only complete codons are translated; codons missing from
    ``STANDARD_GENETIC_CODE`` become ``"X"`` and stop codons become ``"*"``.
    """
    return "".join(
        STANDARD_GENETIC_CODE.get(codon, "X") for codon in _codons(sequence)
    )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def amino_acid_identity(input_protein: str, dna_sequence: str) -> float:
    """Return the exact-position amino acid identity of translated DNA.

    Args:
        input_protein: Expected protein; upper-cased and stripped of
            whitespace before comparison.
        dna_sequence: Candidate coding sequence, translated with
            ``translate_dna``.

    Returns:
        Fraction in ``[0.0, 1.0]`` of positions where the translated residue
        equals the expected one, normalized by the longer of the two
        sequences. ``0.0`` when the expected protein is empty.
    """
    expected = "".join(input_protein.upper().split())
    observed = translate_dna(dna_sequence)
    # A terminal stop is not part of the protein unless the caller spelled it
    # out in ``input_protein``.
    if observed.endswith("*") and not expected.endswith("*"):
        observed = observed[:-1]
    if not expected:
        return 0.0
    matches = sum(1 for exp, obs in zip(expected, observed) if exp == obs)
    # ``zip`` stops at the shorter sequence, so dividing by ``len(expected)``
    # alone would score a translation with extra trailing residues as a
    # perfect match. Normalizing by the longer length penalizes both
    # truncated and over-long translations.
    return matches / max(len(expected), len(observed))
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def count_internal_stops(dna_sequence: str) -> int:
    """Count stop codons among all complete codons except the last one."""
    internal = _codons(dna_sequence)[:-1]
    return len([codon for codon in internal if codon in STOP_CODONS])
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def calculate_cai(sequence: str, codon_weights: dict[str, float]) -> float:
    """Return the CAI: the geometric mean of the supplied codon weights.

    Stop codons are skipped. The result is ``0.0`` when the sequence is
    empty, its length is not a multiple of 3, any non-stop codon has no
    weight or a non-positive weight, or no codon contributes.
    """
    seq = _normalize_dna(sequence)
    if len(seq) == 0 or len(seq) % 3:
        return 0.0

    total_log = 0.0
    scored = 0
    for codon in _codons(seq):
        if codon in STOP_CODONS:
            continue
        weight = codon_weights.get(codon)
        # log() is undefined for non-positive weights; treat as unscorable.
        if weight is None or weight <= 0:
            return 0.0
        total_log += math.log(weight)
        scored += 1

    if scored == 0:
        return 0.0
    return math.exp(total_log / scored)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def codon_usage_profile(sequence: str) -> dict[str, dict[str, float | int | str]]:
    """Return per-codon usage statistics for the complete codons of a sequence.

    The result maps each observed codon (in sorted order) to its ``"count"``,
    its ``"frequency"`` among all complete codons, and its ``"aa"``
    translation (``"X"`` when the codon is not in the genetic code table).
    """
    tally = Counter(_codons(sequence))
    total = sum(tally.values())
    return {
        codon: {
            "count": occurrences,
            "frequency": occurrences / total if total else 0.0,
            "aa": STANDARD_GENETIC_CODE.get(codon, "X"),
        }
        for codon, occurrences in sorted(tally.items())
    }
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def detect_homopolymers(sequence: str, max_run: int = 6) -> list[dict[str, Any]]:
    """Find single-base runs of length ``max_run`` or longer.

    Note that ``max_run`` is an inclusive reporting threshold: a run of
    exactly ``max_run`` bases is reported.

    Returns:
        One ``{"start", "end", "base", "length"}`` dict per run, in sequence
        order, with a half-open ``[start, end)`` span.

    Raises:
        ValueError: If ``max_run`` is not greater than 1.
    """
    if max_run <= 1:
        raise ValueError("max_run must be > 1")

    seq = _normalize_dna(sequence)
    total = len(seq)
    runs: list[dict[str, Any]] = []

    begin = 0
    while begin < total:
        base = seq[begin]
        # Advance to the first position that breaks the run.
        stop = begin + 1
        while stop < total and seq[stop] == base:
            stop += 1
        length = stop - begin
        if length >= max_run:
            runs.append({"start": begin, "end": stop, "base": base, "length": length})
        begin = stop
    return runs
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def detect_repeats(sequence: str) -> list[dict[str, Any]]:
    """Find simple tandem repeats: a 2-6 base motif repeated 3 or more times.

    Motif lengths are scanned from 2 up to 6. A span already reported for a
    shorter motif is not reported again for a longer one.

    Returns:
        One ``{"start", "end", "motif", "repeat_count"}`` dict per repeat,
        with a half-open ``[start, end)`` span.
    """
    seq = _normalize_dna(sequence)
    total = len(seq)
    hits: list[dict[str, Any]] = []
    reported_spans: set[tuple[int, int]] = set()

    for unit in range(2, 7):
        # Last position where three copies of the motif can still fit.
        last_start = total - unit * 3
        pos = 0
        while pos <= last_start:
            motif = seq[pos : pos + unit]
            stop = pos + unit
            while seq.startswith(motif, stop):
                stop += unit
            copies = (stop - pos) // unit

            if copies < 3:
                pos += 1
                continue

            span = (pos, stop)
            if span not in reported_spans:
                reported_spans.add(span)
                hits.append(
                    {
                        "start": pos,
                        "end": stop,
                        "motif": motif,
                        "repeat_count": copies,
                    }
                )
            # Resume after the repeat whether or not it was newly reported.
            pos = stop
    return hits
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def detect_forbidden_motifs(sequence: str, motifs: list[str]) -> list[dict[str, Any]]:
    """Find every exact occurrence of each motif, including overlapping ones.

    Both the sequence and the motifs are normalized before matching; motifs
    that are empty after normalization are ignored.

    Returns:
        One ``{"start", "end", "motif"}`` dict per occurrence, grouped by
        motif in the order given, with a half-open ``[start, end)`` span.
    """
    seq = _normalize_dna(sequence)
    hits: list[dict[str, Any]] = []
    for raw_motif in motifs:
        needle = _normalize_dna(raw_motif)
        if not needle:
            continue
        width = len(needle)
        pos = seq.find(needle)
        while pos >= 0:
            hits.append({"start": pos, "end": pos + width, "motif": needle})
            # Step by one base so overlapping occurrences are found too.
            pos = seq.find(needle, pos + 1)
    return hits
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def detect_invalid_codons(sequence: str) -> list[dict[str, Any]]:
    """Report codons that are partial, contain non-ATGC bases, or are unknown.

    Returns:
        One ``{"start", "end", "codon", "reason"}`` dict per problem codon,
        where ``reason`` is ``"partial"``, ``"invalid_base"`` or
        ``"unknown"``.
    """
    seq = _normalize_dna(sequence)
    total = len(seq)
    problems: list[dict[str, Any]] = []
    chunks = _codons(seq, include_partial=True)
    for start, codon in zip(range(0, total, 3), chunks):
        if len(codon) != 3:
            # Only the trailing chunk can be short; it ends with the sequence.
            reason, end = "partial", total
        elif not VALID_BASES.issuperset(codon):
            reason, end = "invalid_base", start + 3
        elif codon not in STANDARD_GENETIC_CODE:
            reason, end = "unknown", start + 3
        else:
            continue
        problems.append({"start": start, "end": end, "codon": codon, "reason": reason})
    return problems
|
|
295
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Utility helpers for FactorForge."""
|
|
2
|
+
|
|
3
|
+
from .exceptions import (
|
|
4
|
+
FactorForgeError,
|
|
5
|
+
CodonTableError,
|
|
6
|
+
EmptyCandidateError,
|
|
7
|
+
FileFormatError,
|
|
8
|
+
OptimizationError,
|
|
9
|
+
SequenceValidationError,
|
|
10
|
+
)
|
|
11
|
+
from .sequence_validator import (
|
|
12
|
+
detect_sequence_type,
|
|
13
|
+
validate_cds_output,
|
|
14
|
+
validate_and_normalize,
|
|
15
|
+
validate_dna_sequence,
|
|
16
|
+
validate_protein_sequence,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"FactorForgeError",
|
|
21
|
+
"SequenceValidationError",
|
|
22
|
+
"OptimizationError",
|
|
23
|
+
"EmptyCandidateError",
|
|
24
|
+
"FileFormatError",
|
|
25
|
+
"CodonTableError",
|
|
26
|
+
"detect_sequence_type",
|
|
27
|
+
"validate_cds_output",
|
|
28
|
+
"validate_and_normalize",
|
|
29
|
+
"validate_dna_sequence",
|
|
30
|
+
"validate_protein_sequence",
|
|
31
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Custom exceptions for FactorForge."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class FactorForgeError(Exception):
    """Root of the FactorForge exception hierarchy.

    Catch this to handle any error defined in this module.
    """
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SequenceValidationError(FactorForgeError):
    """Signals that an input sequence did not pass validation."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OptimizationError(FactorForgeError):
    """Signals that an optimization run could not complete."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EmptyCandidateError(OptimizationError):
    """Signals that no valid codon candidates exist for an amino acid.

    Attributes:
        amino_acid: The amino acid for which no candidates were generated.
    """

    def __init__(self, amino_acid: str, reason: str = "") -> None:
        """Build the error message, appending ``reason`` when it is non-empty."""
        self.amino_acid = amino_acid
        suffix = f": {reason}" if reason else ""
        super().__init__(
            f"No valid codon candidates for amino acid '{amino_acid}'{suffix}"
        )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class FileFormatError(FactorForgeError):
    """Signals that a file's format is not valid."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class CodonTableError(FactorForgeError):
    """Signals that a codon table is missing or not valid."""
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Sequence validation and type detection utilities."""
|
|
2
|
+
|
|
3
|
+
from typing import Literal, Tuple
|
|
4
|
+
|
|
5
|
+
from factorforge.ml.metrics import detect_invalid_codons, translate_dna
|
|
6
|
+
|
|
7
|
+
from .exceptions import SequenceValidationError
|
|
8
|
+
|
|
9
|
+
# Letters accepted as DNA: the four bases plus N.
DNA_CHARS = set("ACGTN")
# The four bases plus M, which is both an amino-acid letter and an IUPAC
# nucleotide code, so a sequence limited to these letters may be either type.
AMBIGUOUS_DNA_CHARS = set("ACGTM")
# Amino-acid letters that appear in neither DNA alphabet above.
PROTEIN_ONLY_CHARS = set("DEFHIKLPQRSVWY")
# Every letter accepted by validate_and_normalize: 20 amino acids plus "*".
VALID_CHARS = set("ACDEFGHIKLMNPQRSTVWY*")
# Shortest sequence (two codons) that is classified as DNA.
MIN_DNA_LEN = 6
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _clean_sequence(seq: str) -> str:
    """Return *seq* upper-cased with all whitespace removed."""
    compact = "".join(seq.split())
    return compact.upper()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def detect_sequence_type(seq: str) -> Literal["dna", "protein", "ambiguous"]:
    """
    Classify a sequence as DNA, protein, or ambiguous.

    The sequence is upper-cased and stripped of whitespace first.

    Args:
        seq: Input sequence string

    Returns:
        "protein": Contains at least one protein-only letter, or contains
            letters outside both DNA alphabets
        "dna": Only A/T/G/C/N, at least MIN_DNA_LEN long, and a length that
            is a multiple of 3
        "ambiguous": Empty, DNA letters with a length that is too short or
            not a multiple of 3, or only A/T/G/C/M

    Examples:
        >>> detect_sequence_type("ATGGCC")
        'dna'
        >>> detect_sequence_type("MKKGEL")
        'protein'
        >>> detect_sequence_type("MA")
        'ambiguous'
    """
    cleaned = _clean_sequence(seq)
    if not cleaned:
        return "ambiguous"

    alphabet = set(cleaned)

    # Any protein-only letter settles the question immediately.
    if not alphabet.isdisjoint(PROTEIN_ONLY_CHARS):
        return "protein"

    if alphabet.issubset(DNA_CHARS):
        # DNA letters alone are not enough: require whole codons as well.
        whole_codons = len(cleaned) >= MIN_DNA_LEN and len(cleaned) % 3 == 0
        return "dna" if whole_codons else "ambiguous"

    # Only A/T/G/C/M left: M could be methionine or the IUPAC nucleotide code.
    if alphabet.issubset(AMBIGUOUS_DNA_CHARS):
        return "ambiguous"

    # Anything else is treated as protein.
    return "protein"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def validate_and_normalize(
    seq: str,
    expected_type: Literal["dna", "protein", "auto"] = "auto",
) -> Tuple[str, Literal["dna", "protein"]]:
    """
    Normalize a sequence and resolve its type.

    Args:
        seq: Input sequence
        expected_type: Expected type or "auto" for auto-detection

    Returns:
        (normalized_sequence, resolved_type). With "auto", an ambiguous
        sequence resolves to "dna" when it is at least MIN_DNA_LEN long and
        its length is a multiple of 3, otherwise to "protein". With an
        explicit type, an ambiguous sequence is accepted as that type.

    Raises:
        SequenceValidationError: If the sequence is empty, contains
            characters outside VALID_CHARS, or is detected as a type other
            than the explicit expected_type

    Examples:
        >>> validate_and_normalize("atggcc", "auto")
        ('ATGGCC', 'dna')
        >>> validate_and_normalize("MKKGEL", "protein")
        ('MKKGEL', 'protein')
    """
    cleaned = _clean_sequence(seq)
    if not cleaned:
        raise SequenceValidationError("Empty sequence provided")

    unknown = sorted(set(cleaned) - VALID_CHARS)
    if unknown:
        raise SequenceValidationError(
            f"Invalid characters in sequence: {', '.join(unknown)}"
        )

    detected = detect_sequence_type(cleaned)

    if expected_type == "auto":
        if detected != "ambiguous":
            return cleaned, detected
        # Break the tie on shape: whole codons read as DNA, anything else
        # as protein.
        whole_codons = len(cleaned) >= MIN_DNA_LEN and len(cleaned) % 3 == 0
        return cleaned, ("dna" if whole_codons else "protein")

    # An ambiguous detection never contradicts an explicit expectation.
    if detected not in ("ambiguous", expected_type):
        preview = cleaned[:20]
        ellipsis = "..." if len(cleaned) > 20 else ""
        raise SequenceValidationError(
            f"Expected {expected_type} sequence but detected {detected}. "
            f"Sequence: {preview}{ellipsis}"
        )

    return cleaned, expected_type
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def validate_dna_sequence(seq: str) -> str:
    """
    Check that a sequence uses only DNA letters.

    Args:
        seq: DNA sequence

    Returns:
        The sequence upper-cased with whitespace removed

    Raises:
        SequenceValidationError: If any character is outside DNA_CHARS
    """
    cleaned = _clean_sequence(seq)
    offending = sorted(set(cleaned) - DNA_CHARS)
    if offending:
        raise SequenceValidationError(
            f"Invalid DNA characters: {', '.join(offending)}"
        )
    return cleaned
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def validate_protein_sequence(seq: str) -> str:
    """
    Check that a sequence uses only amino-acid letters or "*".

    Args:
        seq: Protein sequence

    Returns:
        The sequence upper-cased with whitespace removed

    Raises:
        SequenceValidationError: If any character is not one of the 20
            amino-acid letters or "*"
    """
    allowed = set("ACDEFGHIKLMNPQRSTVWY*")
    cleaned = _clean_sequence(seq)
    offending = sorted(set(cleaned) - allowed)
    if offending:
        raise SequenceValidationError(
            f"Invalid amino acid characters: {', '.join(offending)}"
        )
    return cleaned
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def validate_cds_output(input_protein: str, dna_sequence: str) -> dict[str, object]:
    """Strictly validate a generated CDS against the protein it should encode.

    Only hard failures are reported: a length that is not a multiple of 3,
    invalid codons (the first three are included in the message), a stop
    codon before the final position, and a translation that differs from the
    expected protein once trailing stops are stripped from both.

    Returns:
        ``{"passed": bool, "errors": list[str]}`` where ``passed`` is true
        only when ``errors`` is empty.
    """
    expected = _clean_sequence(input_protein).rstrip("*")
    seq = _clean_sequence(dna_sequence).replace("U", "T")
    errors: list[str] = []

    if len(seq) % 3:
        errors.append("length_not_divisible_by_3")

    bad_codons = detect_invalid_codons(seq)
    if bad_codons:
        errors.append(f"invalid_codons: {bad_codons[:3]}")

    protein = translate_dna(seq)
    # A stop is only acceptable as the very last residue.
    if any(residue == "*" for residue in protein[:-1]):
        errors.append("internal_stop_codon")

    observed = protein.rstrip("*")
    if observed != expected:
        errors.append(
            f"aa_mismatch: expected_len={len(expected)} observed_len={len(observed)}"
        )

    return {"passed": not errors, "errors": errors}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Structured candidate DNA validation for v3-alpha."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from factorforge.ml.metrics import (
|
|
8
|
+
amino_acid_identity,
|
|
9
|
+
calculate_first_region_gc,
|
|
10
|
+
calculate_gc,
|
|
11
|
+
calculate_gc_windows,
|
|
12
|
+
count_internal_stops,
|
|
13
|
+
detect_forbidden_motifs,
|
|
14
|
+
detect_homopolymers,
|
|
15
|
+
detect_invalid_codons,
|
|
16
|
+
detect_repeats,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Defaults for validate_candidate_sequence; caller-supplied keys override them.
DEFAULT_CONFIG: dict[str, Any] = {
    # Window GC values (percent) outside [low, high] count as outliers.
    "gc_window_low": 30.0,
    "gc_window_high": 70.0,
    # Sliding-window geometry passed to calculate_gc_windows.
    "gc_window_size": 60,
    "gc_window_step": 30,
    # Motifs to search for, and whether a hit is an error or only a warning.
    "forbidden_motifs": [],
    "fail_forbidden_motifs": False,
    # Run length passed to detect_homopolymers as max_run.
    "homopolymer_max_run": 6,
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def validate_candidate_sequence(
    input_protein: str,
    dna_sequence: str,
    config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Validate a candidate CDS and return a machine-readable report.

    Args:
        input_protein: Protein the CDS is expected to encode.
        dna_sequence: Candidate coding sequence; upper-cased, U mapped to T
            and whitespace removed before analysis.
        config: Optional overrides merged over ``DEFAULT_CONFIG``.

    Returns:
        A dict with ``"passed"`` (true when there are no errors), the
        computed metrics and counts, and the ``"warnings"`` and ``"errors"``
        message lists. Errors cover an empty sequence, a length not
        divisible by 3, identity below 1.0, invalid codons, internal stops
        and, when ``fail_forbidden_motifs`` is set, forbidden motifs.
        Warnings cover GC window outliers, forbidden motifs (when not set to
        fail), homopolymers and simple repeats.
    """
    cfg = {**DEFAULT_CONFIG, **(config or {})}
    seq = "".join(dna_sequence.upper().replace("U", "T").split())

    # --- Metrics -----------------------------------------------------------
    invalid_codons = detect_invalid_codons(seq)
    internal_stop_count = count_internal_stops(seq)
    identity = amino_acid_identity(input_protein, seq) if seq else 0.0
    gc = calculate_gc(seq)

    windows = calculate_gc_windows(
        seq,
        window_size=int(cfg["gc_window_size"]),
        step=int(cfg["gc_window_step"]),
    )
    window_values = [float(window["gc"]) for window in windows]
    gc_window_min = min(window_values, default=0.0)
    gc_window_max = max(window_values, default=0.0)
    low = float(cfg["gc_window_low"])
    high = float(cfg["gc_window_high"])
    gc_window_outlier_count = sum(
        value < low or value > high for value in window_values
    )

    first_region = calculate_first_region_gc(seq)
    forbidden = detect_forbidden_motifs(seq, list(cfg.get("forbidden_motifs", [])))
    homopolymers = detect_homopolymers(seq, max_run=int(cfg["homopolymer_max_run"]))
    repeats = detect_repeats(seq)

    # Forbidden motifs are an error or a warning depending on configuration.
    fail_on_forbidden = bool(cfg.get("fail_forbidden_motifs"))

    # --- Findings (message order is part of the output) --------------------
    error_checks = (
        (not seq, "empty sequence"),
        (len(seq) % 3 != 0, "sequence length is not divisible by 3"),
        (identity < 1.0, "amino acid identity is below 1.0"),
        (bool(invalid_codons), "invalid codons detected"),
        (bool(internal_stop_count), "internal stop codons detected"),
        (bool(forbidden) and fail_on_forbidden, "forbidden motifs detected"),
    )
    warning_checks = (
        (bool(gc_window_outlier_count), "local GC window outliers detected"),
        (bool(forbidden) and not fail_on_forbidden, "forbidden motifs detected"),
        (bool(homopolymers), "homopolymers detected"),
        (bool(repeats), "simple repeats detected"),
    )
    errors: list[str] = [message for failed, message in error_checks if failed]
    warnings: list[str] = [message for raised, message in warning_checks if raised]

    return {
        "passed": not errors,
        "amino_acid_identity": identity,
        "gc": gc,
        "gc_window_min": gc_window_min,
        "gc_window_max": gc_window_max,
        "gc_window_outlier_count": gc_window_outlier_count,
        "first_30nt_gc": first_region["first_30nt_gc"],
        "first_60nt_gc": first_region["first_60nt_gc"],
        "first_90nt_gc": first_region["first_90nt_gc"],
        "internal_stop_count": internal_stop_count,
        "invalid_codon_count": len(invalid_codons),
        "forbidden_motif_count": len(forbidden),
        "homopolymer_count": len(homopolymers),
        "repeat_count": len(repeats),
        "warnings": warnings,
        "errors": errors,
    }
|
|
104
|
+
|