factorforge-cds 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factorforge/__init__.py +19 -0
- factorforge/__main__.py +8 -0
- factorforge/cli/__init__.py +5 -0
- factorforge/cli/legacy_cli.py +157 -0
- factorforge/cli/main.py +305 -0
- factorforge/core/interfaces/__init__.py +7 -0
- factorforge/core/interfaces/exporter.py +13 -0
- factorforge/core/interfaces/optimizer.py +85 -0
- factorforge/core/interfaces/validator.py +9 -0
- factorforge/database.py +150 -0
- factorforge/engines/__init__.py +60 -0
- factorforge/engines/ml/__init__.py +0 -0
- factorforge/engines/ml/plant_optimizer.py +325 -0
- factorforge/engines/registry.py +141 -0
- factorforge/engines/v1_archived/__init__.py +15 -0
- factorforge/engines/v2/__init__.py +13 -0
- factorforge/engines/v2/codon_table_builder.py +107 -0
- factorforge/engines/v2/construct_builder.py +403 -0
- factorforge/engines/v2/exporter.py +455 -0
- factorforge/engines/v2/optimizer.py +190 -0
- factorforge/engines/v2/pipeline.py +275 -0
- factorforge/engines/v2/rules/__init__.py +3 -0
- factorforge/engines/v2/rules/domesticator.py +403 -0
- factorforge/engines/v2/rules/reverse_translator.py +765 -0
- factorforge/engines/v2/rules/rule_engine.py +867 -0
- factorforge/engines/v2/scoring.py +232 -0
- factorforge/engines/v2/utils.py +231 -0
- factorforge/engines/v2/validator.py +383 -0
- factorforge/engines/v3/__init__.py +12 -0
- factorforge/engines/v3/explain.py +119 -0
- factorforge/engines/v3/inference/__init__.py +6 -0
- factorforge/engines/v3/inference/constrained_decoder.py +80 -0
- factorforge/engines/v3/inference/v2_adapter.py +72 -0
- factorforge/engines/v3/metrics.py +145 -0
- factorforge/engines/v3/modeling_bart_decoder.py +127 -0
- factorforge/engines/v3/pipeline.py +192 -0
- factorforge/engines/v3/synonym_mask.py +61 -0
- factorforge/engines/v3/tokenizer.py +192 -0
- factorforge/ml/__init__.py +33 -0
- factorforge/ml/feasibility.py +199 -0
- factorforge/ml/metrics.py +295 -0
- factorforge/utils/__init__.py +31 -0
- factorforge/utils/construct_id.py +8 -0
- factorforge/utils/exceptions.py +32 -0
- factorforge/utils/sequence_validator.py +189 -0
- factorforge/utils/validation.py +104 -0
- factorforge_cds-3.0.0.dist-info/METADATA +475 -0
- factorforge_cds-3.0.0.dist-info/RECORD +52 -0
- factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
- factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
- factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
- factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,867 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Rule Engine for FactorForge v2
|
|
3
|
+
Plant-aware rule engine - scanning + auto-fix (P0-3)
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import re
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
from factorforge.engines.v2.utils import (
|
|
16
|
+
build_aa_to_codons_map,
|
|
17
|
+
count_dinucleotides,
|
|
18
|
+
get_data_path,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RuleEngine:
    """
    Plant-aware rule engine.

    Scans DNA sequences for motifs that the scanners below report as
    violations, and can try to remove some of them by replacing a codon
    with a synonymous one.

    Features:
        - Detect and remove PolyA signals
        - Detect ARE (AU-rich elements)
        - Detect repeats/homopolymer runs
        - Detect extreme GC regions
        - Detect potential splice sites
        - Detect CpG/TpA dinucleotide hotspots
        - Auto-fix via synonymous substitutions

    Notes:
        Only ``scan_gc_extremes`` and ``scan_dinucleotides`` upper-case the
        sequence before scanning. Every other scanner compares the sequence
        as given against upper-case motifs, so lower-case bases never match.
    """

    # Pattern definitions
    # PolyA signal patterns: Tier 1 (canonical & high-frequency) + Tier 2 (plant-functional)
    POLYA_PATTERNS = {
        # Tier 1 (canonical & high-frequency variants)
        "AATAAA": "canonical",
        "ATTAAA": "variant_1",
        "AGTAAA": "variant_2",
        # Tier 2 (lower-frequency but functional in plants)
        "AATACA": "variant_3",
        "AAGAAA": "variant_4",
        "AATGAA": "variant_5",
    }
    POLYA_TIER1_PATTERNS = {"AATAAA", "ATTAAA", "AGTAAA"}
    POLYA_TIER2_PATTERNS = {"AATACA", "AAGAAA", "AATGAA"}

    UNSTABLE_MOTIFS = {"ATTTA": "ARE (AU-rich element)", "WWWWWW": "W=A/T, 6+ in a row"}

    def __init__(self, codon_table: dict[str, Any] | None = None) -> None:
        """
        Create a rule engine bound to one codon table.

        Args:
            codon_table: Codon table. ``fix_violation`` reads it as
                ``codon_table["codons"][codon]["aa"]``. When None, the table
                is loaded from ``nbenthamiana_codons.json`` inside the
                directory returned by ``get_data_path()``.

        Raises:
            OSError: If the default codon table file cannot be opened.
            json.JSONDecodeError: If the default codon table is not valid JSON.
        """
        if codon_table is None:
            # Use centralized data path management
            codon_table_path = get_data_path() / "nbenthamiana_codons.json"
            with open(codon_table_path, "r", encoding="utf-8") as f:
                codon_table = json.load(f)

        self.codon_table: dict[str, Any] = codon_table
        self.aa_to_codons: dict[str, list[str]] = self._build_aa_to_codons_map()

    def _build_aa_to_codons_map(self) -> dict[str, list[str]]:
        """Build the amino-acid-to-codons map from ``self.codon_table``."""
        return build_aa_to_codons_map(self.codon_table)

    def scan_polya(self, seq: str, window: int = 30) -> list[dict[str, Any]]:
        """
        Detect PolyA signal family.

        Every (possibly overlapping) occurrence of a pattern in
        ``POLYA_PATTERNS`` yields one ``polya_signal`` violation. In addition,
        every window of ``window`` bp that fully contains two or more
        *distinct* patterns yields one ``multiple_polya`` violation.

        Args:
            seq: DNA sequence
            window: Window size (bp) for the multiple-signal check. The check
                is skipped when ``window < 1`` or the sequence is shorter
                than the window.

        Returns:
            List of violations: all ``polya_signal`` entries first (grouped by
            pattern in ``POLYA_PATTERNS`` order), then ``multiple_polya``
            entries by window start.

        Raises:
            None.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_polya("AATAAA")
            [{'type': 'polya_signal', 'pattern': 'AATAAA', ...}]
        """
        violations: list[dict[str, Any]] = []
        seq_len = len(seq)
        pattern_hits: dict[str, list[int]] = {}

        # Detect individual patterns (advance by one so overlaps are reported)
        for pattern, pattern_type in self.POLYA_PATTERNS.items():
            hits: list[int] = []
            idx = seq.find(pattern)
            while idx != -1:
                hits.append(idx)
                violations.append(
                    {
                        "type": "polya_signal",
                        "pattern": pattern,
                        "pattern_type": pattern_type,
                        "position": idx,
                        "context": seq[max(0, idx - 10) : min(seq_len, idx + len(pattern) + 10)],
                    }
                )
                idx = seq.find(pattern, idx + 1)
            pattern_hits[pattern] = hits

        if window < 1 or seq_len < window:
            return violations

        # Per-pattern prefix sums over hit start positions:
        # prefix[k] == number of hits starting at an index < k.
        # This answers "does the pattern start inside [a, b]?" in O(1), with
        # the same semantics as `pattern in window_seq` (each pattern counts
        # at most once per window).
        pattern_prefix: list[tuple[int, list[int]]] = []
        for pattern, hits in pattern_hits.items():
            plen = len(pattern)
            if plen > window or not hits:
                continue
            prefix = [0] * (seq_len + 1)
            for idx in hits:
                prefix[idx + 1] = 1
            for i in range(1, seq_len + 1):
                prefix[i] += prefix[i - 1]
            pattern_prefix.append((plen, prefix))

        # Add a warning for every window holding 2+ distinct patterns
        for i in range(seq_len - window + 1):
            count = 0
            for plen, prefix in pattern_prefix:
                # Last start index at which the pattern still fits in the window;
                # always >= i because patterns longer than the window were skipped.
                max_start = i + window - plen
                if prefix[max_start + 1] - prefix[i] > 0:
                    count += 1

            if count >= 2:
                violations.append(
                    {
                        "type": "multiple_polya",
                        "position": i,
                        "window_size": window,
                        "count": count,
                        "context": seq[i : i + window],
                        "severity": "high",
                    }
                )

        return violations

    def scan_are(self, seq: str) -> list[dict[str, Any]]:
        """
        Detect ARE (AU-rich element) pattern.

        Reports every (possibly overlapping) occurrence of ``ATTTA``.

        Args:
            seq: DNA sequence

        Returns:
            List of violations

        Raises:
            None.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_are("ATTTA")
            [{'type': 'are_element', ...}]
        """
        violations: list[dict[str, Any]] = []

        idx = seq.find("ATTTA")
        while idx != -1:
            violations.append(
                {
                    "type": "are_element",
                    "pattern": "ATTTA",
                    "position": idx,
                    # 10 bp either side of the 5 bp motif
                    "context": seq[max(0, idx - 10) : min(len(seq), idx + 15)],
                    "severity": "medium",
                }
            )
            idx = seq.find("ATTTA", idx + 1)

        return violations

    def scan_at_runs(self, seq: str, min_length: int = 6) -> list[dict[str, Any]]:
        """
        Detect A/T runs.

        Args:
            seq: DNA sequence
            min_length: Minimum run length to report. Values below 1 cannot
                describe a run, so they yield no violations.

        Returns:
            List of violations; runs shorter than 8 are "medium", longer
            ones "high".

        Raises:
            None.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_at_runs("AAAAAATTT", min_length=6)
            [{'type': 'at_run', ...}]
        """
        violations: list[dict[str, Any]] = []
        if min_length < 1:
            # "[AT]{0,}" would match the empty string at every position.
            return violations

        pattern = r"[AT]{" + str(min_length) + r",}"

        for match in re.finditer(pattern, seq):
            run = match.group()
            violations.append(
                {
                    "type": "at_run",
                    "position": match.start(),
                    "length": len(run),
                    "sequence": run,
                    "context": seq[max(0, match.start() - 5) : min(len(seq), match.end() + 5)],
                    "severity": "medium" if len(run) < 8 else "high",
                }
            )

        return violations

    def scan_homopolymers(self, seq: str, min_length: int = 8) -> list[dict[str, Any]]:
        """
        Detect 8+ homopolymers (synthesis risk).

        Args:
            seq: DNA sequence
            min_length: Minimum run length to report. Values below 1 yield no
                violations (an empty search pattern would never advance).

        Returns:
            List of violations, grouped by base in the order A, T, G, C. Each
            entry describes one maximal run; runs of 10+ are "high".

        Raises:
            None.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_homopolymers("AAAAAAAA", min_length=8)
            [{'type': 'homopolymer', ...}]
        """
        violations: list[dict[str, Any]] = []
        if min_length < 1:
            return violations

        for base in "ATGC":
            # Greedy quantifier => each match is one maximal run of `base`.
            for match in re.finditer(base + "{" + str(min_length) + ",}", seq):
                actual_length = match.end() - match.start()
                violations.append(
                    {
                        "type": "homopolymer",
                        "base": base,
                        "position": match.start(),
                        "length": actual_length,
                        "sequence": base * actual_length,
                        "severity": "high" if actual_length >= 10 else "medium",
                    }
                )

        return violations

    def scan_repeats(self, seq: str, min_length: int = 15) -> list[dict[str, Any]]:
        """
        Detect perfect repeats >= 15 bp (recombination risk).

        Every ``min_length``-mer that occurs at two or more start positions is
        reported once, with all of its start positions.

        Args:
            seq: DNA sequence
            min_length: Length of the fragments to compare. Values below 1
                yield no violations.

        Returns:
            List of violations in order of each fragment's first occurrence;
            fragments seen more than twice are "high".

        Raises:
            None.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_repeats("ATGATGATGATGATG", min_length=3)
            [{'type': 'repeat', ...}]
        """
        violations: list[dict[str, Any]] = []
        if min_length < 1:
            # An empty fragment would "repeat" at every position.
            return violations

        seen_fragments: dict[str, list[int]] = {}
        for i in range(len(seq) - min_length + 1):
            seen_fragments.setdefault(seq[i : i + min_length], []).append(i)

        # Report only fragments that appear 2+ times
        for fragment, positions in seen_fragments.items():
            if len(positions) > 1:
                violations.append(
                    {
                        "type": "repeat",
                        "fragment": fragment,
                        "length": len(fragment),
                        "positions": positions,
                        "count": len(positions),
                        "severity": "high" if len(positions) > 2 else "medium",
                    }
                )

        return violations

    def scan_gc_extremes(
        self,
        seq: str,
        window: int = 50,
        min_gc: float = 25,
        max_gc: float = 75,
    ) -> list[dict[str, Any]]:
        """
        Detect extreme GC regions.

        Slides a window over the upper-cased sequence and reports every
        window whose GC percentage is below ``min_gc`` or above ``max_gc``.

        Args:
            seq: DNA sequence
            window: Window size (bp). Nothing is reported when ``window < 1``
                or the sequence is shorter than the window.
            min_gc: Minimum GC% threshold
            max_gc: Maximum GC% threshold

        Returns:
            List of violations; windows below 20% or above 80% GC are "high".

        Raises:
            None.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_gc_extremes("GGGGGG", window=3, max_gc=80)
            [{'type': 'gc_extreme', ...}]
        """
        violations: list[dict[str, Any]] = []
        seq_len = len(seq)
        if window < 1 or seq_len < window:
            return violations

        seq_upper = seq.upper()
        gc_count = sum(1 for b in seq_upper[:window] if b == "G" or b == "C")

        for i in range(seq_len - window + 1):
            if i > 0:
                # Rolling update: drop the base that left, add the one that entered.
                left = seq_upper[i - 1]
                right = seq_upper[i + window - 1]
                if left == "G" or left == "C":
                    gc_count -= 1
                if right == "G" or right == "C":
                    gc_count += 1

            gc = (gc_count / window) * 100.0

            if gc < min_gc or gc > max_gc:
                violations.append(
                    {
                        "type": "gc_extreme",
                        "position": i,
                        "window_size": window,
                        "gc": round(gc, 1),
                        "context": seq[i : i + window],
                        "severity": "high" if gc < 20 or gc > 80 else "medium",
                    }
                )

        return violations

    def scan_splice_sites(self, seq: str) -> list[dict[str, Any]]:
        """
        Detect potential splice-site-like patterns.

        Scan GT...AG pattern (20-200 bp spacing)
        Plant consensus: GTRAG...YAG (Y=C/T)

        Every donor match (``GT[AG]AG``) is paired with every acceptor match
        (``[CT]AG``) whose start lies 20-200 bp downstream of the donor start.

        Args:
            seq: DNA sequence

        Returns:
            List of violations, ordered by donor position, then acceptor
            position.

        Raises:
            None.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_splice_sites("GTAAG" + "A" * 20 + "CAG")
            [{'type': 'potential_splice_site', ...}]
        """
        violations: list[dict[str, Any]] = []

        # Donor site: GT[AG]AG. Matched through a lookahead because two donors
        # can share their boundary "G" (e.g. "GTAAGTAAG"); a plain finditer
        # resumes after the first match and would miss the second.
        donor_pattern = r"(?=(GT[AG]AG))"
        # Acceptor site: [CT]AG (cannot overlap itself, so a plain scan suffices)
        acceptor_pattern = r"[CT]AG"

        donors = [(m.start(), m.group(1)) for m in re.finditer(donor_pattern, seq)]
        acceptors = [(m.start(), m.group()) for m in re.finditer(acceptor_pattern, seq)]

        # Check 20-200 bp spacing
        for d_pos, d_seq in donors:
            for a_pos, a_seq in acceptors:
                distance = a_pos - d_pos

                if 20 <= distance <= 200:
                    violations.append(
                        {
                            "type": "potential_splice_site",
                            "donor": {"pos": d_pos, "seq": d_seq},
                            "acceptor": {"pos": a_pos, "seq": a_seq},
                            "distance": distance,
                            "severity": "low",
                            "warning": "Potential cryptic splice site",
                        }
                    )

        return violations

    def scan_polya_positive(
        self,
        seq: str,
        required_patterns: set[str] | None = None,
    ) -> list[dict[str, Any]]:
        """
        Positive PolyA validation: check that a region CONTAINS a PolyA signal.

        Used for terminator/3'UTR regions where a PolyA signal must be present
        for proper mRNA polyadenylation.

        Args:
            seq: DNA sequence of the terminator/3'UTR region.
            required_patterns: Set of acceptable PolyA patterns.
                Defaults to Tier 1 patterns.

        Returns:
            List of violations (non-empty if PolyA signal is missing).
        """
        if required_patterns is None:
            required_patterns = self.POLYA_TIER1_PATTERNS

        if any(pattern in seq for pattern in required_patterns):
            return []  # At least one PolyA signal found

        return [
            {
                "type": "missing_polya_signal",
                "severity": "high",
                "message": "No PolyA signal found in terminator/3'UTR region.",
                "checked_patterns": sorted(required_patterns),
            }
        ]

    def fix_polya_iterative(
        self,
        seq: str,
        max_rounds: int = 10,
    ) -> dict[str, Any]:
        """
        Iteratively remove all PolyA signals from a CDS via synonymous substitutions.

        Fixing one PolyA violation can create another at a different codon boundary,
        so this method loops until no violations remain or max rounds are reached.
        Each round fixes only the first reported signal; if that one cannot be
        fixed the method stops immediately.

        Args:
            seq: DNA coding sequence (must be divisible by 3).
            max_rounds: Maximum number of fix-scan cycles.

        Returns:
            Dict with success, modified_seq, rounds, and fixes_applied.
            ``remaining_violations`` is added whenever the loop did not end
            on a clean scan.
        """
        current_seq = seq
        all_fixes: list[dict[str, Any]] = []

        for round_num in range(1, max_rounds + 1):
            # Only polya_signal entries are fixable (not multiple_polya warnings)
            signal_violations = [
                v for v in self.scan_polya(current_seq) if v["type"] == "polya_signal"
            ]

            if not signal_violations:
                return {
                    "success": True,
                    "modified_seq": current_seq,
                    "rounds": round_num - 1,
                    "fixes_applied": all_fixes,
                }

            # Try to fix the first violation
            target = signal_violations[0]
            fix_result = self.fix_violation(current_seq, target)
            if not fix_result["success"]:
                logger.debug(
                    "Could not fix PolyA at position %s in round %s",
                    target["position"],
                    round_num,
                )
                return {
                    "success": False,
                    "modified_seq": current_seq,
                    "rounds": round_num,
                    "fixes_applied": all_fixes,
                    "remaining_violations": len(signal_violations),
                }

            current_seq = fix_result["modified_seq"]
            all_fixes.extend(fix_result.get("changes", []))

        # Max rounds exhausted: the last fix may still have cleared everything
        remaining = [
            v for v in self.scan_polya(current_seq) if v["type"] == "polya_signal"
        ]
        return {
            "success": len(remaining) == 0,
            "modified_seq": current_seq,
            "rounds": max_rounds,
            "fixes_applied": all_fixes,
            "remaining_violations": len(remaining),
        }

    @staticmethod
    def _dinucleotide_hotspot(
        dinucleotide: str,
        position: int,
        window_size: int,
        count: int,
        threshold: float,
    ) -> dict[str, Any] | None:
        """Return a hotspot violation if count/window_size exceeds threshold, else None."""
        density = count / window_size
        if not density > threshold:
            return None
        return {
            "type": "dinucleotide_hotspot",
            "dinucleotide": dinucleotide,
            "position": position,
            "window_size": window_size,
            "count": count,
            "density": round(density, 4),
            "severity": "high" if density > threshold * 2 else "medium",
        }

    def scan_dinucleotides(
        self,
        seq: str,
        window: int = 50,
        cpg_threshold: float = 0.05,
        tpa_threshold: float = 0.05,
    ) -> list[dict[str, Any]]:
        """
        Detect CpG and TpA dinucleotide-dense regions in CDS.

        CpG dinucleotides trigger methylation-based gene silencing in plants.
        TpA (UpA in RNA) dinucleotides are associated with mRNA instability.

        The sequence is upper-cased first. A sequence shorter than ``window``
        is evaluated once as a single window whose size is the sequence
        length; otherwise every window position is evaluated.

        Args:
            seq: DNA sequence.
            window: Sliding window size (bp). Nothing is reported when
                ``window < 2`` or the sequence has fewer than 2 bases.
            cpg_threshold: CpG density (count/window) above which a violation
                is reported. Default 0.05 = >1 CpG per 20 bp.
            tpa_threshold: TpA density threshold (same units).

        Returns:
            List of violation dicts with type, dinucleotide, position,
            density, and severity. For each window the CpG entry (if any)
            precedes the TpA entry.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_dinucleotides("ACGACGACG" * 10)  # doctest: +SKIP
            [{'type': 'dinucleotide_hotspot', ...}]
        """
        violations: list[dict[str, Any]] = []
        seq_upper = seq.upper()
        seq_len = len(seq_upper)

        if window < 2 or seq_len < 2:
            return violations

        if seq_len < window:
            # Scan the full sequence as a single window
            whole_sequence = (
                ("CpG", count_dinucleotides(seq_upper, "CG"), cpg_threshold),
                ("TpA", count_dinucleotides(seq_upper, "TA"), tpa_threshold),
            )
            for label, count, threshold in whole_sequence:
                hit = self._dinucleotide_hotspot(label, 0, seq_len, count, threshold)
                if hit is not None:
                    violations.append(hit)
            return violations

        # Rolling dinucleotide counts:
        # a window of `window` bases contains `window - 1` adjacent pairs.
        pair_len = seq_len - 1
        cpg_flags = [1 if seq_upper[i : i + 2] == "CG" else 0 for i in range(pair_len)]
        tpa_flags = [1 if seq_upper[i : i + 2] == "TA" else 0 for i in range(pair_len)]
        pairs_in_window = window - 1
        cpg_count = sum(cpg_flags[:pairs_in_window])
        tpa_count = sum(tpa_flags[:pairs_in_window])

        for i in range(seq_len - window + 1):
            if i > 0:
                # Shift by one base: remove pair at i-1, add pair at i+window-2
                add_idx = i + window - 2
                cpg_count += cpg_flags[add_idx] - cpg_flags[i - 1]
                tpa_count += tpa_flags[add_idx] - tpa_flags[i - 1]

            for label, count, threshold in (
                ("CpG", cpg_count, cpg_threshold),
                ("TpA", tpa_count, tpa_threshold),
            ):
                hit = self._dinucleotide_hotspot(label, i, window, count, threshold)
                if hit is not None:
                    violations.append(hit)

        return violations

    def scan_all(
        self,
        seq: str,
        mode: str = "full",
        include: list[str] | None = None,
        exclude: list[str] | None = None,
    ) -> dict[str, list[dict[str, Any]]]:
        """
        Scan all rules.

        Each selected scanner is called with its default parameters.

        Args:
            seq: DNA sequence
            mode: Scan mode. "full" runs all scanners; "fast" skips the
                "repeats" and "dinucleotides" scanners.
            include: Explicit scanner names to run. Overrides mode.
            exclude: Scanner names to exclude from the selected set.

        Returns:
            {
                "polya": [...],
                "are": [...],
                "at_runs": [...],
                "homopolymers": [...],
                "repeats": [...],
                "gc_extremes": [...],
                "splice_sites": [...],
                "dinucleotides": [...]
            }
            (only the selected scanners appear as keys)

        Raises:
            ValueError: If ``mode`` is unknown (and ``include`` is None) or a
                selected scanner name is unknown.

        Examples:
            >>> engine = RuleEngine()
            >>> result = engine.scan_all("ATG" * 10)
            >>> "polya" in result
            True
        """
        scanner_map = {
            "polya": self.scan_polya,
            "are": self.scan_are,
            "at_runs": self.scan_at_runs,
            "homopolymers": self.scan_homopolymers,
            "repeats": self.scan_repeats,
            "gc_extremes": self.scan_gc_extremes,
            "splice_sites": self.scan_splice_sites,
            "dinucleotides": self.scan_dinucleotides,
        }

        mode_name = mode.lower().strip()
        if include is not None:
            selected_names = [name.strip() for name in include if name.strip()]
        elif mode_name == "full":
            selected_names = list(scanner_map.keys())
        elif mode_name == "fast":
            selected_names = [
                "polya",
                "are",
                "at_runs",
                "homopolymers",
                "gc_extremes",
                "splice_sites",
            ]
        else:
            raise ValueError(f"Unknown scan mode: {mode}. Supported: full, fast")

        if exclude:
            excluded = {name.strip() for name in exclude if name.strip()}
            selected_names = [name for name in selected_names if name not in excluded]

        unknown = sorted({name for name in selected_names if name not in scanner_map})
        if unknown:
            known = ", ".join(scanner_map.keys())
            raise ValueError(f"Unknown scanners: {', '.join(unknown)}. Known scanners: {known}")

        return {name: scanner_map[name](seq) for name in selected_names}  # type: ignore[operator]

    def fix_violation(self, seq: str, violation: dict[str, Any]) -> dict[str, Any]:
        """
        Fix violations via synonymous substitutions.

        Walks the codons overlapping the violation and returns the first
        single-codon synonymous swap after which the motif is no longer
        found within 10 bp either side of the original hit.

        Supported violation types: ``polya_signal``, ``are_element``,
        ``at_run`` and ``homopolymer``.

        Args:
            seq: DNA sequence (length must be divisible by 3; codons are read
                in frame from index 0)
            violation: Violation entry as produced by the scanners

        Returns:
            {
                "success": True/False,
                "modified_seq": "...",
                "changes": [{...}],
                "aa_preserved": True/False
            }
            Failures additionally carry ``error`` (bad input / unsupported
            type) or ``reason`` (no suitable codon); in both cases
            ``modified_seq`` is the unchanged input and ``changes`` is empty.

        Raises:
            KeyError: If a violation of a supported type lacks a key that type
                requires ("position", "pattern", "length" or "base").

        Examples:
            >>> engine = RuleEngine()
            >>> v = {"type": "polya_signal", "pattern": "AATAAA", "position": 0}
            >>> "success" in engine.fix_violation("AATAAA", v)
            True
        """

        def rejected(message: str) -> dict[str, Any]:
            # Same shape as every other result so callers can read
            # "modified_seq"/"changes" unconditionally.
            return {
                "success": False,
                "error": message,
                "modified_seq": seq,
                "changes": [],
                "aa_preserved": False,
            }

        if len(seq) % 3 != 0:
            return rejected("Sequence length not divisible by 3")

        # Resolve the type BEFORE touching "position": repeat and splice-site
        # violations have no "position" key and must be reported as
        # unsupported rather than raising KeyError.
        pattern_type = violation.get("type")
        if pattern_type == "polya_signal":
            pattern_len = len(violation["pattern"])
        elif pattern_type == "are_element":
            pattern_len = 5  # ATTTA
        elif pattern_type in ("at_run", "homopolymer"):
            pattern_len = violation["length"]
        else:
            # Other types not supported yet
            return rejected(f"Unsupported violation type: {pattern_type}")

        pos = violation["position"]

        # NOTE(review): A/T runs and homopolymers are checked against fixed
        # lengths (6 and 8, the scanners' defaults) because the violation does
        # not record the threshold it was detected with. A violation found
        # with a smaller min_length may therefore be reported as fixed while
        # the run is still present.
        at_run_re = re.compile(r"[AT]{6,}")
        if pattern_type == "polya_signal":
            motif = violation["pattern"]
        elif pattern_type == "are_element":
            motif = "ATTTA"
        elif pattern_type == "homopolymer":
            motif = violation["base"] * 8
        else:
            motif = None  # at_run: checked with at_run_re

        # Codon range overlapping the violation
        first_codon_idx = (pos // 3) * 3
        last_codon_idx = ((pos + pattern_len - 1) // 3) * 3
        codons = self.codon_table["codons"]

        for codon_start in range(first_codon_idx, last_codon_idx + 1, 3):
            if codon_start + 3 > len(seq):
                continue

            original_codon = seq[codon_start : codon_start + 3]

            # Skip codons the table does not know (cannot determine the amino acid)
            if original_codon not in codons:
                continue

            aa = codons[original_codon]["aa"]

            for alt_codon in self.aa_to_codons.get(aa, []):
                if alt_codon == original_codon:
                    continue

                candidate = seq[:codon_start] + alt_codon + seq[codon_start + 3 :]

                # Check if violation pattern is removed from the local context
                test_region = candidate[
                    max(0, pos - 10) : min(len(candidate), pos + pattern_len + 10)
                ]
                if motif is None:
                    pattern_removed = not at_run_re.search(test_region)
                else:
                    pattern_removed = motif not in test_region

                if pattern_removed:
                    return {
                        "success": True,
                        "modified_seq": candidate,
                        "changes": [
                            {
                                "pos": codon_start,
                                "original": original_codon,
                                "fixed": alt_codon,
                                "aa": aa,
                            }
                        ],
                        "aa_preserved": True,
                    }

        # Failed to fix
        return {
            "success": False,
            "modified_seq": seq,
            "changes": [],
            "aa_preserved": True,
            "reason": "No synonymous codon available to remove violation",
        }
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
# --- Usage example ---
|
|
854
|
+
if __name__ == "__main__":
    # Demo run: scan a partial GFP coding sequence with every rule and print
    # a short summary (at most three hits) for each rule that fired.
    demo_engine = RuleEngine()

    # Test sequence (partial GFP)
    gfp_fragment = "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA"

    print("=== Scanning for violations ===")
    scan_report = demo_engine.scan_all(gfp_fragment)

    for rule_name in scan_report:
        found = scan_report[rule_name]
        if not found:
            # Rules without hits are not mentioned in the summary.
            continue
        print(f"\n{rule_name.upper()}: {len(found)} violations")
        # Show only the first 3
        for entry in found[:3]:
            print(f" - Position {entry.get('position', 'N/A')}: {entry.get('type', 'N/A')}")
|