factorforge-cds 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. factorforge/__init__.py +19 -0
  2. factorforge/__main__.py +8 -0
  3. factorforge/cli/__init__.py +5 -0
  4. factorforge/cli/legacy_cli.py +157 -0
  5. factorforge/cli/main.py +305 -0
  6. factorforge/core/interfaces/__init__.py +7 -0
  7. factorforge/core/interfaces/exporter.py +13 -0
  8. factorforge/core/interfaces/optimizer.py +85 -0
  9. factorforge/core/interfaces/validator.py +9 -0
  10. factorforge/database.py +150 -0
  11. factorforge/engines/__init__.py +60 -0
  12. factorforge/engines/ml/__init__.py +0 -0
  13. factorforge/engines/ml/plant_optimizer.py +325 -0
  14. factorforge/engines/registry.py +141 -0
  15. factorforge/engines/v1_archived/__init__.py +15 -0
  16. factorforge/engines/v2/__init__.py +13 -0
  17. factorforge/engines/v2/codon_table_builder.py +107 -0
  18. factorforge/engines/v2/construct_builder.py +403 -0
  19. factorforge/engines/v2/exporter.py +455 -0
  20. factorforge/engines/v2/optimizer.py +190 -0
  21. factorforge/engines/v2/pipeline.py +275 -0
  22. factorforge/engines/v2/rules/__init__.py +3 -0
  23. factorforge/engines/v2/rules/domesticator.py +403 -0
  24. factorforge/engines/v2/rules/reverse_translator.py +765 -0
  25. factorforge/engines/v2/rules/rule_engine.py +867 -0
  26. factorforge/engines/v2/scoring.py +232 -0
  27. factorforge/engines/v2/utils.py +231 -0
  28. factorforge/engines/v2/validator.py +383 -0
  29. factorforge/engines/v3/__init__.py +12 -0
  30. factorforge/engines/v3/explain.py +119 -0
  31. factorforge/engines/v3/inference/__init__.py +6 -0
  32. factorforge/engines/v3/inference/constrained_decoder.py +80 -0
  33. factorforge/engines/v3/inference/v2_adapter.py +72 -0
  34. factorforge/engines/v3/metrics.py +145 -0
  35. factorforge/engines/v3/modeling_bart_decoder.py +127 -0
  36. factorforge/engines/v3/pipeline.py +192 -0
  37. factorforge/engines/v3/synonym_mask.py +61 -0
  38. factorforge/engines/v3/tokenizer.py +192 -0
  39. factorforge/ml/__init__.py +33 -0
  40. factorforge/ml/feasibility.py +199 -0
  41. factorforge/ml/metrics.py +295 -0
  42. factorforge/utils/__init__.py +31 -0
  43. factorforge/utils/construct_id.py +8 -0
  44. factorforge/utils/exceptions.py +32 -0
  45. factorforge/utils/sequence_validator.py +189 -0
  46. factorforge/utils/validation.py +104 -0
  47. factorforge_cds-3.0.0.dist-info/METADATA +475 -0
  48. factorforge_cds-3.0.0.dist-info/RECORD +52 -0
  49. factorforge_cds-3.0.0.dist-info/WHEEL +5 -0
  50. factorforge_cds-3.0.0.dist-info/entry_points.txt +2 -0
  51. factorforge_cds-3.0.0.dist-info/licenses/LICENSE +201 -0
  52. factorforge_cds-3.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,867 @@
1
+ """
2
+ Rule Engine for FactorForge v2
3
+ Plant-aware rule engine - scanning + auto-fix (P0-3)
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ import re
11
+ from typing import Any
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ from factorforge.engines.v2.utils import (
16
+ build_aa_to_codons_map,
17
+ count_dinucleotides,
18
+ get_data_path,
19
+ )
20
+
21
+
22
class RuleEngine:
    """
    Plant-aware rule engine

    Features:
    - Detect and remove PolyA signals
    - Detect ARE (AU-rich elements)
    - Detect A/T runs, homopolymer runs and perfect repeats
    - Detect extreme GC regions
    - Detect potential splice sites
    - Detect CpG/TpA dinucleotide hotspots
    - Auto-fix via synonymous substitutions

    Note:
        Only scan_gc_extremes and scan_dinucleotides upper-case the sequence
        before scanning; every other scanner matches the sequence exactly as
        given, so lower-case input is not detected by them.
    """

    # Pattern definitions
    # PolyA signal patterns: Tier 1 (canonical & high-frequency) + Tier 2 (plant-functional)
    POLYA_PATTERNS = {
        # Tier 1 (canonical & high-frequency variants)
        "AATAAA": "canonical",
        "ATTAAA": "variant_1",
        "AGTAAA": "variant_2",
        # Tier 2 (lower-frequency but functional in plants)
        "AATACA": "variant_3",
        "AAGAAA": "variant_4",
        "AATGAA": "variant_5",
    }
    POLYA_TIER1_PATTERNS = {"AATAAA", "ATTAAA", "AGTAAA"}
    POLYA_TIER2_PATTERNS = {"AATACA", "AAGAAA", "AATGAA"}

    UNSTABLE_MOTIFS = {"ATTTA": "ARE (AU-rich element)", "WWWWWW": "W=A/T, 6+ in a row"}

    def __init__(self, codon_table: dict[str, Any] | None = None) -> None:
        """
        Args:
            codon_table: Codon table (loads the bundled
                "nbenthamiana_codons.json" from the data path if None).
                fix_violation reads it as
                ``codon_table["codons"][codon]["aa"]``.
        """
        if codon_table is None:
            # Use centralized data path management
            codon_table_path = get_data_path() / "nbenthamiana_codons.json"
            with open(codon_table_path, "r", encoding="utf-8") as f:
                codon_table = json.load(f)

        self.codon_table: dict[str, Any] = codon_table
        self.aa_to_codons: dict[str, list[str]] = self._build_aa_to_codons_map()

    def _build_aa_to_codons_map(self) -> dict[str, list[str]]:
        """Build amino-acid-to-codons map"""
        return build_aa_to_codons_map(self.codon_table)

    @staticmethod
    def _find_all(seq: str, pattern: str) -> list[int]:
        """Return the start index of every (possibly overlapping) occurrence of pattern."""
        starts: list[int] = []
        idx = seq.find(pattern)
        while idx != -1:
            starts.append(idx)
            # Advance by one base only, so overlapping occurrences are found too.
            idx = seq.find(pattern, idx + 1)
        return starts

    def scan_polya(self, seq: str, window: int = 30) -> list[dict[str, Any]]:
        """
        Detect PolyA signal family

        Emits one "polya_signal" entry per occurrence of each pattern in
        POLYA_PATTERNS, followed by one "multiple_polya" entry for every
        window start whose window fully contains two or more distinct
        patterns.

        Args:
            seq: DNA sequence
            window: Window size (bp). If it is < 1 or longer than the
                sequence, only the per-occurrence entries are returned.

        Returns:
            List of violations

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_polya("AATAAA")
            [{'type': 'polya_signal', 'pattern': 'AATAAA', ...}]
        """
        violations: list[dict[str, Any]] = []
        seq_len = len(seq)
        pattern_hits: dict[str, list[int]] = {}

        # Detect individual patterns
        for pattern, pattern_type in self.POLYA_PATTERNS.items():
            hits = self._find_all(seq, pattern)
            pattern_hits[pattern] = hits
            for idx in hits:
                violations.append(
                    {
                        "type": "polya_signal",
                        "pattern": pattern,
                        "pattern_type": pattern_type,
                        "position": idx,
                        # Slicing clamps the upper bound to the sequence end.
                        "context": seq[max(0, idx - 10) : idx + len(pattern) + 10],
                    }
                )

        if window < 1 or seq_len < window:
            return violations

        # Precompute per-pattern prefix sums for a fast "pattern exists in window".
        # Semantics match `pattern in window_seq`: count each pattern at most once/window.
        prefix_tables: list[tuple[int, list[int]]] = []
        for pattern, hits in pattern_hits.items():
            plen = len(pattern)
            if plen > window or not hits:
                continue
            prefix = [0] * (seq_len + 1)
            for idx in hits:
                prefix[idx + 1] = 1
            for i in range(1, seq_len + 1):
                prefix[i] += prefix[i - 1]
            prefix_tables.append((plen, prefix))

        # Add a warning for each window that holds 2+ distinct patterns
        for start in range(seq_len - window + 1):
            # A pattern fits in the window when it starts in [start, start + window - plen];
            # plen <= window is guaranteed by the filter above.
            count = sum(
                1
                for plen, prefix in prefix_tables
                if prefix[start + window - plen + 1] - prefix[start] > 0
            )
            if count >= 2:
                violations.append(
                    {
                        "type": "multiple_polya",
                        "position": start,
                        "window_size": window,
                        "count": count,
                        "context": seq[start : start + window],
                        "severity": "high",
                    }
                )

        return violations

    def scan_are(self, seq: str) -> list[dict[str, Any]]:
        """
        Detect ARE (AU-rich element) pattern

        Args:
            seq: DNA sequence

        Returns:
            List of violations, one per (possibly overlapping) "ATTTA" hit

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_are("ATTTA")
            [{'type': 'are_element', ...}]
        """
        return [
            {
                "type": "are_element",
                "pattern": "ATTTA",
                "position": idx,
                "context": seq[max(0, idx - 10) : idx + 15],
                "severity": "medium",
            }
            for idx in self._find_all(seq, "ATTTA")
        ]

    def scan_at_runs(self, seq: str, min_length: int = 6) -> list[dict[str, Any]]:
        """
        Detect A/T runs

        Args:
            seq: DNA sequence
            min_length: Minimum run length. Values < 1 yield no violations.

        Returns:
            List of violations; runs of 8+ bases are "high" severity,
            shorter ones "medium".

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_at_runs("AAAAAATTT", min_length=6)
            [{'type': 'at_run', ...}]
        """
        violations: list[dict[str, Any]] = []
        if min_length < 1:
            # A zero/negative bound would make the regex match empty strings.
            return violations

        pattern = r"[AT]{" + str(min_length) + r",}"
        for match in re.finditer(pattern, seq):
            run = match.group()
            violations.append(
                {
                    "type": "at_run",
                    "position": match.start(),
                    "length": len(run),
                    "sequence": run,
                    "context": seq[max(0, match.start() - 5) : match.end() + 5],
                    "severity": "medium" if len(run) < 8 else "high",
                }
            )

        return violations

    def scan_homopolymers(self, seq: str, min_length: int = 8) -> list[dict[str, Any]]:
        """
        Detect 8+ homopolymers (synthesis risk)

        Args:
            seq: DNA sequence
            min_length: Minimum run length. Values < 1 yield no violations.

        Returns:
            List of violations, grouped by base in the order A, T, G, C and
            by position within each base. Each entry covers one maximal run;
            runs of 10+ bases are "high" severity.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_homopolymers("AAAAAAAA", min_length=8)
            [{'type': 'homopolymer', ...}]
        """
        violations: list[dict[str, Any]] = []
        if min_length < 1:
            # An empty search pattern would never advance through the sequence.
            return violations

        for base in "ATGC":
            # Greedy matching yields each maximal run exactly once.
            for match in re.finditer(base + "{" + str(min_length) + ",}", seq):
                run_length = match.end() - match.start()
                violations.append(
                    {
                        "type": "homopolymer",
                        "base": base,
                        "position": match.start(),
                        "length": run_length,
                        "sequence": base * run_length,
                        "severity": "high" if run_length >= 10 else "medium",
                    }
                )

        return violations

    def scan_repeats(self, seq: str, min_length: int = 15) -> list[dict[str, Any]]:
        """
        Detect perfect repeats >= 15 bp (recombination risk)

        Every window of exactly min_length bases is indexed; a window that
        occurs at two or more start positions is reported once with all of
        its positions (occurrences may overlap).

        Args:
            seq: DNA sequence
            min_length: Repeat (window) length. Values < 1 yield no violations.

        Returns:
            List of violations in order of first occurrence; 3+ occurrences
            are "high" severity.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_repeats("ATGATGATGATGATG", min_length=3)
            [{'type': 'repeat', ...}]
        """
        if min_length < 1:
            # Empty fragments would "repeat" at every position.
            return []

        seen_fragments: dict[str, list[int]] = {}
        for i in range(len(seq) - min_length + 1):
            seen_fragments.setdefault(seq[i : i + min_length], []).append(i)

        # Report only fragments that appear 2+ times
        return [
            {
                "type": "repeat",
                "fragment": fragment,
                "length": len(fragment),
                "positions": positions,
                "count": len(positions),
                "severity": "high" if len(positions) > 2 else "medium",
            }
            for fragment, positions in seen_fragments.items()
            if len(positions) > 1
        ]

    def scan_gc_extremes(
        self,
        seq: str,
        window: int = 50,
        min_gc: float = 25,
        max_gc: float = 75,
    ) -> list[dict[str, Any]]:
        """
        Detect extreme GC regions

        Args:
            seq: DNA sequence (case-insensitive for this scanner)
            window: Window size (bp). If it is < 1 or longer than the
                sequence, nothing is reported.
            min_gc: Minimum GC% threshold
            max_gc: Maximum GC% threshold

        Returns:
            List of violations, one per window start whose GC% falls outside
            [min_gc, max_gc]. Severity is "high" below 20% or above 80%
            (fixed bounds, independent of min_gc/max_gc), else "medium".

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_gc_extremes("GGGGGG", window=3, max_gc=80)
            [{'type': 'gc_extreme', ...}]
        """
        violations: list[dict[str, Any]] = []
        seq_len = len(seq)
        if window < 1 or seq_len < window:
            return violations

        seq_upper = seq.upper()
        gc_count = sum(1 for b in seq_upper[:window] if b in "GC")

        for i in range(seq_len - window + 1):
            if i > 0:
                # Slide by one base: drop the base that left, add the one that entered.
                if seq_upper[i - 1] in "GC":
                    gc_count -= 1
                if seq_upper[i + window - 1] in "GC":
                    gc_count += 1

            gc = (gc_count / window) * 100.0
            if gc < min_gc or gc > max_gc:
                violations.append(
                    {
                        "type": "gc_extreme",
                        "position": i,
                        "window_size": window,
                        "gc": round(gc, 1),
                        "context": seq[i : i + window],
                        "severity": "high" if gc < 20 or gc > 80 else "medium",
                    }
                )

        return violations

    def scan_splice_sites(
        self,
        seq: str,
        min_distance: int = 20,
        max_distance: int = 200,
    ) -> list[dict[str, Any]]:
        """
        Detect potential splice-site-like patterns

        Scan GT...AG pattern (20-200 bp spacing by default)
        Plant consensus: GTRAG...YAG (Y=C/T)

        Args:
            seq: DNA sequence
            min_distance: Minimum donor-start to acceptor-start spacing (bp).
            max_distance: Maximum donor-start to acceptor-start spacing (bp).

        Returns:
            List of violations, one per donor/acceptor pair whose spacing is
            within [min_distance, max_distance], ordered by donor position
            and then acceptor position.

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_splice_sites("GTAAG" + "A" * 20 + "CAG")
            [{'type': 'potential_splice_site', ...}]
        """
        violations: list[dict[str, Any]] = []

        # Donor site: GT[AG]AG; acceptor site: [CT]AG
        donors = [(m.start(), m.group()) for m in re.finditer(r"GT[AG]AG", seq)]
        acceptors = [(m.start(), m.group()) for m in re.finditer(r"[CT]AG", seq)]

        # Both lists are ascending by position (finditer scans left to right),
        # so the first acceptor that is far enough only ever moves forward.
        first_candidate = 0
        total_acceptors = len(acceptors)
        for d_pos, d_seq in donors:
            while (
                first_candidate < total_acceptors
                and acceptors[first_candidate][0] - d_pos < min_distance
            ):
                first_candidate += 1

            j = first_candidate
            while j < total_acceptors and acceptors[j][0] - d_pos <= max_distance:
                a_pos, a_seq = acceptors[j]
                violations.append(
                    {
                        "type": "potential_splice_site",
                        "donor": {"pos": d_pos, "seq": d_seq},
                        "acceptor": {"pos": a_pos, "seq": a_seq},
                        "distance": a_pos - d_pos,
                        "severity": "low",
                        "warning": "Potential cryptic splice site",
                    }
                )
                j += 1

        return violations

    def scan_polya_positive(
        self,
        seq: str,
        required_patterns: set[str] | None = None,
    ) -> list[dict[str, Any]]:
        """
        Positive PolyA validation: check that a region CONTAINS a PolyA signal.

        Used for terminator/3'UTR regions where a PolyA signal must be present
        for proper mRNA polyadenylation.

        Args:
            seq: DNA sequence of the terminator/3'UTR region.
            required_patterns: Set of acceptable PolyA patterns.
                Defaults to Tier 1 patterns.

        Returns:
            List of violations (non-empty if PolyA signal is missing).
        """
        if required_patterns is None:
            required_patterns = self.POLYA_TIER1_PATTERNS

        if any(pattern in seq for pattern in required_patterns):
            return []  # At least one PolyA signal found

        return [
            {
                "type": "missing_polya_signal",
                "severity": "high",
                "message": "No PolyA signal found in terminator/3'UTR region.",
                "checked_patterns": sorted(required_patterns),
            }
        ]

    def fix_polya_iterative(
        self,
        seq: str,
        max_rounds: int = 10,
    ) -> dict[str, Any]:
        """
        Iteratively remove all PolyA signals from a CDS via synonymous substitutions.

        Fixing one PolyA violation can create another at a different codon boundary,
        so this method loops until no violations remain or max rounds are reached.
        Each round fixes only the first reported signal; if that one cannot be
        fixed the method stops immediately.

        Args:
            seq: DNA coding sequence (must be divisible by 3).
            max_rounds: Maximum number of fix-scan cycles.

        Returns:
            Dict with success, modified_seq, rounds, fixes_applied and
            remaining_violations.
        """
        current_seq = seq
        all_fixes: list[dict[str, Any]] = []

        def polya_signals(candidate: str) -> list[dict[str, Any]]:
            # window=0 skips the sliding-window "multiple_polya" pass,
            # whose results are not needed here.
            return [
                v for v in self.scan_polya(candidate, window=0) if v["type"] == "polya_signal"
            ]

        for round_num in range(1, max_rounds + 1):
            signal_violations = polya_signals(current_seq)
            if not signal_violations:
                return {
                    "success": True,
                    "modified_seq": current_seq,
                    "rounds": round_num - 1,
                    "fixes_applied": all_fixes,
                    "remaining_violations": 0,
                }

            # Try to fix the first violation
            target = signal_violations[0]
            fix_result = self.fix_violation(current_seq, target)
            if not fix_result["success"]:
                logger.debug(
                    "Could not fix PolyA at position %s in round %s",
                    target["position"],
                    round_num,
                )
                return {
                    "success": False,
                    "modified_seq": current_seq,
                    "rounds": round_num,
                    "fixes_applied": all_fixes,
                    "remaining_violations": len(signal_violations),
                }

            current_seq = fix_result["modified_seq"]
            all_fixes.extend(fix_result.get("changes", []))

        # Max rounds exhausted
        remaining = polya_signals(current_seq)
        return {
            "success": len(remaining) == 0,
            "modified_seq": current_seq,
            "rounds": max_rounds,
            "fixes_applied": all_fixes,
            "remaining_violations": len(remaining),
        }

    def scan_dinucleotides(
        self,
        seq: str,
        window: int = 50,
        cpg_threshold: float = 0.05,
        tpa_threshold: float = 0.05,
    ) -> list[dict[str, Any]]:
        """
        Detect CpG and TpA dinucleotide-dense regions in CDS.

        CpG dinucleotides trigger methylation-based gene silencing in plants.
        TpA (UpA in RNA) dinucleotides are associated with mRNA instability.

        Args:
            seq: DNA sequence (case-insensitive for this scanner).
            window: Sliding window size (bp). A sequence shorter than the
                window is scanned once as a single window of its own length.
            cpg_threshold: CpG density (count/window) above which a violation
                is reported. Default 0.05 = >1 CpG per 20 bp.
            tpa_threshold: TpA density threshold (same units).

        Returns:
            List of violation dicts with type, dinucleotide, position,
            window_size, count, density, and severity ("high" above twice
            the threshold, else "medium").

        Examples:
            >>> engine = RuleEngine()
            >>> engine.scan_dinucleotides("ACGACGACG" * 10)  # doctest: +SKIP
            [{'type': 'dinucleotide_hotspot', ...}]
        """
        violations: list[dict[str, Any]] = []
        seq_upper = seq.upper()
        seq_len = len(seq_upper)

        if window < 2 or seq_len < 2:
            return violations

        def report(position: int, span: int, cpg_count: int, tpa_count: int) -> None:
            # Emit CpG first, then TpA, for one window.
            for label, count, threshold in (
                ("CpG", cpg_count, cpg_threshold),
                ("TpA", tpa_count, tpa_threshold),
            ):
                density = count / span
                if density > threshold:
                    violations.append(
                        {
                            "type": "dinucleotide_hotspot",
                            "dinucleotide": label,
                            "position": position,
                            "window_size": span,
                            "count": count,
                            "density": round(density, 4),
                            "severity": "high" if density > threshold * 2 else "medium",
                        }
                    )

        if seq_len < window:
            # Scan the full sequence as a single window
            report(
                0,
                seq_len,
                count_dinucleotides(seq_upper, "CG"),
                count_dinucleotides(seq_upper, "TA"),
            )
            return violations

        # Rolling dinucleotide counts:
        # a window of `window` bases contains `window - 1` adjacent pairs.
        pair_total = seq_len - 1
        cpg_flags = [1 if seq_upper[i : i + 2] == "CG" else 0 for i in range(pair_total)]
        tpa_flags = [1 if seq_upper[i : i + 2] == "TA" else 0 for i in range(pair_total)]
        pairs_in_window = window - 1
        cpg_count = sum(cpg_flags[:pairs_in_window])
        tpa_count = sum(tpa_flags[:pairs_in_window])

        for start in range(seq_len - window + 1):
            if start > 0:
                # Shift by one base: remove pair at start-1, add pair at start+window-2
                entering = start + window - 2
                cpg_count += cpg_flags[entering] - cpg_flags[start - 1]
                tpa_count += tpa_flags[entering] - tpa_flags[start - 1]
            report(start, window, cpg_count, tpa_count)

        return violations

    def scan_all(
        self,
        seq: str,
        mode: str = "full",
        include: list[str] | None = None,
        exclude: list[str] | None = None,
    ) -> dict[str, list[dict[str, Any]]]:
        """
        Scan all rules

        Args:
            seq: DNA sequence
            mode: Scan mode. "full" runs all scanners; "fast" skips the
                "repeats" and "dinucleotides" scanners.
            include: Explicit scanner names to run. Overrides mode.
            exclude: Scanner names to exclude from the selected set.

        Returns:
            {
                "polya": [...],
                "are": [...],
                "at_runs": [...],
                "homopolymers": [...],
                "repeats": [...],
                "gc_extremes": [...],
                "splice_sites": [...],
                "dinucleotides": [...]
            }
            restricted to the selected scanners; each scanner is called with
            its default parameters.

        Raises:
            ValueError: If mode is unknown (and include is None) or a
                selected scanner name is unknown.

        Examples:
            >>> engine = RuleEngine()
            >>> result = engine.scan_all("ATG" * 10)
            >>> "polya" in result
            True
        """
        scanner_map = {
            "polya": self.scan_polya,
            "are": self.scan_are,
            "at_runs": self.scan_at_runs,
            "homopolymers": self.scan_homopolymers,
            "repeats": self.scan_repeats,
            "gc_extremes": self.scan_gc_extremes,
            "splice_sites": self.scan_splice_sites,
            "dinucleotides": self.scan_dinucleotides,
        }

        mode_name = mode.lower().strip()
        if include is not None:
            selected_names = [name.strip() for name in include if name.strip()]
        elif mode_name == "full":
            selected_names = list(scanner_map)
        elif mode_name == "fast":
            selected_names = [
                "polya",
                "are",
                "at_runs",
                "homopolymers",
                "gc_extremes",
                "splice_sites",
            ]
        else:
            raise ValueError(f"Unknown scan mode: {mode}. Supported: full, fast")

        if exclude:
            excluded = {name.strip() for name in exclude if name.strip()}
            selected_names = [name for name in selected_names if name not in excluded]

        unknown = sorted({name for name in selected_names if name not in scanner_map})
        if unknown:
            known = ", ".join(scanner_map)
            raise ValueError(f"Unknown scanners: {', '.join(unknown)}. Known scanners: {known}")

        return {name: scanner_map[name](seq) for name in selected_names}

    @staticmethod
    def _violation_persists(violation: dict[str, Any], region: str) -> bool:
        """Return True while the violation's pattern can still be found in region."""
        kind = violation["type"]
        if kind == "polya_signal":
            return violation["pattern"] in region
        if kind == "are_element":
            return "ATTTA" in region
        if kind == "at_run":
            # 6 mirrors scan_at_runs' default min_length.
            return re.search(r"[AT]{6,}", region) is not None
        if kind == "homopolymer":
            # 8 mirrors scan_homopolymers' default min_length.
            return (violation["base"] * 8) in region
        return True

    def fix_violation(self, seq: str, violation: dict[str, Any]) -> dict[str, Any]:
        """
        Fix violations via synonymous substitutions

        Tries, codon by codon over the codons overlapping the violation, each
        synonymous alternative and returns the first single-codon substitution
        after which the pattern no longer appears in the region spanning the
        violation plus 10 bp on each side.

        Supported violation types: "polya_signal", "are_element", "at_run",
        "homopolymer".

        Args:
            seq: DNA sequence (length must be divisible by 3; codons are read
                in frame from index 0)
            violation: Violation entry as produced by the matching scanner

        Returns:
            {
                "success": True/False,
                "modified_seq": "...",
                "changes": [{...}],
                "aa_preserved": True/False
            }
            Failure results additionally carry "error" (invalid input or
            unsupported type) or "reason" (no substitution worked).

        Raises:
            KeyError: If a supported violation lacks a key required for its
                type ("position", plus "pattern"/"length"/"base").

        Examples:
            >>> engine = RuleEngine()
            >>> v = {"type": "polya_signal", "pattern": "AATAAA", "position": 0}
            >>> isinstance(engine.fix_violation("AATAAA", v)["success"], bool)
            True
        """
        if len(seq) % 3 != 0:
            return {
                "success": False,
                "error": "Sequence length not divisible by 3",
                "modified_seq": seq,
                "changes": [],
                "aa_preserved": False,
            }

        # Resolve the type before touching "position": unsupported entries
        # (e.g. "repeat", "potential_splice_site") have no "position" key.
        pattern_type = violation["type"]
        if pattern_type == "polya_signal":
            pattern_len = len(violation["pattern"])
        elif pattern_type == "are_element":
            pattern_len = 5  # ATTTA
        elif pattern_type in ("at_run", "homopolymer"):
            pattern_len = violation["length"]
        else:
            # Other types not supported yet
            return {
                "success": False,
                "error": f"Unsupported violation type: {pattern_type}",
                "modified_seq": seq,
                "changes": [],
                "aa_preserved": False,
            }

        pos = violation["position"]

        # Codon range overlapping the violation pattern
        first_codon_idx = (pos // 3) * 3
        last_codon_idx = ((pos + pattern_len - 1) // 3) * 3
        region_start = max(0, pos - 10)
        region_end = pos + pattern_len + 10

        codons = self.codon_table["codons"]

        # Try synonymous substitutions per codon
        for codon_start in range(first_codon_idx, last_codon_idx + 1, 3):
            if codon_start + 3 > len(seq):
                continue

            original_codon = seq[codon_start : codon_start + 3]
            if original_codon not in codons:
                continue

            aa = codons[original_codon]["aa"]
            for alt_codon in self.aa_to_codons.get(aa, []):
                if alt_codon == original_codon:
                    continue

                candidate = seq[:codon_start] + alt_codon + seq[codon_start + 3 :]
                if self._violation_persists(violation, candidate[region_start:region_end]):
                    continue

                # Success
                return {
                    "success": True,
                    "modified_seq": candidate,
                    "changes": [
                        {
                            "pos": codon_start,
                            "original": original_codon,
                            "fixed": alt_codon,
                            "aa": aa,
                        }
                    ],
                    "aa_preserved": True,
                }

        # Failed to fix
        return {
            "success": False,
            "modified_seq": seq,
            "changes": [],
            "aa_preserved": True,
            "reason": "No synonymous codon available to remove violation",
        }
851
+
852
+
853
+ # --- Usage example ---
854
if __name__ == "__main__":
    # Demo run: scan a sample CDS with the default codon table and print
    # a short summary of what each scanner found.
    rule_engine = RuleEngine()

    # Test sequence (partial GFP)
    gfp_cds = "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA"

    print("=== Scanning for violations ===")
    scan_report = rule_engine.scan_all(gfp_cds)

    for scanner_name, found in scan_report.items():
        if not found:
            # Scanners with no findings are omitted from the summary.
            continue
        print(f"\n{scanner_name.upper()}: {len(found)} violations")
        # Show only the first 3
        for entry in found[:3]:
            print(f" - Position {entry.get('position', 'N/A')}: {entry.get('type', 'N/A')}")