celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/dna.py ADDED
@@ -0,0 +1,575 @@
1
+ """
2
+ DNA biology utilities for sequence analysis and planning.
3
+
4
+ Includes sequence transforms, ORF detection, primer suggestions, codon optimization,
5
+ restriction analysis, and assembly helper templates.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ import re
12
+
13
+ from ct.tools import registry
14
+
15
+
16
# Characters accepted by _validate_dna: the four bases plus N.
_DNA_ALPHABET = set("ACGTN")
# Codons that end an ORF in find_orfs; the same three map to "*" in _CODON_TABLE.
_STOP_CODONS = {"TAA", "TAG", "TGA"}
# Codon -> single-letter amino acid; "*" marks a stop codon.
# Codons missing from this table (e.g. containing N) are rendered as "X" by _translate_dna.
_CODON_TABLE = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*", "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W", "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}
28
+
29
# Preferred codons (lightweight, pragmatic defaults).
# Keyed by host name, then by single-letter amino acid; exactly one codon per
# residue. There is no entry for the stop symbol "*".
_PREF_CODONS = {
    "human": {
        "A": "GCC", "R": "CGC", "N": "AAC", "D": "GAC", "C": "TGC", "Q": "CAG", "E": "GAG", "G": "GGC",
        "H": "CAC", "I": "ATC", "L": "CTG", "K": "AAG", "M": "ATG", "F": "TTC", "P": "CCC", "S": "AGC",
        "T": "ACC", "W": "TGG", "Y": "TAC", "V": "GTG",
    },
    "ecoli": {
        "A": "GCG", "R": "CGT", "N": "AAC", "D": "GAT", "C": "TGC", "Q": "CAG", "E": "GAA", "G": "GGC",
        "H": "CAT", "I": "ATT", "L": "CTG", "K": "AAA", "M": "ATG", "F": "TTT", "P": "CCG", "S": "TCT",
        "T": "ACC", "W": "TGG", "Y": "TAT", "V": "GTG",
    },
}
42
+
43
# Enzyme name -> recognition sequence that is searched for in the input DNA.
# Names are matched case-sensitively by the tools that consult this table.
_ENZYME_MOTIFS = {
    "EcoRI": "GAATTC",
    "BamHI": "GGATCC",
    "HindIII": "AAGCTT",
    "NotI": "GCGGCCGC",
    "XhoI": "CTCGAG",
    "NheI": "GCTAGC",
    "BsaI": "GGTCTC",
    "BsmBI": "CGTCTC",
}
53
+
54
+
55
@dataclass
class _PrimerCandidate:
    """Candidate primer together with the metrics used to rank it."""

    seq: str  # primer sequence
    tm: float  # melting temperature as computed by _wallace_tm
    gc: float  # GC percentage (0-100) as computed by _gc_content
60
+
61
+
62
def _clean_seq(seq: str) -> str:
    """Return *seq* upper-cased with all whitespace removed.

    Falsy input (``None``, ``""``) is treated as an empty string.
    """
    text = str(seq or "")
    shouted = text.upper()
    return re.sub(r"\s+", "", shouted)
64
+
65
+
66
def _validate_dna(seq: str) -> tuple[str, str | None]:
    """Normalize *seq* and check it against the accepted DNA alphabet.

    Returns:
        ``(cleaned, error)`` where ``cleaned`` is the whitespace-free,
        upper-cased sequence and ``error`` is ``None`` when it is usable,
        otherwise a short human-readable message.
    """
    cleaned = _clean_seq(seq)
    if cleaned == "":
        return cleaned, "sequence is required"
    # Any character outside the alphabet invalidates the whole sequence.
    if set(cleaned) - _DNA_ALPHABET:
        return cleaned, "sequence contains non-DNA characters"
    return cleaned, None
73
+
74
+
75
def _reverse_complement(seq: str) -> str:
    """Return the reverse complement of *seq*.

    Only upper-case A/C/G/T/N are complemented (N maps to N); any other
    character is carried over unchanged.
    """
    complement = str.maketrans("ACGTN", "TGCAN")
    backwards = seq[::-1]
    return backwards.translate(complement)
78
+
79
+
80
def _wallace_tm(seq: str) -> float:
    """Estimate melting temperature as 2 per A/T plus 4 per G/C.

    Counting is case-insensitive; characters other than A, C, G and T
    contribute nothing.
    """
    upper = seq.upper()
    weak_pairs = sum(upper.count(base) for base in ("A", "T"))
    strong_pairs = sum(upper.count(base) for base in ("G", "C"))
    return 2.0 * weak_pairs + 4.0 * strong_pairs
85
+
86
+
87
def _gc_content(seq: str) -> float:
    """Return the percentage (0-100) of G and C characters in *seq*.

    Counting is case-insensitive. An empty sequence yields ``0.0``.
    """
    upper = seq.upper()
    if not upper:
        return 0.0
    gc_total = upper.count("G") + upper.count("C")
    return 100.0 * gc_total / len(upper)
91
+
92
+
93
def _translate_dna(seq: str, frame: int = 1, to_stop: bool = False) -> str:
    """Translate *seq* codon by codon using ``_CODON_TABLE``.

    Args:
        seq: DNA sequence; codons are looked up exactly as given.
        frame: 1-based reading frame; values outside 1-3 are clamped.
        to_stop: When true, stop before the first stop codon (the ``*`` is
            not emitted). When false, stop codons appear as ``*``.

    Returns:
        The protein string. Codons not present in the table become ``X``;
        a trailing partial codon is ignored.
    """
    first = max(0, min(2, frame - 1))
    residues = []
    for pos in range(first, len(seq) - 2, 3):
        amino = _CODON_TABLE.get(seq[pos : pos + 3], "X")
        if to_stop and amino == "*":
            break
        residues.append(amino)
    return "".join(residues)
103
+
104
+
105
def _find_sites(seq: str, motif: str) -> list[int]:
    """Return the 1-based start position of every occurrence of *motif*.

    Overlapping occurrences are all reported, in ascending order.
    """
    hits = []
    cursor = seq.find(motif)
    while cursor != -1:
        hits.append(cursor + 1)  # report 1-based coordinates
        # Resume one base further on so overlapping matches are found too.
        cursor = seq.find(motif, cursor + 1)
    return hits
115
+
116
+
117
def _pick_primer_candidates(seq: str, min_len: int, max_len: int, tm_target: float) -> list[_PrimerCandidate]:
    """Score prefixes of *seq* as primer candidates.

    Every prefix whose length lies in ``[min_len, max_len]`` (and fits in
    *seq*) is scored; those with 35-70 % GC are kept. The result is ordered
    by closeness of Tm to *tm_target*, shorter prefixes first on ties.
    """
    longest = min(max_len, len(seq))
    scored = []
    for size in range(min_len, longest + 1):
        prefix = seq[:size]
        scored.append(_PrimerCandidate(seq=prefix, tm=_wallace_tm(prefix), gc=_gc_content(prefix)))
    acceptable = [cand for cand in scored if 35 <= cand.gc <= 70]
    # sorted() is stable, so candidates with equal Tm distance keep length order.
    return sorted(acceptable, key=lambda cand: abs(cand.tm - tm_target))
129
+
130
+
131
@registry.register(
    name="dna.reverse_complement",
    description="Compute reverse complement of a DNA sequence",
    category="dna",
    parameters={"sequence": "DNA sequence"},
    usage_guide="Use for strand conversion and antisense oligo planning.",
)
def reverse_complement(sequence: str, **kwargs) -> dict:
    """Tool entry point: reverse-complement a validated DNA sequence.

    Returns a dict with ``summary``, the cleaned ``sequence`` and its
    ``reverse_complement``; on invalid input, a dict with ``summary`` and
    ``error`` set to ``"invalid_sequence"``.
    """
    cleaned, problem = _validate_dna(sequence)
    if problem:
        return {"summary": problem, "error": "invalid_sequence"}
    return {
        "summary": f"Reverse complement computed ({len(cleaned)} bp).",
        "sequence": cleaned,
        "reverse_complement": _reverse_complement(cleaned),
    }
144
+
145
+
146
@registry.register(
    name="dna.translate",
    description="Translate DNA sequence to amino-acid sequence",
    category="dna",
    parameters={
        "sequence": "DNA sequence",
        "frame": "Reading frame 1-3 (default 1)",
        "to_stop": "Stop translation at first stop codon (default false)",
    },
    usage_guide="Use to inspect coding potential and validate ORF translations.",
)
def translate(sequence: str, frame: int = 1, to_stop: bool = False, **kwargs) -> dict:
    """Tool entry point: translate a validated DNA sequence in one frame.

    ``frame`` is coerced with ``int()`` and clamped to 1-3. Returns a dict
    with ``summary``, ``frame``, ``protein`` and ``protein_length``; on
    invalid input, a dict with ``summary`` and ``error``.
    """
    cleaned, problem = _validate_dna(sequence)
    if problem:
        return {"summary": problem, "error": "invalid_sequence"}

    frame_no = max(1, min(3, int(frame)))
    protein = _translate_dna(cleaned, frame=frame_no, to_stop=bool(to_stop))
    n_residues = len(protein)
    return {
        "summary": f"Translated DNA in frame {frame_no}: {n_residues} aa.",
        "frame": frame_no,
        "protein": protein,
        "protein_length": n_residues,
    }
169
+
170
+
171
@registry.register(
    name="dna.find_orfs",
    description="Find open reading frames in a DNA sequence",
    category="dna",
    parameters={
        "sequence": "DNA sequence",
        "min_aa_length": "Minimum amino-acid length (default 30)",
        "include_reverse": "Also scan reverse complement (default false)",
    },
    usage_guide="Use to identify candidate coding regions before cloning or expression.",
)
def find_orfs(sequence: str, min_aa_length: int = 30, include_reverse: bool = False, **kwargs) -> dict:
    """Tool entry point: list ATG-to-stop open reading frames.

    Each of the three frames is scanned once. Every in-frame ATG is paired
    with the first downstream in-frame stop codon, so an ATG inside a longer
    ORF is reported as its own (shorter) ORF ending at the same stop. ATGs
    with no downstream stop are not reported.

    Args:
        sequence: DNA sequence (whitespace and case are normalized).
        min_aa_length: Minimum number of translated residues, excluding the
            stop codon. Values below 5 are raised to 5.
        include_reverse: Also scan the reverse complement.

    Returns:
        Dict with ``summary``, ``orfs`` (longest first) and ``count``. Each
        ORF carries ``strand``, ``frame`` (1-3), 1-based inclusive ``start``
        and ``end`` (the stop codon is inside the span), ``length_nt``
        (including the stop codon), ``length_aa`` (equal to
        ``len(protein)``) and ``protein``. For ``strand == "reverse"`` the
        coordinates refer to the reverse-complemented sequence, not the
        input. On invalid input, a dict with ``summary`` and ``error``.
    """
    seq, err = _validate_dna(sequence)
    if err:
        return {"summary": err, "error": "invalid_sequence"}

    min_aa = max(5, int(min_aa_length))
    scans = [("forward", seq)]
    if include_reverse:
        scans.append(("reverse", _reverse_complement(seq)))

    orfs = []
    for strand, dna in scans:
        for frame in range(3):
            # Single pass per frame: remember every ATG seen since the last
            # stop codon and close them all when the next stop appears.
            open_starts = []
            for pos in range(frame, len(dna) - 2, 3):
                codon = dna[pos : pos + 3]
                if codon == "ATG":
                    open_starts.append(pos)
                    continue
                if codon not in _STOP_CODONS:
                    continue
                stop_end = pos + 3
                for begin in open_starts:
                    # Residues translated before the stop codon; the stop
                    # itself is not an amino acid and is not counted.
                    aa_len = (pos - begin) // 3
                    if aa_len < min_aa:
                        continue
                    nt_seq = dna[begin:stop_end]
                    orfs.append(
                        {
                            "strand": strand,
                            "frame": frame + 1,
                            "start": begin + 1,
                            "end": stop_end,
                            "length_nt": len(nt_seq),
                            "length_aa": aa_len,
                            "protein": _translate_dna(nt_seq, frame=1, to_stop=True),
                        }
                    )
                open_starts = []

    orfs.sort(key=lambda x: x["length_aa"], reverse=True)
    return {
        "summary": f"Found {len(orfs)} ORFs with length >= {min_aa} aa.",
        "orfs": orfs,
        "count": len(orfs),
    }
229
+
230
+
231
@registry.register(
    name="dna.codon_optimize",
    description="Codon-optimize a protein sequence for a host species",
    category="dna",
    parameters={
        "protein_sequence": "Amino-acid sequence (single-letter, may include *)",
        "species": "Target host codon table: human or ecoli",
    },
    usage_guide="Use for expression construct design in common hosts.",
)
def codon_optimize(protein_sequence: str, species: str = "human", **kwargs) -> dict:
    """Tool entry point: back-translate a protein with one preferred codon per residue.

    Whitespace and every ``*`` are removed from the protein before
    back-translation, so the returned DNA carries no stop codon.

    Returns a dict with ``summary``, ``species``, ``protein_length``,
    ``optimized_dna`` and ``gc_content`` (percent, 2 decimals); on failure,
    a dict with ``summary`` and ``error`` (``missing_protein``,
    ``invalid_species`` or ``invalid_protein``).
    """
    compact = re.sub(r"\s+", "", str(protein_sequence or "").upper())
    residues = compact.replace("*", "")
    if not residues:
        return {"summary": "protein_sequence is required.", "error": "missing_protein"}

    host = str(species or "human").strip().lower()
    codon_for = _PREF_CODONS.get(host)
    if codon_for is None:
        return {"summary": "Unsupported species. Use human or ecoli.", "error": "invalid_species"}

    unknown = sorted(set(residues) - set(codon_for))
    if unknown:
        return {"summary": f"Invalid amino acids: {', '.join(unknown)}", "error": "invalid_protein"}

    optimized = "".join([codon_for[residue] for residue in residues])
    return {
        "summary": f"Codon-optimized sequence generated for {host} ({len(residues)} aa).",
        "species": host,
        "protein_length": len(residues),
        "optimized_dna": optimized,
        "gc_content": round(_gc_content(optimized), 2),
    }
264
+
265
+
266
@registry.register(
    name="dna.restriction_sites",
    description="Find common restriction enzyme sites in a DNA sequence",
    category="dna",
    parameters={
        "sequence": "DNA sequence",
        "enzymes": "Optional list or comma-separated enzyme names",
    },
    usage_guide="Use to choose cloning strategy and verify unwanted cut sites.",
)
def restriction_sites(sequence: str, enzymes: list[str] | str | None = None, **kwargs) -> dict:
    """Tool entry point: locate recognition sites for the selected enzymes.

    Both orientations are searched. For a motif that is not its own reverse
    complement (BsaI, BsmBI in the built-in table), a site on the opposite
    strand shows up in the given sequence as the reverse-complemented motif;
    those occurrences are included so that unwanted sites are not missed.

    Args:
        sequence: DNA sequence (whitespace and case are normalized).
        enzymes: ``None`` for every built-in enzyme, a comma-separated
            string, or an iterable of names. Names are case-sensitive.

    Returns:
        Dict with ``summary``, ``sequence_length`` and ``results``. Each
        result has ``enzyme``, ``motif``, ``n_sites``, ``positions``
        (ascending 1-based start of every site on the given strand's
        coordinates, both orientations) and ``reverse_strand_positions``
        (the subset found via the reverse-complemented motif; always empty
        for palindromic motifs). On failure, a dict with ``summary`` and
        ``error`` (``invalid_sequence`` or ``invalid_enzyme``).
    """
    seq, err = _validate_dna(sequence)
    if err:
        return {"summary": err, "error": "invalid_sequence"}

    if enzymes is None:
        selected = list(_ENZYME_MOTIFS)
    elif isinstance(enzymes, str):
        selected = [x.strip() for x in enzymes.split(",") if x.strip()]
    else:
        selected = [str(x).strip() for x in enzymes if str(x).strip()]

    unknown = [e for e in selected if e not in _ENZYME_MOTIFS]
    if unknown:
        return {
            "summary": f"Unknown enzymes: {', '.join(unknown)}",
            "error": "invalid_enzyme",
            "available_enzymes": sorted(_ENZYME_MOTIFS),
        }

    matches = []
    for enzyme in selected:
        motif = _ENZYME_MOTIFS[enzyme]
        forward_hits = _find_sites(seq, motif)
        flipped = _reverse_complement(motif)
        # A palindromic motif reads the same on both strands, so searching
        # its reverse complement would only duplicate the forward hits.
        reverse_hits = [] if flipped == motif else _find_sites(seq, flipped)
        positions = sorted(forward_hits + reverse_hits)
        matches.append(
            {
                "enzyme": enzyme,
                "motif": motif,
                "n_sites": len(positions),
                "positions": positions,
                "reverse_strand_positions": reverse_hits,
            }
        )

    total = sum(m["n_sites"] for m in matches)
    return {
        "summary": f"Restriction scan complete: {total} total sites across {len(selected)} enzymes.",
        "sequence_length": len(seq),
        "results": matches,
    }
315
+
316
+
317
@registry.register(
    name="dna.virtual_digest",
    description="Perform an in-silico digest and return fragment sizes",
    category="dna",
    parameters={
        "sequence": "DNA sequence",
        "enzymes": "List or comma-separated enzymes (supported: EcoRI,BamHI,HindIII,NotI,XhoI,NheI,BsaI,BsmBI)",
        "circular": "Treat sequence as circular (default false)",
    },
    usage_guide="Use to predict gel band patterns before running wet-lab digests.",
)
def virtual_digest(sequence: str, enzymes: list[str] | str, circular: bool = False, **kwargs) -> dict:
    """Tool entry point: split a sequence at recognition sites and size the pieces.

    Cut coordinates are the site start positions reported by
    ``restriction_sites``; no enzyme-specific cleavage offset is applied, so
    fragment sizes are approximate to within a few bases.

    Args:
        sequence: DNA sequence (whitespace and case are normalized).
        enzymes: Enzyme names, as accepted by ``restriction_sites``.
        circular: When true, the fragment after the last cut wraps around to
            the first cut instead of ending at the sequence end.

    Returns:
        Dict with ``summary``, ``n_fragments``, ``fragments_bp`` (largest
        first), ``cut_positions`` (1-based, ascending, de-duplicated) and
        ``circular``. Errors from validation or ``restriction_sites`` are
        returned unchanged.
    """
    seq, err = _validate_dna(sequence)
    if err:
        return {"summary": err, "error": "invalid_sequence"}

    site_result = restriction_sites(seq, enzymes=enzymes)
    if site_result.get("error"):
        return site_result

    cut_positions = sorted({p for item in site_result["results"] for p in item["positions"]})
    if not cut_positions:
        return {
            "summary": "No cut sites found.",
            "fragments_bp": [len(seq)],
            "n_fragments": 1,
            "cut_positions": [],
            # Same key as the normal return so callers see one schema.
            "circular": bool(circular),
        }

    cuts = [p - 1 for p in cut_positions]  # 0-based, already ascending
    if circular:
        fragments = []
        for idx, cut in enumerate(cuts):
            nxt = cuts[(idx + 1) % len(cuts)]
            gap = nxt - cut
            # A non-positive gap means the fragment runs through the origin
            # (always the case for the last cut, or when there is only one).
            fragments.append(gap if gap > 0 else gap + len(seq))
    else:
        points = [0, *cuts, len(seq)]
        # A cut at the very first base would yield an empty leading piece.
        fragments = [right - left for left, right in zip(points, points[1:]) if right > left]

    fragments.sort(reverse=True)
    return {
        "summary": f"Virtual digest produced {len(fragments)} fragments.",
        "n_fragments": len(fragments),
        "fragments_bp": fragments,
        "cut_positions": cut_positions,
        "circular": bool(circular),
    }
368
+
369
+
370
@registry.register(
    name="dna.primer_design",
    description="Design simple PCR primers around a target region",
    category="dna",
    parameters={
        "sequence": "Template DNA sequence",
        "target_start": "Target region start (1-based, optional)",
        "target_end": "Target region end (1-based, optional)",
        "primer_min_len": "Minimum primer length (default 18)",
        "primer_max_len": "Maximum primer length (default 24)",
        "tm_target": "Target primer Tm in C (default 60)",
    },
    usage_guide="Use as a fast first-pass primer suggestion before detailed wet-lab validation.",
)
def primer_design(
    sequence: str,
    target_start: int | None = None,
    target_end: int | None = None,
    primer_min_len: int = 18,
    primer_max_len: int = 24,
    tm_target: float = 60.0,
    **kwargs,
) -> dict:
    """Tool entry point: suggest one forward/reverse primer pair around a target.

    The forward primer is taken from the start of a window that begins up to
    80 bases upstream of the target; the reverse primer is the reverse
    complement of the end of a window that extends up to 80 bases downstream.
    Within each window, candidate lengths are scored by
    ``_pick_primer_candidates`` and the one closest to ``tm_target`` wins.

    Args:
        sequence: Template DNA (whitespace and case are normalized).
        target_start: 1-based target start; falsy means 1.
        target_end: 1-based target end; falsy means the template length.
        primer_min_len: Minimum primer length; raised to at least 16.
        primer_max_len: Maximum primer length; capped at 35 and never below
            the effective minimum.
        tm_target: Desired primer Tm in degrees C.

    Returns:
        Dict with ``summary``, ``forward_primer`` and ``reverse_primer``
        (each with ``sequence``, ``length``, ``tm_c``, ``gc_percent`` and the
        1-based inclusive ``start``/``end`` it occupies on the template),
        ``target_region``, ``estimated_amplicon_bp`` and ``note``. On
        failure, a dict with ``summary`` and ``error`` (``invalid_sequence``
        or ``design_failed``).
    """
    seq, err = _validate_dna(sequence)
    if err:
        return {"summary": err, "error": "invalid_sequence"}

    min_len = max(16, int(primer_min_len))
    max_len = max(min_len, min(35, int(primer_max_len)))
    tm_target = float(tm_target)

    start = int(target_start) if target_start else 1
    end = int(target_end) if target_end else len(seq)
    start = max(1, min(start, len(seq)))
    end = max(start, min(end, len(seq)))

    # 0-based, half-open window bounds on the template.
    left_from = max(0, start - 1 - 80)
    right_to = min(len(seq), end + 80)
    left_window = seq[left_from : start - 1 + max_len]
    right_window = seq[max(0, end - max_len) : right_to]

    left_cands = _pick_primer_candidates(left_window, min_len, max_len, tm_target)
    right_rev = _reverse_complement(right_window)
    right_cands = _pick_primer_candidates(right_rev, min_len, max_len, tm_target)

    if not left_cands or not right_cands:
        return {"summary": "Unable to design primers in target windows.", "error": "design_failed"}

    fwd = left_cands[0]
    rev = right_cands[0]

    # Candidates are prefixes of their window, so the forward primer starts at
    # the left window's first base and the reverse primer (a prefix of the
    # reverse-complemented window) ends at the right window's last base. The
    # amplicon is therefore exactly the span between those two template ends.
    amplicon_bp = max(1, right_to - left_from)

    return {
        "summary": f"Designed primer pair (F {len(fwd.seq)} nt, R {len(rev.seq)} nt) for ~{amplicon_bp} bp amplicon.",
        "forward_primer": {
            "sequence": fwd.seq,
            "length": len(fwd.seq),
            "tm_c": round(fwd.tm, 2),
            "gc_percent": round(fwd.gc, 2),
            "start": left_from + 1,
            "end": left_from + len(fwd.seq),
        },
        "reverse_primer": {
            "sequence": rev.seq,
            "length": len(rev.seq),
            "tm_c": round(rev.tm, 2),
            "gc_percent": round(rev.gc, 2),
            "start": right_to - len(rev.seq) + 1,
            "end": right_to,
        },
        "target_region": {"start": start, "end": end},
        "estimated_amplicon_bp": amplicon_bp,
        "note": "Heuristic first-pass design; verify specificity with BLAST/in-silico PCR.",
    }
440
+
441
+
442
@registry.register(
    name="dna.pcr_protocol",
    description="Generate a PCR thermal cycling protocol",
    category="dna",
    parameters={
        "product_size_bp": "Expected amplicon size",
        "primer_tm": "Primer melting temperature in C",
        "polymerase": "Polymerase name (default Q5)",
        "cycles": "Number of PCR cycles (default 30)",
    },
    usage_guide="Use to quickly draft PCR conditions aligned with primer and amplicon properties.",
)
def pcr_protocol(
    product_size_bp: int = 1000,
    primer_tm: float = 60.0,
    polymerase: str = "Q5",
    cycles: int = 30,
    **kwargs,
) -> dict:
    """Tool entry point: draft a three-step PCR cycling programme.

    Derived values:
        * product size is floored at 50 bp;
        * cycle count is clamped to 15-40;
        * annealing temperature is ``primer_tm - 3`` clamped to 45-72 C;
        * extension time is 30 s per kb, with a 10 s minimum.

    ``polymerase`` is only echoed back; it does not alter the numbers.

    Returns a dict with ``summary``, ``polymerase``, ``cycles``,
    ``product_size_bp``, ``annealing_temp_c``, ``extension_time_s``,
    ``protocol`` (ordered list of step dicts) and ``note``.
    """
    size_bp = max(50, int(product_size_bp))
    tm = float(primer_tm)
    cycles = max(15, min(40, int(cycles)))

    anneal_c = round(max(45.0, min(72.0, tm - 3.0)), 1)
    extension_s = max(10, int(round(size_bp / 1000.0 * 30)))

    # (step label, temperature in C, duration in s), in run order.
    schedule = (
        ("Initial denaturation", 98, 30),
        (f"{cycles} cycles: denaturation", 98, 10),
        (f"{cycles} cycles: annealing", anneal_c, 20),
        (f"{cycles} cycles: extension", 72, extension_s),
        ("Final extension", 72, 120),
        ("Hold", 4, 0),
    )
    protocol = [
        {"step": label, "temperature_c": temperature, "time_s": seconds}
        for label, temperature, seconds in schedule
    ]

    return {
        "summary": f"PCR protocol generated for {size_bp} bp product using {polymerase}.",
        "polymerase": polymerase,
        "cycles": cycles,
        "product_size_bp": size_bp,
        "annealing_temp_c": anneal_c,
        "extension_time_s": extension_s,
        "protocol": protocol,
        "note": "General starting conditions; optimize empirically for template/primers/polymerase buffer.",
    }
487
+
488
+
489
@registry.register(
    name="dna.gibson_design",
    description="Suggest overlap sequences for Gibson assembly fragments",
    category="dna",
    parameters={
        "fragments": "Ordered list of DNA fragment sequences",
        "overlap_length": "Desired overlap length (default 25)",
    },
    usage_guide="Use to draft overlap strategy for multi-fragment Gibson assembly.",
)
def gibson_design(fragments: list[str], overlap_length: int = 25, **kwargs) -> dict:
    """Tool entry point: report the junction-flanking ends of adjacent fragments.

    For each consecutive pair, the last ``overlap_length`` bases of the
    upstream fragment and the first ``overlap_length`` bases of the
    downstream fragment are returned with their mean GC percentage.
    ``overlap_length`` is clamped to 15-60.

    Returns a dict with ``summary``, ``overlap_length``, ``joins`` and
    ``note``; on failure, a dict with ``summary`` and ``error``
    (``invalid_fragments``, ``invalid_sequence`` or ``fragment_too_short``).
    """
    if not (isinstance(fragments, list) and len(fragments) >= 2):
        return {"summary": "Provide at least two fragment sequences.", "error": "invalid_fragments"}

    ov = max(15, min(60, int(overlap_length)))

    cleaned = []
    for number, raw in enumerate(fragments, start=1):
        seq, err = _validate_dna(raw)
        if err:
            return {"summary": f"Fragment {number}: {err}", "error": "invalid_sequence"}
        if len(seq) < ov:
            return {"summary": f"Fragment {number} shorter than overlap_length {ov}.", "error": "fragment_too_short"}
        cleaned.append(seq)

    joins = []
    for number, (upstream, downstream) in enumerate(zip(cleaned, cleaned[1:]), start=1):
        tail = upstream[-ov:]
        head = downstream[:ov]
        mean_gc = (_gc_content(tail) + _gc_content(head)) / 2.0
        joins.append(
            {
                "join": f"{number}->{number + 1}",
                "left_tail": tail,
                "right_head": head,
                "gc_percent": round(mean_gc, 2),
            }
        )

    return {
        "summary": f"Generated Gibson overlap plan for {len(cleaned)} fragments ({len(joins)} joins).",
        "overlap_length": ov,
        "joins": joins,
        "note": "Ensure overlaps are unique and avoid strong secondary structures.",
    }
532
+
533
+
534
@registry.register(
    name="dna.golden_gate_design",
    description="Suggest Golden Gate-compatible overhang plan",
    category="dna",
    parameters={
        "parts": "Ordered list of DNA part names or labels",
        "enzyme": "Type IIS enzyme (BsaI or BsmBI; default BsaI)",
    },
    usage_guide="Use to draft overhang strategy for modular Golden Gate assemblies.",
)
def golden_gate_design(parts: list[str], enzyme: str = "BsaI", **kwargs) -> dict:
    """Tool entry point: assign a 4-nt overhang to each junction of an ordered part list.

    Overhangs are handed out in a fixed order from a built-in pool, one per
    junction between consecutive parts, so the plan is deterministic. Part
    sequences are not inspected; only the labels are echoed.

    Args:
        parts: Ordered list of at least two part labels.
        enzyme: ``"BsaI"`` or ``"BsmBI"`` (case-sensitive; falsy means BsaI).

    Returns:
        Dict with ``summary``, ``enzyme``, ``enzyme_motif``, ``joins`` (each
        with ``from_part``, ``to_part`` and ``overhang``) and ``note``. On
        failure, a dict with ``summary`` and ``error`` (``invalid_parts``,
        ``invalid_enzyme`` or ``too_many_parts``).
    """
    if not isinstance(parts, list) or len(parts) < 2:
        return {"summary": "Provide at least two part labels.", "error": "invalid_parts"}

    enzyme_norm = str(enzyme or "BsaI").strip()
    if enzyme_norm not in {"BsaI", "BsmBI"}:
        return {"summary": "Unsupported enzyme. Use BsaI or BsmBI.", "error": "invalid_enzyme"}

    # Simple deterministic non-palindromic overhang set. No entry equals its
    # own reverse complement (a palindromic overhang can ligate to itself),
    # and no entry is the reverse complement of another entry. The last slot
    # previously held the palindrome TCGA.
    overhang_pool = ["AATG", "GCTT", "CGAA", "TGCC", "ACTA", "GGCT", "TTAC", "CAGG", "AGTC", "ATCC"]
    n_joins = len(parts) - 1
    if n_joins > len(overhang_pool):
        return {"summary": "Too many parts for built-in overhang pool.", "error": "too_many_parts"}

    joins = [
        {
            "from_part": str(upstream),
            "to_part": str(downstream),
            "overhang": overhang,
        }
        for upstream, downstream, overhang in zip(parts, parts[1:], overhang_pool)
    ]

    motif = _ENZYME_MOTIFS[enzyme_norm]
    return {
        "summary": f"Golden Gate plan generated for {len(parts)} parts using {enzyme_norm}.",
        "enzyme": enzyme_norm,
        "enzyme_motif": motif,
        "joins": joins,
        "note": "Validate overhang uniqueness and absence of internal Type IIS sites before synthesis.",
    }