seqsplit-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seqsplit/__init__.py ADDED
@@ -0,0 +1,10 @@
1
"""Public package interface for seqsplit."""

from importlib.metadata import version, PackageNotFoundError

try:
    # The distribution name must match the name the wheel is published
    # under ("seqsplit", per the built wheel filename). The previous
    # lookup used "seq-splitter", which does not normalize to the same
    # distribution, so it always raised PackageNotFoundError and the
    # installed package reported the dev placeholder version below.
    __version__ = version("seqsplit")
except PackageNotFoundError:
    # Not installed as a distribution (e.g. running from a source checkout).
    __version__ = "0.1.0-dev"

from .api import split_fna

__all__ = ["split_fna", "__version__"]
seqsplit/api.py ADDED
@@ -0,0 +1,133 @@
1
+ """
2
+ Python API for seqsplit.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+
11
+ from .tables import LigationTable
12
+ from .sequence import get_all_possible_idx_matrices
13
+ from .search import run_beam_search
14
+
15
+
16
def split_fna(
    fna_path: str,
    table: LigationTable,
    *,
    max_oligo_len: int = 250,
    region_len: int = 20,
    overhang_len: int = 4,
    beam_width: int = 100,
    mode: str = "greedy",
    rollout_samples: int = 100,
    heuristic_percentile: float = 100.0,
    seed: int = 42,
    verbose: bool = True,
) -> list[dict[str, Any]]:
    """
    Split all sequences in a FASTA/FNA file at optimal ligation sites.

    Parameters
    ----------
    fna_path : str
        Path to input FASTA/FNA file.
    table : LigationTable
        Loaded ligation frequency table. Obtain via
        :func:`~seqsplit.tables.load_ligation_table` or
        :func:`~seqsplit.tables.load_builtin_table`.
    max_oligo_len : int
        Maximum allowed oligo/fragment length in nt.
    region_len : int
        Width of each candidate overhang region.
        ``region_len - overhang_len + 1`` candidate overhangs are evaluated
        per region.
    overhang_len : int
        Overhang length in nt. Must match the ligation table.
    beam_width : int
        Number of partial paths kept alive in the beam.
    mode : {'greedy', 'rollout'}
        'greedy' scores by current-prefix fidelity only (fast, recommended).
        'rollout' uses random completions as a lookahead heuristic.
    rollout_samples : int
        Number of random rollouts used to evaluate each candidate (rollout
        mode only; ignored in greedy mode).
    heuristic_percentile : float
        Percentile of rollout scores used as the heuristic to guide the search.
        100 → max (default), 98 → more pessimistic heuristic, etc.
    seed : int
        NumPy random seed for reproducibility.
    verbose : bool
        Print per-sequence progress.

    Returns
    -------
    list of dicts with keys:
        * ``header`` – header string of sequence from FASTA/FNA
        * ``seq_len`` – sequence length in nt
        * ``num_fragments`` – number of fragments produced
        * ``overhangs`` – list of overhang DNA strings
        * ``oh_row_indices`` – list of ligation-table row indices
        * ``oh_start_coords`` – list of 0-indexed overhang start positions
        * ``log_fidelity`` – best log-fidelity
        * ``fidelity`` – best fidelity (exp of log_fidelity)
        * ``runtime_s`` – wall-clock time for this sequence

    Notes
    -----
    If the beam search finds no valid split for a sequence, that
    sequence's dict contains only ``header`` and an ``error`` string
    instead of the keys above; check for ``"error"`` before using the
    other fields.

    Raises
    ------
    ValueError
        If ``mode`` is not one of {'greedy', 'rollout'}, or if
        ``overhang_len`` does not match ``table.overhang_len``.
    """
    if mode not in ("greedy", "rollout"):
        raise ValueError(f"mode must be 'greedy' or 'rollout', got '{mode}'.")
    if overhang_len != table.overhang_len:
        raise ValueError(
            f"overhang_len={overhang_len} does not match the loaded table "
            f"(table.overhang_len={table.overhang_len})."
        )

    # Candidates per region; greedy mode disables rollouts entirely.
    branching_factor = region_len - overhang_len + 1
    n_rollouts = rollout_samples if mode == "rollout" else 0
    # Single RNG shared across all sequences so the whole run is reproducible.
    rng = np.random.default_rng(seed)

    # One (candidate matrix, region start coords, length) triple per sequence.
    all_matrices = get_all_possible_idx_matrices(
        fna_path,
        table.kmer_enc_to_row_idx,
        oh_region_len=region_len,
        overhang_len=overhang_len,
        max_oligo_len=max_oligo_len,
    )

    results = []
    for header, (mtrx, region_starts, seq_len) in all_matrices.items():
        oh_list, log_fid, runtime, oh_coords = run_beam_search(
            possible_idx_matrix=mtrx,
            branching_factor=branching_factor,
            total_num_regions=mtrx.shape[0],
            beam_width=beam_width,
            rollout_samples=n_rollouts,
            region_starts=region_starts,
            rng=rng,
            table=table,
            overhang_len=overhang_len,
            heuristic_percentile=heuristic_percentile,
            verbose=verbose,
        )

        if oh_list is None:
            # No valid split found: record a minimal error entry and move on.
            results.append({"header": header, "error": "no solution found"})
            continue

        # Map ligation-table row indices back to overhang DNA strings.
        oh_strings = [table.row_overhangs[idx] for idx in oh_list]
        results.append(
            {
                "header": header,
                "seq_len": seq_len,
                "num_fragments": len(oh_strings) + 1,
                "overhangs": oh_strings,
                "oh_row_indices": oh_list,
                "oh_start_coords": oh_coords,
                "log_fidelity": float(log_fid),
                "fidelity": float(np.exp(log_fid)),
                "runtime_s": float(runtime),
            }
        )

    return results
seqsplit/cli.py ADDED
@@ -0,0 +1,370 @@
1
+ """
2
+ Command-line interface for seqsplit.
3
+
4
+ Usage examples
5
+ --------------
6
+ # Run overhang search in greedy mode with a built-in ligation table
7
+ seqsplit sequences.fna --table potapov2018_T4_18h_25C
8
+
9
+ # Rollout mode with a custom table
10
+ seqsplit sequences.fna --table-path my_table.csv --mode rollout --rollout-samples 100
11
+
12
+ # More pessimistic rollout heuristic (98th percentile instead of max) – generally not recommended
13
+ seqsplit sequences.fna --table potapov2018_T4_18h_25C \\
14
+ --mode rollout --rollout-samples 100 --heuristic-percentile 98
15
+
16
+ # Allow for larger oligos and consider wider candidate overhang regions
17
+ seqsplit sequences.fna --table potapov2018_T4_18h_25C \\
18
+ --max-oligo-len 300 --region-len 25 --overhang-len 4
19
+
20
+ # List bundled tables
21
+ seqsplit --list-tables
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import os
28
+ import sys
29
+ from typing import Optional
30
+
31
+ import numpy as np
32
+ import pandas as pd
33
+
34
+ from .tables import load_ligation_table, load_builtin_table, list_builtin_tables
35
+ from .sequence import get_all_possible_idx_matrices
36
+ from .search import run_beam_search
37
+
38
+
39
+ def _build_parser() -> argparse.ArgumentParser:
40
+ p = argparse.ArgumentParser(
41
+ prog="seqsplit",
42
+ description=(
43
+ "Split DNA sequences at optimal ligation sites for synthesis."
44
+ "Outputs a CSV with selected overhangs and split coordinates."
45
+ ),
46
+ formatter_class=argparse.RawDescriptionHelpFormatter,
47
+ epilog=__doc__,
48
+ )
49
+
50
+ p.add_argument(
51
+ "fna",
52
+ nargs="?",
53
+ metavar="FNA_FILE",
54
+ help="Path to input DNA sequences in FASTA / FNA format.",
55
+ )
56
+
57
+ # ---- Ligation table (mutually exclusive) ----
58
+ tbl = p.add_mutually_exclusive_group()
59
+ tbl.add_argument(
60
+ "--table",
61
+ metavar="NAME",
62
+ help=(
63
+ "Name of a bundled ligation table. "
64
+ "Run --list-tables to see available names."
65
+ ),
66
+ )
67
+ tbl.add_argument(
68
+ "--table-path",
69
+ metavar="CSV",
70
+ help=(
71
+ "Path to a custom ligation frequency CSV. "
72
+ "See docs/ligation_table_format.md for the required format."
73
+ ),
74
+ )
75
+
76
+ # ---- Core parameters ----
77
+ p.add_argument(
78
+ "--max-oligo-len",
79
+ type=int,
80
+ default=250,
81
+ metavar="NT",
82
+ help="Maximum allowed oligo length in nt (default: %(default)s).",
83
+ )
84
+ p.add_argument(
85
+ "--region-len",
86
+ type=int,
87
+ default=20,
88
+ metavar="NT",
89
+ help=(
90
+ "Width of each candidate overhang region in nt (default: %(default)s). "
91
+ "There are (region-len − overhang-len + 1) candidate overhangs per region."
92
+ ),
93
+ )
94
+ p.add_argument(
95
+ "--overhang-len",
96
+ type=int,
97
+ default=4,
98
+ metavar="NT",
99
+ help=(
100
+ "Overhang length in nt (default: %(default)s). "
101
+ "This must match the length of overhangs in the ligation frequencies table."
102
+ ),
103
+ )
104
+ p.add_argument(
105
+ "--beam-width",
106
+ type=int,
107
+ default=100,
108
+ metavar="N",
109
+ help="Beam width (number of partial paths kept alive in the search) (default: %(default)s).",
110
+ )
111
+ p.add_argument(
112
+ "--seed",
113
+ type=int,
114
+ default=42,
115
+ metavar="INT",
116
+ help="Random seed for reproducibility (default: %(default)s).",
117
+ )
118
+
119
+ # ---- Search mode ----
120
+ p.add_argument(
121
+ "--mode",
122
+ choices=["greedy", "rollout"],
123
+ default="greedy",
124
+ help=(
125
+ "Search mode. 'greedy': score by current-prefix fidelity only "
126
+ "(fast, good default). 'rollout': score by heuristic of future fidelity "
127
+ "via random completions (slower, not generally recommended). "
128
+ "(default: %(default)s)"
129
+ ),
130
+ )
131
+
132
+ # ---- Rollout-specific arguments ----
133
+ rollout = p.add_argument_group(
134
+ "rollout options",
135
+ "Only used when --mode rollout is set.",
136
+ )
137
+ rollout.add_argument(
138
+ "--rollout-samples",
139
+ type=int,
140
+ default=100,
141
+ metavar="N",
142
+ help="Number of random rollouts used to evaluate each candidate (default: %(default)s).",
143
+ )
144
+ rollout.add_argument(
145
+ "--heuristic-percentile",
146
+ type=float,
147
+ default=100.0,
148
+ metavar="PCT",
149
+ help=(
150
+ "Percentile of rollout fidelities used as the beam-search heuristic. "
151
+ "100 = maximum / best fidelity of the set."
152
+ "Lower values, e.g. 98, are more pessimistic heuristics. (default: %(default)s)"
153
+ ),
154
+ )
155
+
156
+ # ---- Output ----
157
+ p.add_argument(
158
+ "--output",
159
+ "-o",
160
+ metavar="CSV",
161
+ help=(
162
+ "Output CSV path (default: <input_stem>.seqsplit_results.csv). "
163
+ "Rows are appended in real time so partial results are preserved "
164
+ "if the run is interrupted."
165
+ ),
166
+ )
167
+ p.add_argument(
168
+ "--quiet",
169
+ "-q",
170
+ action="store_true",
171
+ help="Suppress per-sequence progress output.",
172
+ )
173
+
174
+ # ---- Utility ----
175
+ p.add_argument(
176
+ "--list-tables",
177
+ action="store_true",
178
+ help="Print available bundled ligation tables.",
179
+ )
180
+ p.add_argument("--version", action="version", version="%(prog)s 0.1.0")
181
+
182
+ return p
183
+
184
+
185
+ # ---------------------------------------------------------------------------
186
+ # Validation helpers
187
+ # ---------------------------------------------------------------------------
188
+
189
+ def _validate_args(args: argparse.Namespace) -> Optional[str]:
190
+ """Return an error string if args are invalid, else None."""
191
+ if args.list_tables:
192
+ return None
193
+
194
+ if args.fna is None:
195
+ return "FNA_FILE is required (or use --list-tables if just interested in viewing the bundled ligation frequency tables)."
196
+ if not os.path.exists(args.fna):
197
+ return f"File not found: {args.fna}"
198
+ if args.table is None and args.table_path is None:
199
+ return "A ligation table is required: use --table NAME or --table-path CSV."
200
+ if args.overhang_len < 1:
201
+ return "--overhang-len must be >= 1."
202
+ if args.region_len < args.overhang_len:
203
+ return "--region-len must be >= --overhang-len."
204
+ if args.max_oligo_len <= args.region_len:
205
+ return "--max-oligo-len must be > --region-len."
206
+ if args.beam_width < 1:
207
+ return "--beam-width must be >= 1."
208
+ if args.mode == "rollout" and args.rollout_samples < 1:
209
+ return "--rollout-samples must be >= 1 in rollout mode."
210
+ if not (0 < args.heuristic_percentile <= 100):
211
+ return "--heuristic-percentile must be in (0, 100]."
212
+ return None
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Main
217
+ # ---------------------------------------------------------------------------
218
+
219
def main(argv=None) -> int:
    """CLI entry point.

    Parses arguments, loads the ligation table, builds per-sequence
    candidate matrices from the input FASTA/FNA, runs the beam search
    for each sequence, and appends one CSV row per solved sequence.

    Parameters
    ----------
    argv : list[str] | None
        Argument list for argparse; None means sys.argv[1:].

    Returns
    -------
    int
        0 on full success; 1 on argument/loading errors or when at
        least one sequence had no solution.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # ---- --list-tables ----
    # Utility mode: print bundled table names and exit without a search.
    if args.list_tables:
        tables = list_builtin_tables()
        if tables:
            print("Bundled ligation tables:")
            for name in tables:
                print(f" {name}")
        else:
            print(
                "No tables are currently bundled.\n"
                "Place a CSV in seqsplit/data/ and register it in "
                "seqsplit/tables.py:BUILTIN_TABLES."
            )
        return 0

    # ---- Validate ----
    err = _validate_args(args)
    if err:
        print(f"Error: {err}", file=sys.stderr)
        return 1

    # ---- Load ligation table ----
    # --table and --table-path are mutually exclusive (enforced by argparse),
    # and _validate_args guarantees at least one is set here.
    try:
        if args.table_path:
            if not args.quiet:
                print(f"Loading ligation table: {args.table_path}")
            table = load_ligation_table(
                args.table_path, overhang_len=args.overhang_len
            )
        else:
            if not args.quiet:
                print(f"Loading built-in table: {args.table}")
            table = load_builtin_table(args.table, overhang_len=args.overhang_len)
    except (FileNotFoundError, ValueError) as exc:
        print(f"Error loading ligation table: {exc}", file=sys.stderr)
        return 1

    # Candidates per region; rollouts are disabled entirely in greedy mode.
    branching_factor = args.region_len - args.overhang_len + 1
    rollout_samples = args.rollout_samples if args.mode == "rollout" else 0

    if not args.quiet:
        print(f"\nParameters")
        print(f" input : {args.fna}")
        print(f" max_oligo_len : {args.max_oligo_len} nt")
        print(f" region_len : {args.region_len} nt "
              f"({branching_factor} candidates / region)")
        print(f" overhang_len : {args.overhang_len} nt")
        print(f" beam_width : {args.beam_width}")
        print(f" mode : {args.mode}")
        if args.mode == "rollout":
            print(f" rollout_samples : {rollout_samples}")
            print(f" heuristic percentile : p{args.heuristic_percentile:.0f}")
        print()

    # ---- Build index matrices ----
    # One (candidate matrix, region starts, seq length) triple per sequence.
    if not args.quiet:
        print(f"Building overhang candidate matrices representing search tree from '{args.fna}'...")
    try:
        all_matrices = get_all_possible_idx_matrices(
            args.fna,
            table.kmer_enc_to_row_idx,
            oh_region_len=args.region_len,
            overhang_len=args.overhang_len,
            max_oligo_len=args.max_oligo_len,
        )
    except Exception as exc:
        print(f"Error reading input file: {exc}", file=sys.stderr)
        return 1

    n_seqs = len(all_matrices)
    if not args.quiet:
        print(f" {n_seqs} sequence(s) loaded.\n")

    # ---- Output CSV ----
    # Rows are appended one-by-one below so partial results survive
    # an interrupted run.
    out_path = args.output or (
        os.path.splitext(args.fna)[0] + ".seqsplit_results.csv"
    )
    _init_output_csv(out_path)
    if not args.quiet:
        print(f"Results → {out_path}\n")

    # ---- Run search ----
    # A single RNG shared across sequences keeps the whole run reproducible.
    rng = np.random.default_rng(args.seed)
    errors = 0

    for i, (header, (mtrx, region_starts, seq_len)) in enumerate(
        all_matrices.items(), start=1
    ):
        if not args.quiet:
            # Truncate long FASTA headers for readable progress lines.
            short_header = header[:72] + ("..." if len(header) > 72 else "")
            print(f"[{i}/{n_seqs}] {short_header}")

        oh_list, log_fid, runtime, oh_coords = run_beam_search(
            possible_idx_matrix=mtrx,
            branching_factor=branching_factor,
            total_num_regions=mtrx.shape[0],
            beam_width=args.beam_width,
            rollout_samples=rollout_samples,
            region_starts=region_starts,
            rng=rng,
            table=table,
            overhang_len=args.overhang_len,
            heuristic_percentile=args.heuristic_percentile,
            verbose=not args.quiet,
        )

        if oh_list is None:
            # No valid split: warn and continue with the remaining sequences.
            print(f" WARNING: no solution found for '{header}'", file=sys.stderr)
            errors += 1
            continue

        # Map ligation-table row indices back to overhang DNA strings.
        oh_strings = [table.row_overhangs[idx] for idx in oh_list]

        row = {
            "seq_header": header,
            "seq_len_nt": seq_len,
            "num_fragments": len(oh_strings) + 1,
            "best_log_fidelity": log_fid,
            "best_fidelity": float(np.exp(log_fid)),
            "overhangs": str(oh_strings),
            "overhang_start_coords": str(oh_coords),
            "runtime_s": f"{runtime:.2f}",
        }
        # Append without a header row; the header was written up front.
        pd.DataFrame([row]).to_csv(out_path, mode="a", header=False, index=False)

    print(f"\nDone. {n_seqs - errors}/{n_seqs} sequences written to {out_path}")
    return 0 if errors == 0 else 1
350
+
351
+
352
+ def _init_output_csv(path: str) -> None:
353
+ """Create the output CSV with a header row if it does not already exist."""
354
+ if not os.path.exists(path):
355
+ pd.DataFrame(
356
+ columns=[
357
+ "seq_header",
358
+ "seq_len_nt",
359
+ "num_fragments",
360
+ "best_log_fidelity",
361
+ "best_fidelity",
362
+ "overhangs",
363
+ "overhang_start_coords",
364
+ "runtime_s",
365
+ ]
366
+ ).to_csv(path, index=False)
367
+
368
+
369
# Script entry point: propagate main()'s status code as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
File without changes