seqsplit-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seqsplit/__init__.py ADDED
@@ -0,0 +1,10 @@
1
"""Public package interface for seqsplit."""

from importlib.metadata import version, PackageNotFoundError

try:
    # The distribution name must match the name the wheel is published
    # under ("seqsplit", per the built wheel filename). The previous
    # lookup used "seq-splitter", which does not normalize to the same
    # distribution, so it always raised PackageNotFoundError and the
    # installed package reported the dev placeholder version below.
    __version__ = version("seqsplit")
except PackageNotFoundError:
    # Not installed as a distribution (e.g. running from a source checkout).
    __version__ = "0.1.0-dev"

from .api import split_fna

__all__ = ["split_fna", "__version__"]
seqsplit/api.py ADDED
@@ -0,0 +1,133 @@
1
+ """
2
+ Python API for seqsplit.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+
11
+ from .tables import LigationTable
12
+ from .sequence import get_all_possible_idx_matrices
13
+ from .search import run_beam_search
14
+
15
+
16
def split_fna(
    fna_path: str,
    table: LigationTable,
    *,
    max_oligo_len: int = 250,
    region_len: int = 20,
    overhang_len: int = 4,
    beam_width: int = 100,
    mode: str = "greedy",
    rollout_samples: int = 100,
    heuristic_percentile: float = 100.0,
    seed: int = 42,
    verbose: bool = True,
) -> list[dict[str, Any]]:
    """
    Split all sequences in a FASTA/FNA file at optimal ligation sites.

    Parameters
    ----------
    fna_path : str
        Path to input FASTA/FNA file.
    table : LigationTable
        Loaded ligation frequency table. Obtain via
        :func:`~seqsplit.tables.load_ligation_table` or
        :func:`~seqsplit.tables.load_builtin_table`.
    max_oligo_len : int
        Maximum allowed oligo/fragment length in nt.
    region_len : int
        Width of each candidate overhang region.
        ``region_len - overhang_len + 1`` candidate overhangs are evaluated
        per region.
    overhang_len : int
        Overhang length in nt. Must match the ligation table.
    beam_width : int
        Number of partial paths kept alive in the beam.
    mode : {'greedy', 'rollout'}
        'greedy' scores by current-prefix fidelity only (fast, recommended).
        'rollout' uses random completions as a lookahead heuristic.
    rollout_samples : int
        Number of random rollouts used to evaluate each candidate (rollout
        mode only; ignored in greedy mode).
    heuristic_percentile : float
        Percentile of rollout scores used as the heuristic to guide the search.
        100 → max (default), 98 → more pessimistic heuristic, etc.
    seed : int
        NumPy random seed for reproducibility.
    verbose : bool
        Print per-sequence progress.

    Returns
    -------
    list of dicts with keys:
        * ``header`` – header string of sequence from FASTA/FNA
        * ``seq_len`` – sequence length in nt
        * ``num_fragments`` – number of fragments produced
        * ``overhangs`` – list of overhang DNA strings
        * ``oh_row_indices`` – list of ligation-table row indices
        * ``oh_start_coords`` – list of 0-indexed overhang start positions
        * ``log_fidelity`` – best log-fidelity
        * ``fidelity`` – best fidelity (exp of log_fidelity)
        * ``runtime_s`` – wall-clock time for this sequence

    Notes
    -----
    If the beam search finds no valid split for a sequence, that
    sequence's dict contains only ``header`` and an ``error`` string
    instead of the keys above; check for ``"error"`` before using the
    other fields.

    Raises
    ------
    ValueError
        If ``mode`` is not one of {'greedy', 'rollout'}, or if
        ``overhang_len`` does not match ``table.overhang_len``.
    """
    if mode not in ("greedy", "rollout"):
        raise ValueError(f"mode must be 'greedy' or 'rollout', got '{mode}'.")
    if overhang_len != table.overhang_len:
        raise ValueError(
            f"overhang_len={overhang_len} does not match the loaded table "
            f"(table.overhang_len={table.overhang_len})."
        )

    # Candidates per region; greedy mode disables rollouts entirely.
    branching_factor = region_len - overhang_len + 1
    n_rollouts = rollout_samples if mode == "rollout" else 0
    # Single RNG shared across all sequences so the whole run is reproducible.
    rng = np.random.default_rng(seed)

    # One (candidate matrix, region start coords, length) triple per sequence.
    all_matrices = get_all_possible_idx_matrices(
        fna_path,
        table.kmer_enc_to_row_idx,
        oh_region_len=region_len,
        overhang_len=overhang_len,
        max_oligo_len=max_oligo_len,
    )

    results = []
    for header, (mtrx, region_starts, seq_len) in all_matrices.items():
        oh_list, log_fid, runtime, oh_coords = run_beam_search(
            possible_idx_matrix=mtrx,
            branching_factor=branching_factor,
            total_num_regions=mtrx.shape[0],
            beam_width=beam_width,
            rollout_samples=n_rollouts,
            region_starts=region_starts,
            rng=rng,
            table=table,
            overhang_len=overhang_len,
            heuristic_percentile=heuristic_percentile,
            verbose=verbose,
        )

        if oh_list is None:
            # No valid split found: record a minimal error entry and move on.
            results.append({"header": header, "error": "no solution found"})
            continue

        # Map ligation-table row indices back to overhang DNA strings.
        oh_strings = [table.row_overhangs[idx] for idx in oh_list]
        results.append(
            {
                "header": header,
                "seq_len": seq_len,
                "num_fragments": len(oh_strings) + 1,
                "overhangs": oh_strings,
                "oh_row_indices": oh_list,
                "oh_start_coords": oh_coords,
                "log_fidelity": float(log_fid),
                "fidelity": float(np.exp(log_fid)),
                "runtime_s": float(runtime),
            }
        )

    return results
seqsplit/cli.py ADDED
@@ -0,0 +1,370 @@
1
+ """
2
+ Command-line interface for seqsplit.
3
+
4
+ Usage examples
5
+ --------------
6
+ # Run overhang search in greedy mode with a built-in ligation table
7
+ seqsplit sequences.fna --table potapov2018_T4_18h_25C
8
+
9
+ # Rollout mode with a custom table
10
+ seqsplit sequences.fna --table-path my_table.csv --mode rollout --rollout-samples 100
11
+
12
+ # More pessimistic rollout heuristic (98th percentile instead of max) – generally not recommended
13
+ seqsplit sequences.fna --table potapov2018_T4_18h_25C \\
14
+ --mode rollout --rollout-samples 100 --heuristic-percentile 98
15
+
16
+ # Allow for larger oligos and consider wider candidate overhang regions
17
+ seqsplit sequences.fna --table potapov2018_T4_18h_25C \\
18
+ --max-oligo-len 300 --region-len 25 --overhang-len 4
19
+
20
+ # List bundled tables
21
+ seqsplit --list-tables
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import os
28
+ import sys
29
+ from typing import Optional
30
+
31
+ import numpy as np
32
+ import pandas as pd
33
+
34
+ from .tables import load_ligation_table, load_builtin_table, list_builtin_tables
35
+ from .sequence import get_all_possible_idx_matrices
36
+ from .search import run_beam_search
37
+
38
+
39
+ def _build_parser() -> argparse.ArgumentParser:
40
+ p = argparse.ArgumentParser(
41
+ prog="seqsplit",
42
+ description=(
43
+ "Split DNA sequences at optimal ligation sites for synthesis."
44
+ "Outputs a CSV with selected overhangs and split coordinates."
45
+ ),
46
+ formatter_class=argparse.RawDescriptionHelpFormatter,
47
+ epilog=__doc__,
48
+ )
49
+
50
+ p.add_argument(
51
+ "fna",
52
+ nargs="?",
53
+ metavar="FNA_FILE",
54
+ help="Path to input DNA sequences in FASTA / FNA format.",
55
+ )
56
+
57
+ # ---- Ligation table (mutually exclusive) ----
58
+ tbl = p.add_mutually_exclusive_group()
59
+ tbl.add_argument(
60
+ "--table",
61
+ metavar="NAME",
62
+ help=(
63
+ "Name of a bundled ligation table. "
64
+ "Run --list-tables to see available names."
65
+ ),
66
+ )
67
+ tbl.add_argument(
68
+ "--table-path",
69
+ metavar="CSV",
70
+ help=(
71
+ "Path to a custom ligation frequency CSV. "
72
+ "See docs/ligation_table_format.md for the required format."
73
+ ),
74
+ )
75
+
76
+ # ---- Core parameters ----
77
+ p.add_argument(
78
+ "--max-oligo-len",
79
+ type=int,
80
+ default=250,
81
+ metavar="NT",
82
+ help="Maximum allowed oligo length in nt (default: %(default)s).",
83
+ )
84
+ p.add_argument(
85
+ "--region-len",
86
+ type=int,
87
+ default=20,
88
+ metavar="NT",
89
+ help=(
90
+ "Width of each candidate overhang region in nt (default: %(default)s). "
91
+ "There are (region-len − overhang-len + 1) candidate overhangs per region."
92
+ ),
93
+ )
94
+ p.add_argument(
95
+ "--overhang-len",
96
+ type=int,
97
+ default=4,
98
+ metavar="NT",
99
+ help=(
100
+ "Overhang length in nt (default: %(default)s). "
101
+ "This must match the length of overhangs in the ligation frequencies table."
102
+ ),
103
+ )
104
+ p.add_argument(
105
+ "--beam-width",
106
+ type=int,
107
+ default=100,
108
+ metavar="N",
109
+ help="Beam width (number of partial paths kept alive in the search) (default: %(default)s).",
110
+ )
111
+ p.add_argument(
112
+ "--seed",
113
+ type=int,
114
+ default=42,
115
+ metavar="INT",
116
+ help="Random seed for reproducibility (default: %(default)s).",
117
+ )
118
+
119
+ # ---- Search mode ----
120
+ p.add_argument(
121
+ "--mode",
122
+ choices=["greedy", "rollout"],
123
+ default="greedy",
124
+ help=(
125
+ "Search mode. 'greedy': score by current-prefix fidelity only "
126
+ "(fast, good default). 'rollout': score by heuristic of future fidelity "
127
+ "via random completions (slower, not generally recommended). "
128
+ "(default: %(default)s)"
129
+ ),
130
+ )
131
+
132
+ # ---- Rollout-specific arguments ----
133
+ rollout = p.add_argument_group(
134
+ "rollout options",
135
+ "Only used when --mode rollout is set.",
136
+ )
137
+ rollout.add_argument(
138
+ "--rollout-samples",
139
+ type=int,
140
+ default=100,
141
+ metavar="N",
142
+ help="Number of random rollouts used to evaluate each candidate (default: %(default)s).",
143
+ )
144
+ rollout.add_argument(
145
+ "--heuristic-percentile",
146
+ type=float,
147
+ default=100.0,
148
+ metavar="PCT",
149
+ help=(
150
+ "Percentile of rollout fidelities used as the beam-search heuristic. "
151
+ "100 = maximum / best fidelity of the set."
152
+ "Lower values, e.g. 98, are more pessimistic heuristics. (default: %(default)s)"
153
+ ),
154
+ )
155
+
156
+ # ---- Output ----
157
+ p.add_argument(
158
+ "--output",
159
+ "-o",
160
+ metavar="CSV",
161
+ help=(
162
+ "Output CSV path (default: <input_stem>.seqsplit_results.csv). "
163
+ "Rows are appended in real time so partial results are preserved "
164
+ "if the run is interrupted."
165
+ ),
166
+ )
167
+ p.add_argument(
168
+ "--quiet",
169
+ "-q",
170
+ action="store_true",
171
+ help="Suppress per-sequence progress output.",
172
+ )
173
+
174
+ # ---- Utility ----
175
+ p.add_argument(
176
+ "--list-tables",
177
+ action="store_true",
178
+ help="Print available bundled ligation tables.",
179
+ )
180
+ p.add_argument("--version", action="version", version="%(prog)s 0.1.0")
181
+
182
+ return p
183
+
184
+
185
+ # ---------------------------------------------------------------------------
186
+ # Validation helpers
187
+ # ---------------------------------------------------------------------------
188
+
189
+ def _validate_args(args: argparse.Namespace) -> Optional[str]:
190
+ """Return an error string if args are invalid, else None."""
191
+ if args.list_tables:
192
+ return None
193
+
194
+ if args.fna is None:
195
+ return "FNA_FILE is required (or use --list-tables if just interested in viewing the bundled ligation frequency tables)."
196
+ if not os.path.exists(args.fna):
197
+ return f"File not found: {args.fna}"
198
+ if args.table is None and args.table_path is None:
199
+ return "A ligation table is required: use --table NAME or --table-path CSV."
200
+ if args.overhang_len < 1:
201
+ return "--overhang-len must be >= 1."
202
+ if args.region_len < args.overhang_len:
203
+ return "--region-len must be >= --overhang-len."
204
+ if args.max_oligo_len <= args.region_len:
205
+ return "--max-oligo-len must be > --region-len."
206
+ if args.beam_width < 1:
207
+ return "--beam-width must be >= 1."
208
+ if args.mode == "rollout" and args.rollout_samples < 1:
209
+ return "--rollout-samples must be >= 1 in rollout mode."
210
+ if not (0 < args.heuristic_percentile <= 100):
211
+ return "--heuristic-percentile must be in (0, 100]."
212
+ return None
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Main
217
+ # ---------------------------------------------------------------------------
218
+
219
def main(argv=None) -> int:
    """CLI entry point.

    Parses arguments, loads the ligation table, builds per-sequence
    candidate matrices from the input FASTA/FNA, runs the beam search
    for each sequence, and appends one CSV row per solved sequence.

    Parameters
    ----------
    argv : list[str] | None
        Argument list for argparse; None means sys.argv[1:].

    Returns
    -------
    int
        0 on full success; 1 on argument/loading errors or when at
        least one sequence had no solution.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # ---- --list-tables ----
    # Utility mode: print bundled table names and exit without a search.
    if args.list_tables:
        tables = list_builtin_tables()
        if tables:
            print("Bundled ligation tables:")
            for name in tables:
                print(f" {name}")
        else:
            print(
                "No tables are currently bundled.\n"
                "Place a CSV in seqsplit/data/ and register it in "
                "seqsplit/tables.py:BUILTIN_TABLES."
            )
        return 0

    # ---- Validate ----
    err = _validate_args(args)
    if err:
        print(f"Error: {err}", file=sys.stderr)
        return 1

    # ---- Load ligation table ----
    # --table and --table-path are mutually exclusive (enforced by argparse),
    # and _validate_args guarantees at least one is set here.
    try:
        if args.table_path:
            if not args.quiet:
                print(f"Loading ligation table: {args.table_path}")
            table = load_ligation_table(
                args.table_path, overhang_len=args.overhang_len
            )
        else:
            if not args.quiet:
                print(f"Loading built-in table: {args.table}")
            table = load_builtin_table(args.table, overhang_len=args.overhang_len)
    except (FileNotFoundError, ValueError) as exc:
        print(f"Error loading ligation table: {exc}", file=sys.stderr)
        return 1

    # Candidates per region; rollouts are disabled entirely in greedy mode.
    branching_factor = args.region_len - args.overhang_len + 1
    rollout_samples = args.rollout_samples if args.mode == "rollout" else 0

    if not args.quiet:
        print(f"\nParameters")
        print(f" input : {args.fna}")
        print(f" max_oligo_len : {args.max_oligo_len} nt")
        print(f" region_len : {args.region_len} nt "
              f"({branching_factor} candidates / region)")
        print(f" overhang_len : {args.overhang_len} nt")
        print(f" beam_width : {args.beam_width}")
        print(f" mode : {args.mode}")
        if args.mode == "rollout":
            print(f" rollout_samples : {rollout_samples}")
            print(f" heuristic percentile : p{args.heuristic_percentile:.0f}")
        print()

    # ---- Build index matrices ----
    # One (candidate matrix, region starts, seq length) triple per sequence.
    if not args.quiet:
        print(f"Building overhang candidate matrices representing search tree from '{args.fna}'...")
    try:
        all_matrices = get_all_possible_idx_matrices(
            args.fna,
            table.kmer_enc_to_row_idx,
            oh_region_len=args.region_len,
            overhang_len=args.overhang_len,
            max_oligo_len=args.max_oligo_len,
        )
    except Exception as exc:
        print(f"Error reading input file: {exc}", file=sys.stderr)
        return 1

    n_seqs = len(all_matrices)
    if not args.quiet:
        print(f" {n_seqs} sequence(s) loaded.\n")

    # ---- Output CSV ----
    # Rows are appended one-by-one below so partial results survive
    # an interrupted run.
    out_path = args.output or (
        os.path.splitext(args.fna)[0] + ".seqsplit_results.csv"
    )
    _init_output_csv(out_path)
    if not args.quiet:
        print(f"Results → {out_path}\n")

    # ---- Run search ----
    # A single RNG shared across sequences keeps the whole run reproducible.
    rng = np.random.default_rng(args.seed)
    errors = 0

    for i, (header, (mtrx, region_starts, seq_len)) in enumerate(
        all_matrices.items(), start=1
    ):
        if not args.quiet:
            # Truncate long FASTA headers for readable progress lines.
            short_header = header[:72] + ("..." if len(header) > 72 else "")
            print(f"[{i}/{n_seqs}] {short_header}")

        oh_list, log_fid, runtime, oh_coords = run_beam_search(
            possible_idx_matrix=mtrx,
            branching_factor=branching_factor,
            total_num_regions=mtrx.shape[0],
            beam_width=args.beam_width,
            rollout_samples=rollout_samples,
            region_starts=region_starts,
            rng=rng,
            table=table,
            overhang_len=args.overhang_len,
            heuristic_percentile=args.heuristic_percentile,
            verbose=not args.quiet,
        )

        if oh_list is None:
            # No valid split: warn and continue with the remaining sequences.
            print(f" WARNING: no solution found for '{header}'", file=sys.stderr)
            errors += 1
            continue

        # Map ligation-table row indices back to overhang DNA strings.
        oh_strings = [table.row_overhangs[idx] for idx in oh_list]

        row = {
            "seq_header": header,
            "seq_len_nt": seq_len,
            "num_fragments": len(oh_strings) + 1,
            "best_log_fidelity": log_fid,
            "best_fidelity": float(np.exp(log_fid)),
            "overhangs": str(oh_strings),
            "overhang_start_coords": str(oh_coords),
            "runtime_s": f"{runtime:.2f}",
        }
        # Append without a header row; the header was written up front.
        pd.DataFrame([row]).to_csv(out_path, mode="a", header=False, index=False)

    print(f"\nDone. {n_seqs - errors}/{n_seqs} sequences written to {out_path}")
    return 0 if errors == 0 else 1
350
+
351
+
352
+ def _init_output_csv(path: str) -> None:
353
+ """Create the output CSV with a header row if it does not already exist."""
354
+ if not os.path.exists(path):
355
+ pd.DataFrame(
356
+ columns=[
357
+ "seq_header",
358
+ "seq_len_nt",
359
+ "num_fragments",
360
+ "best_log_fidelity",
361
+ "best_fidelity",
362
+ "overhangs",
363
+ "overhang_start_coords",
364
+ "runtime_s",
365
+ ]
366
+ ).to_csv(path, index=False)
367
+
368
+
369
# Script entry point: propagate main()'s status code as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
File without changes