PyPI - seqsplit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

seqsplit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

seqsplit/__init__.py +10 -0
seqsplit/api.py +133 -0
seqsplit/cli.py +370 -0
seqsplit/data/__init__.py +0 -0
seqsplit/data/potapov2018_T4_01h_25C_4nt_oh_ligation_freqs.csv +257 -0
seqsplit/data/potapov2018_T4_01h_37C_4nt_oh_ligation_freqs.csv +257 -0
seqsplit/data/potapov2018_T4_18h_25C_4nt_oh_ligation_freqs.csv +257 -0
seqsplit/data/potapov2018_T4_18h_37C_4nt_oh_ligation_freqs.csv +257 -0
seqsplit/data/potapov2018_T7_18h_25C_4nt_oh_ligation_freqs.csv +257 -0
seqsplit/data/potapov2018_T7_18h_37C_4nt_oh_ligation_freqs.csv +257 -0
seqsplit/search.py +312 -0
seqsplit/sequence.py +191 -0
seqsplit/tables.py +211 -0
seqsplit-0.1.0.dist-info/METADATA +116 -0
seqsplit-0.1.0.dist-info/RECORD +18 -0
seqsplit-0.1.0.dist-info/WHEEL +5 -0
seqsplit-0.1.0.dist-info/entry_points.txt +2 -0
seqsplit-0.1.0.dist-info/top_level.txt +1 -0

seqsplit/tables.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""
+Loading and management of ligation frequency tables.
+A ligation table is an N×N matrix of overhang-pair ligation counts, where N is
+the number of distinct k-mer overhangs (4^k for k-nt overhangs). See
+docs/ligation_table_format.md for the full CSV specification.
+"""
+from __future__ import annotations
+import importlib.resources
+from dataclasses import dataclass
+from pathlib import Path
+import numpy as np
+import pandas as pd
# Lookup table from ASCII byte value to 2-bit base code: A=0, C=1, G=2, T=3.
# Every other byte value keeps the zero fill, i.e. the same code as "A".
BASE_TO_INT = np.zeros(256, dtype=np.uint8)
BASE_TO_INT[list(b"ACGT")] = [0, 1, 2, 3]
# ---------------------------------------------------------------------------------------
# Bundled table registry
# ---------------------------------------------------------------------------------------
# Maps table name -> CSV file name inside seqsplit/data/. Every bundled file is named
# "<table name>_4nt_oh_ligation_freqs.csv", so the mapping is derived from the names.
# To add a new bundled table: drop the CSV (named as above) into seqsplit/data/ and add
# its table name to the tuple below.
BUILTIN_TABLES: dict[str, str] = {
    table_name: f"{table_name}_4nt_oh_ligation_freqs.csv"
    for table_name in (
        "potapov2018_T4_01h_25C",
        "potapov2018_T4_01h_37C",
        "potapov2018_T4_18h_25C",
        "potapov2018_T4_18h_37C",
        "potapov2018_T7_18h_25C",
        "potapov2018_T7_18h_37C",
    )
}
@dataclass
class LigationTable:
    """
    Ligation frequency table bundled with its precomputed index lookups.

    Throughout, N is the number of overhangs in the table; it should be equal
    to 4^overhang_len.

    Attributes
    ----------
    name : str
        Descriptive identifier, e.g. 'potapov2018_T4_18h_25C'.
    overhang_len : int
        Number of nucleotides per overhang (e.g. 4 for 4-nt overhangs).
    lig_freqs_mtrx : np.ndarray, shape (N, N), dtype uint16
        Raw ligation counts, addressed as [row overhang, column overhang].
    row_overhangs : list[str]
        Overhang sequence for each row index.
    col_overhangs : list[str]
        Overhang sequence for each column index.
    oh_to_row_idx : dict[str, int]
        Overhang sequence -> row index.
    oh_to_col_idx : dict[str, int]
        Overhang sequence -> column index.
    rev_comp_row_idx_map : np.ndarray, shape (N,), dtype int32
        Entry i is the row index of the reverse complement of row overhang i.
    row_col_idx_map : np.ndarray, shape (N,), dtype int32
        Entry i is the column index of row overhang i.
    kmer_enc_to_row_idx : np.ndarray, shape (4^overhang_len,), dtype int32
        Integer k-mer encoding -> row index in lig_freqs_mtrx.
    """

    # Identity of the table.
    name: str
    overhang_len: int

    # Count matrix and the overhang labels of its two axes.
    lig_freqs_mtrx: np.ndarray
    row_overhangs: list[str]
    col_overhangs: list[str]

    # Overhang string -> axis position.
    oh_to_row_idx: dict[str, int]
    oh_to_col_idx: dict[str, int]

    # Lookups derived from the rows.
    rev_comp_row_idx_map: np.ndarray
    row_col_idx_map: np.ndarray
    kmer_enc_to_row_idx: np.ndarray
+def _get_rev_comp(seq: str) -> str:
+    return seq.translate(str.maketrans("ACGT", "TGCA"))[::-1]
def load_ligation_table(
    path: str | Path,
    name: str | None = None,
    overhang_len: int | None = None,
) -> LigationTable:
    """
    Load a ligation frequency table from a CSV file.

    The CSV must have an 'Overhang' column holding the row overhang
    sequences. Every other column header is taken as a column overhang and
    the remaining cells form the ligation count matrix.

    Parameters
    ----------
    path : str or Path
        Path to the CSV file. See docs/ligation_table_format.md for format.
    name : str, optional
        Table name; defaults to the file stem.
    overhang_len : int, optional
        Expected overhang length. If omitted, the length found in the table
        is used. If given and it does not match the table, a ValueError is
        raised.

    Returns
    -------
    LigationTable
        The count matrix plus the derived lookup arrays. ``overhang_len`` is
        always the length found in the table. Entries of
        ``kmer_enc_to_row_idx`` for k-mers that have no row in the table
        are -1.

    Raises
    ------
    ValueError
        If the 'Overhang' column is missing; the table has no rows; a row
        overhang is not a string of uppercase A/C/G/T; row overhangs differ
        in length; ``overhang_len`` disagrees with the table; the reverse
        complement of a row overhang is not itself a row; or a row overhang
        has no matching column.
    """
    path = Path(path)
    if name is None:
        name = path.stem

    df = pd.read_csv(path)
    if "Overhang" not in df.columns:
        raise ValueError(
            f"Ligation table CSV must contain an 'Overhang' column "
            f"(see docs/ligation_table_format.md). Got columns: {df.columns.tolist()}"
        )

    oh_index_df = df.set_index("Overhang")
    # Counts are cast to uint16; values are assumed to fit and are not
    # range-checked here.
    lig_freqs_mtrx = oh_index_df.to_numpy().astype("uint16")
    row_overhangs: list[str] = oh_index_df.index.tolist()
    col_overhangs: list[str] = oh_index_df.columns.tolist()

    # Guard against a header-only file before indexing row_overhangs[0].
    if not row_overhangs:
        raise ValueError(
            f"Ligation table '{path.name}' contains no overhang rows "
            f"(see docs/ligation_table_format.md)."
        )

    # BASE_TO_INT maps every byte other than A/C/G/T to 0, so anything else
    # would silently be encoded as 'A' below; reject it up front instead.
    valid_bases = frozenset("ACGT")
    bad_overhangs = [
        oh
        for oh in row_overhangs
        if not isinstance(oh, str) or not valid_bases.issuperset(oh)
    ]
    if bad_overhangs:
        raise ValueError(
            f"Row overhangs in '{path.name}' must consist only of the uppercase "
            f"bases A, C, G and T. Offending values: {bad_overhangs[:5]}"
        )

    # Infer and validate overhang length
    inferred_len = len(row_overhangs[0])
    if not all(len(oh) == inferred_len for oh in row_overhangs):
        raise ValueError("All row overhangs must have the same length.")
    if overhang_len is not None and overhang_len != inferred_len:
        raise ValueError(
            f"--overhang-len={overhang_len} does not match the overhang length "
            f"in '{path.name}' (table has {inferred_len}-nt overhangs). "
        )
    # From here on only inferred_len is used, so the default
    # overhang_len=None works and the returned table carries a real length.

    # Index dicts
    oh_to_row_idx = {oh: i for i, oh in enumerate(row_overhangs)}
    oh_to_col_idx = {oh: j for j, oh in enumerate(col_overhangs)}

    # Reverse-complement lookup maps (indexed by row index)
    n_rows = len(row_overhangs)
    rev_comp_row_idx_map = np.zeros(n_rows, dtype=np.int32)
    row_col_idx_map = np.zeros(n_rows, dtype=np.int32)
    for oh, idx in oh_to_row_idx.items():
        rc = _get_rev_comp(oh)
        if rc not in oh_to_row_idx:
            raise ValueError(
                f"Reverse complement '{rc}' of overhang '{oh}' is not present "
                f"in the table rows. The table must contain all 4^{inferred_len} "
                f"k-mers (see docs/ligation_table_format.md)."
            )
        if oh not in oh_to_col_idx:
            raise ValueError(
                f"Row overhang '{oh}' has no matching column in '{path.name}' "
                f"(see docs/ligation_table_format.md)."
            )
        rev_comp_row_idx_map[idx] = oh_to_row_idx[rc]
        row_col_idx_map[idx] = oh_to_col_idx[oh]

    # k-mer integer encoding → row index. Each base contributes two bits
    # (most significant first). Slots for k-mers without a row stay at -1
    # rather than holding uninitialized memory.
    n_kmers = 4 ** inferred_len
    kmer_enc_to_row_idx = np.full(n_kmers, -1, dtype=np.int32)
    for kmer_str, idx in oh_to_row_idx.items():
        val = 0
        for base in kmer_str:
            val = (val << 2) | int(BASE_TO_INT[ord(base)])
        kmer_enc_to_row_idx[val] = idx

    return LigationTable(
        name=name,
        overhang_len=inferred_len,
        lig_freqs_mtrx=lig_freqs_mtrx,
        row_overhangs=row_overhangs,
        col_overhangs=col_overhangs,
        oh_to_row_idx=oh_to_row_idx,
        oh_to_col_idx=oh_to_col_idx,
        rev_comp_row_idx_map=rev_comp_row_idx_map,
        row_col_idx_map=row_col_idx_map,
        kmer_enc_to_row_idx=kmer_enc_to_row_idx,
    )
def list_builtin_tables() -> list[str]:
    """Return the names of the bundled ligation tables, in registry order."""
    table_names = [*BUILTIN_TABLES]
    return table_names
def load_builtin_table(name: str, overhang_len: int | None = None) -> LigationTable:
    """
    Load a bundled ligation table by name.

    Parameters
    ----------
    name : str
        One of the names returned by :func:`list_builtin_tables`.
    overhang_len : int, optional
        Passed through to :func:`load_ligation_table` for validation.

    Returns
    -------
    LigationTable

    Raises
    ------
    ValueError
        If ``name`` is not a registered built-in table. Validation errors
        from :func:`load_ligation_table` propagate unchanged.
    FileNotFoundError
        If the CSV registered for ``name`` cannot be located in the
        ``seqsplit.data`` package.
    """
    if name not in BUILTIN_TABLES:
        raise ValueError(
            f"Unknown built-in table '{name}'. "
            f"Available tables: {list(BUILTIN_TABLES.keys())}. "
            f"Use --table-path to supply a custom CSV."
        )
    filename = BUILTIN_TABLES[name]
    not_found_msg = (
        f"Built-in table '{name}' ('{filename}') was not found in the package data. "
        f"Copy the CSV into seqsplit/data/ and reinstall, or use --table-path."
    )

    # Only the resource lookup is guarded. Parsing happens outside the try so
    # that errors raised while reading the CSV are not misreported as a
    # missing package file.
    try:
        resource = importlib.resources.files("seqsplit.data").joinpath(filename)
        found = resource.is_file()
    except (FileNotFoundError, TypeError) as err:
        raise FileNotFoundError(not_found_msg) from err
    if not found:
        raise FileNotFoundError(not_found_msg)

    with importlib.resources.as_file(resource) as p:
        return load_ligation_table(p, name=name, overhang_len=overhang_len)

seqsplit-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,116 @@
+Metadata-Version: 2.4
+Name: seqsplit
+Version: 0.1.0
+Summary: Splits DNA sequences at optimal ligation sites for synthesis
+License: MIT
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: numpy>=1.24
+Requires-Dist: pandas>=2.0
+Provides-Extra: analysis
+Requires-Dist: matplotlib>=3.7; extra == "analysis"
+Requires-Dist: statsmodels>=0.14; extra == "analysis"
+Requires-Dist: scipy>=1.10; extra == "analysis"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+# seqsplit
+**Splitting DNA sequences at optimal ligation sites for synthesis.**
+`seqsplit` takes a set of long sequences and splits them into fragments by
+selecting overhang positions that maximize assembly fidelity. These overhangs
+are selected using a beam search guided by an empirical ligation frequency
+table (e.g. from Potapov *et al.*, 2018).
+---
+## Installation
+```bash
+pip install seqsplit
+```
+**Requirements:** Python ≥ 3.10, NumPy ≥ 1.24, pandas ≥ 2.0.
+---
+## Quick start
+### Command line
+```bash
+# Beam search guided by greedy heuristic (fast, good default)
+seqsplit my_sequences.fna --table potapov2018_T4_18h_25C
+# Beam search guided by rollout-based heuristic (slower, generally not recommended)
+seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
+    --mode rollout --rollout-samples 50
+# More pessimistic rollout heuristic (take fidelity at 98th percentile of rollout sample instead of max)
+seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
+    --mode rollout --rollout-samples 100 --heuristic-percentile 98
+# Input custom ligation table (see docs/ligation_freq_table_format.md for formatting)
+seqsplit my_sequences.fna --table-path my_conditions.csv
+# List bundled ligation frequency tables
+seqsplit --list-tables
+```
+Results are written to `my_sequences.seqsplit_results.csv` by default (use `-o`
+to change).
+---
+## Parameters
+| CLI flag | API argument | Default | Description |
+|---|---|---|---|
+| *(positional)* | `fna_path` | — | Input FASTA / FNA file of sequences to split |
+| `--table NAME` | — | — | Bundled ligation table name |
+| `--table-path CSV` | `table` | — | Custom ligation table CSV |
+| `--max-oligo-len` | `max_oligo_len` | 250 | Max oligo/fragment length (nt) |
+| `--region-len` | `region_len` | 20 | Candidate overhang region width (nt) |
+| `--overhang-len` | `overhang_len` | 4 | Overhang length (nt); must match the selected/provided ligation frequency table |
+| `--beam-width` | `beam_width` | 100 | Beam width for the search |
+| `--seed` | `seed` | 42 | Random seed |
+| `--mode` | `mode` | `greedy` | `greedy` or `rollout` |
+| `--rollout-samples` | `rollout_samples` | 100 | Number of random path completions per candidate (rollout mode only) |
+| `--heuristic-percentile` | `heuristic_percentile` | 100 | Percentile of rollout fidelity scores to use as heuristic (rollout mode only) |
+### Search modes
+**`greedy`** (default)
+Each candidate prefix is scored by the fidelity of its overhangs alone.
+There is no lookahead component.
+**`rollout`**
+Each candidate prefix is randomly extended to a complete path
+`--rollout-samples` times; the heuristic score is the maximum (or N-th
+percentile) of those complete-path fidelities.
+### Output CSV columns
+| Column | Description |
+|--------|-------------|
+| `seq_header` | FASTA header |
+| `seq_len_nt` | Sequence length (nt) |
+| `num_fragments` | Number of fragments produced |
+| `best_log_fidelity` | Log of assembly fidelity |
+| `best_fidelity` | Assembly fidelity (0–1) |
+| `overhangs` | List of selected overhang sequences |
+| `overhang_start_coords` | List of 0-indexed overhang start positions |
+| `runtime_s` | Wall-clock time per sequence (seconds) |
+---
+## Citation
+If you use the bundled ligation frequency data from Potapov *et al.*, please cite:
+> Potapov V, *et al.* (2018). Comprehensive Profiling of Four Base Overhang
+> Ligation Fidelity by T4 DNA Ligase and Application to DNA Assembly.
+> *ACS Synthetic Biology*, 7(11), 2665–2674.
+> https://doi.org/10.1021/acssynbio.8b00333

seqsplit-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+seqsplit/__init__.py,sha256=YJyBiWii-9vl36aNVOsD5OEasP3_RtFeyXObUlYHleE,236
+seqsplit/api.py,sha256=PPTteYpOQR9Exk7kcyRbmOF9B_F3dtv2Bn3gvbSEhL4,4570
+seqsplit/cli.py,sha256=5_DDZS3g29_CF-v8dHO9gZ65ms9wSObcf0ywPh6d-II,12063
+seqsplit/search.py,sha256=3vxH8igjpg32wxnpKyRXvkXRzweE8W812s85IrHS9vo,11606
+seqsplit/sequence.py,sha256=oNqx_8tDSOqe5uWqdps8Ceam8HoFc3k_6TuXdJyhUiA,5934
+seqsplit/tables.py,sha256=KNJEmv871pFYy1AHGffIWdZoq_AkY2SpB9Gcp99LiiQ,7411
+seqsplit/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+seqsplit/data/potapov2018_T4_01h_25C_4nt_oh_ligation_freqs.csv,sha256=YhOlDTNZ3u-diKd9uj42pg_ct6ku9xnzdWfG0fZnqeU,137144
+seqsplit/data/potapov2018_T4_01h_37C_4nt_oh_ligation_freqs.csv,sha256=8EoP4XnLH8H5IyY1faCfjna3lWoAWtWRSGeNTnS8jPM,135274
+seqsplit/data/potapov2018_T4_18h_25C_4nt_oh_ligation_freqs.csv,sha256=lyQDIsCNcCtL-kjvqDB4qZ7AZQEAPIkI9aqDnMoQTGw,137187
+seqsplit/data/potapov2018_T4_18h_37C_4nt_oh_ligation_freqs.csv,sha256=RAOdUSM-M65gXeQcsbQwEwAQUJTHfr4UxHEhfT0RtK8,135354
+seqsplit/data/potapov2018_T7_18h_25C_4nt_oh_ligation_freqs.csv,sha256=WMwZ_xnWXvh_eceTSLeJ3T7hXWkKLF8PkmerEpu_4k8,135112
+seqsplit/data/potapov2018_T7_18h_37C_4nt_oh_ligation_freqs.csv,sha256=3sZKeTFamiZlYtf58jmcIilLPZRPcAkuCAtEt-nqO-E,135126
+seqsplit-0.1.0.dist-info/METADATA,sha256=SqrBEtuGA7o6go8oJVtpnc6rQlhepWTHJcQTm11-GhA,4075
+seqsplit-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+seqsplit-0.1.0.dist-info/entry_points.txt,sha256=BRLSzfUnJIlm6ytkKiOq6KYJB_bwXGZek5ft4kWMrPU,47
+seqsplit-0.1.0.dist-info/top_level.txt,sha256=_RB0BNav3i-yjHci2fX6ThUNydWY3NfwEO-xe5OvAM0,9
+seqsplit-0.1.0.dist-info/RECORD,,

seqsplit-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

seqsplit-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ seqsplit = seqsplit.cli:main

seqsplit-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ seqsplit