seqsplit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seqsplit-0.1.0/PKG-INFO +116 -0
- seqsplit-0.1.0/README.md +99 -0
- seqsplit-0.1.0/pyproject.toml +36 -0
- seqsplit-0.1.0/seqsplit/__init__.py +10 -0
- seqsplit-0.1.0/seqsplit/api.py +133 -0
- seqsplit-0.1.0/seqsplit/cli.py +370 -0
- seqsplit-0.1.0/seqsplit/data/__init__.py +0 -0
- seqsplit-0.1.0/seqsplit/data/potapov2018_T4_01h_25C_4nt_oh_ligation_freqs.csv +257 -0
- seqsplit-0.1.0/seqsplit/data/potapov2018_T4_01h_37C_4nt_oh_ligation_freqs.csv +257 -0
- seqsplit-0.1.0/seqsplit/data/potapov2018_T4_18h_25C_4nt_oh_ligation_freqs.csv +257 -0
- seqsplit-0.1.0/seqsplit/data/potapov2018_T4_18h_37C_4nt_oh_ligation_freqs.csv +257 -0
- seqsplit-0.1.0/seqsplit/data/potapov2018_T7_18h_25C_4nt_oh_ligation_freqs.csv +257 -0
- seqsplit-0.1.0/seqsplit/data/potapov2018_T7_18h_37C_4nt_oh_ligation_freqs.csv +257 -0
- seqsplit-0.1.0/seqsplit/search.py +312 -0
- seqsplit-0.1.0/seqsplit/sequence.py +191 -0
- seqsplit-0.1.0/seqsplit/tables.py +211 -0
- seqsplit-0.1.0/seqsplit.egg-info/PKG-INFO +116 -0
- seqsplit-0.1.0/seqsplit.egg-info/SOURCES.txt +21 -0
- seqsplit-0.1.0/seqsplit.egg-info/dependency_links.txt +1 -0
- seqsplit-0.1.0/seqsplit.egg-info/entry_points.txt +2 -0
- seqsplit-0.1.0/seqsplit.egg-info/requires.txt +11 -0
- seqsplit-0.1.0/seqsplit.egg-info/top_level.txt +4 -0
- seqsplit-0.1.0/setup.cfg +4 -0
seqsplit-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seqsplit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Splits DNA sequences at optimal ligation sites for synthesis
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy>=1.24
|
|
9
|
+
Requires-Dist: pandas>=2.0
|
|
10
|
+
Provides-Extra: analysis
|
|
11
|
+
Requires-Dist: matplotlib>=3.7; extra == "analysis"
|
|
12
|
+
Requires-Dist: statsmodels>=0.14; extra == "analysis"
|
|
13
|
+
Requires-Dist: scipy>=1.10; extra == "analysis"
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
16
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
17
|
+
|
|
18
|
+
# seqsplit
|
|
19
|
+
|
|
20
|
+
**Splitting DNA sequences at optimal ligation sites for synthesis.**
|
|
21
|
+
|
|
22
|
+
`seqsplit` takes a set of long sequences and splits them into fragments by
|
|
23
|
+
selecting overhang positions that maximize assembly fidelity. These overhangs
|
|
24
|
+
are selected using a beam search guided by an empirical ligation frequency
|
|
25
|
+
table (e.g. from Potapov *et al.*, 2018).
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install seqsplit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
**Requirements:** Python ≥ 3.10, NumPy ≥ 1.24, pandas ≥ 2.0.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Quick start
|
|
40
|
+
|
|
41
|
+
### Command line
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Beam search guided by greedy heuristic (fast, good default)
|
|
45
|
+
seqsplit my_sequences.fna --table potapov2018_T4_18h_25C
|
|
46
|
+
|
|
47
|
+
# Beam search guided by rollout-based heuristic (slower, generally not recommended)
|
|
48
|
+
seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
|
|
49
|
+
--mode rollout --rollout-samples 50
|
|
50
|
+
|
|
51
|
+
# More pessimistic rollout heuristic (take fidelity at 98th percentile of rollout sample instead of max)
|
|
52
|
+
seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
|
|
53
|
+
--mode rollout --rollout-samples 100 --heuristic-percentile 98
|
|
54
|
+
|
|
55
|
+
# Input custom ligation table (see docs/ligation_freq_table_format.md for formatting)
|
|
56
|
+
seqsplit my_sequences.fna --table-path my_conditions.csv
|
|
57
|
+
|
|
58
|
+
# List bundled ligation frequency tables
|
|
59
|
+
seqsplit --list-tables
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Results are written to `my_sequences.seqsplit_results.csv` by default (use `-o`
|
|
63
|
+
to change).
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Parameters
|
|
68
|
+
|
|
69
|
+
| CLI flag | API argument | Default | Description |
|
|
70
|
+
|---|---|---|---|
|
|
71
|
+
| *(positional)* | `fna_path` | — | Input FASTA / FNA file of sequences to split |
|
|
72
|
+
| `--table NAME` | — | — | Bundled ligation table name |
|
|
73
|
+
| `--table-path CSV` | `table` | — | Custom ligation table CSV |
|
|
74
|
+
| `--max-oligo-len` | `max_oligo_len` | 250 | Max oligo/fragment length (nt) |
|
|
75
|
+
| `--region-len` | `region_len` | 20 | Candidate overhang region width (nt) |
|
|
76
|
+
| `--overhang-len` | `overhang_len` | 4 | Overhang length (nt); must match the selected/provided ligation frequency table |
|
|
77
|
+
| `--beam-width` | `beam_width` | 100 | Beam width for the search |
|
|
78
|
+
| `--seed` | `seed` | 42 | Random seed |
|
|
79
|
+
| `--mode` | `mode` | `greedy` | `greedy` or `rollout` |
|
|
80
|
+
| `--rollout-samples` | `rollout_samples` | 100 | Number of random path completions per candidate (rollout mode only) |
|
|
81
|
+
| `--heuristic-percentile` | `heuristic_percentile` | 100 | Percentile of rollout fidelity scores to use as heuristic (rollout mode only) |
|
|
82
|
+
|
|
83
|
+
### Search modes
|
|
84
|
+
|
|
85
|
+
**`greedy`** (default)
|
|
86
|
+
Each candidate prefix is scored by the fidelity of its overhangs alone.
|
|
87
|
+
There is no lookahead component.
|
|
88
|
+
|
|
89
|
+
**`rollout`**
|
|
90
|
+
Each candidate prefix is randomly extended to a complete path
|
|
91
|
+
`--rollout-samples` times; the heuristic score is the maximum (or N-th
|
|
92
|
+
percentile) of those complete-path fidelities.
|
|
93
|
+
|
|
94
|
+
### Output CSV columns
|
|
95
|
+
|
|
96
|
+
| Column | Description |
|
|
97
|
+
|--------|-------------|
|
|
98
|
+
| `seq_header` | FASTA header |
|
|
99
|
+
| `seq_len_nt` | Sequence length (nt) |
|
|
100
|
+
| `num_fragments` | Number of fragments produced |
|
|
101
|
+
| `best_log_fidelity` | Natural log of assembly fidelity |
|
|
102
|
+
| `best_fidelity` | Assembly fidelity (0–1) |
|
|
103
|
+
| `overhangs` | List of selected overhang sequences |
|
|
104
|
+
| `overhang_start_coords` | List of 0-indexed overhang start positions |
|
|
105
|
+
| `runtime_s` | Wall-clock time per sequence (seconds) |
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Citation
|
|
110
|
+
|
|
111
|
+
If you use the bundled ligation frequency data from Potapov *et al.*, please cite:
|
|
112
|
+
|
|
113
|
+
> Potapov V, *et al.* (2018). Comprehensive Profiling of Four Base Overhang
|
|
114
|
+
> Ligation Fidelity by T4 DNA Ligase and Application to DNA Assembly.
|
|
115
|
+
> *ACS Synthetic Biology*, 7(11), 2665–2674.
|
|
116
|
+
> https://doi.org/10.1021/acssynbio.8b00333
|
seqsplit-0.1.0/README.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# seqsplit
|
|
2
|
+
|
|
3
|
+
**Splitting DNA sequences at optimal ligation sites for synthesis.**
|
|
4
|
+
|
|
5
|
+
`seqsplit` takes a set of long sequences and splits them into fragments by
|
|
6
|
+
selecting overhang positions that maximize assembly fidelity. These overhangs
|
|
7
|
+
are selected using a beam search guided by an empirical ligation frequency
|
|
8
|
+
table (e.g. from Potapov *et al.*, 2018).
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install seqsplit
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
**Requirements:** Python ≥ 3.10, NumPy ≥ 1.24, pandas ≥ 2.0.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Quick start
|
|
23
|
+
|
|
24
|
+
### Command line
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Beam search guided by greedy heuristic (fast, good default)
|
|
28
|
+
seqsplit my_sequences.fna --table potapov2018_T4_18h_25C
|
|
29
|
+
|
|
30
|
+
# Beam search guided by rollout-based heuristic (slower, generally not recommended)
|
|
31
|
+
seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
|
|
32
|
+
--mode rollout --rollout-samples 50
|
|
33
|
+
|
|
34
|
+
# More pessimistic rollout heuristic (take fidelity at 98th percentile of rollout sample instead of max)
|
|
35
|
+
seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
|
|
36
|
+
--mode rollout --rollout-samples 100 --heuristic-percentile 98
|
|
37
|
+
|
|
38
|
+
# Input custom ligation table (see docs/ligation_freq_table_format.md for formatting)
|
|
39
|
+
seqsplit my_sequences.fna --table-path my_conditions.csv
|
|
40
|
+
|
|
41
|
+
# List bundled ligation frequency tables
|
|
42
|
+
seqsplit --list-tables
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Results are written to `my_sequences.seqsplit_results.csv` by default (use `-o`
|
|
46
|
+
to change).
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Parameters
|
|
51
|
+
|
|
52
|
+
| CLI flag | API argument | Default | Description |
|
|
53
|
+
|---|---|---|---|
|
|
54
|
+
| *(positional)* | `fna_path` | — | Input FASTA / FNA file of sequences to split |
|
|
55
|
+
| `--table NAME` | — | — | Bundled ligation table name |
|
|
56
|
+
| `--table-path CSV` | `table` | — | Custom ligation table CSV |
|
|
57
|
+
| `--max-oligo-len` | `max_oligo_len` | 250 | Max oligo/fragment length (nt) |
|
|
58
|
+
| `--region-len` | `region_len` | 20 | Candidate overhang region width (nt) |
|
|
59
|
+
| `--overhang-len` | `overhang_len` | 4 | Overhang length (nt); must match the selected/provided ligation frequency table |
|
|
60
|
+
| `--beam-width` | `beam_width` | 100 | Beam width for the search |
|
|
61
|
+
| `--seed` | `seed` | 42 | Random seed |
|
|
62
|
+
| `--mode` | `mode` | `greedy` | `greedy` or `rollout` |
|
|
63
|
+
| `--rollout-samples` | `rollout_samples` | 100 | Number of random path completions per candidate (rollout mode only) |
|
|
64
|
+
| `--heuristic-percentile` | `heuristic_percentile` | 100 | Percentile of rollout fidelity scores to use as heuristic (rollout mode only) |
|
|
65
|
+
|
|
66
|
+
### Search modes
|
|
67
|
+
|
|
68
|
+
**`greedy`** (default)
|
|
69
|
+
Each candidate prefix is scored by the fidelity of its overhangs alone.
|
|
70
|
+
There is no lookahead component.
|
|
71
|
+
|
|
72
|
+
**`rollout`**
|
|
73
|
+
Each candidate prefix is randomly extended to a complete path
|
|
74
|
+
`--rollout-samples` times; the heuristic score is the maximum (or N-th
|
|
75
|
+
percentile) of those complete-path fidelities.
|
|
76
|
+
|
|
77
|
+
### Output CSV columns
|
|
78
|
+
|
|
79
|
+
| Column | Description |
|
|
80
|
+
|--------|-------------|
|
|
81
|
+
| `seq_header` | FASTA header |
|
|
82
|
+
| `seq_len_nt` | Sequence length (nt) |
|
|
83
|
+
| `num_fragments` | Number of fragments produced |
|
|
84
|
+
| `best_log_fidelity` | Natural log of assembly fidelity |
|
|
85
|
+
| `best_fidelity` | Assembly fidelity (0–1) |
|
|
86
|
+
| `overhangs` | List of selected overhang sequences |
|
|
87
|
+
| `overhang_start_coords` | List of 0-indexed overhang start positions |
|
|
88
|
+
| `runtime_s` | Wall-clock time per sequence (seconds) |
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Citation
|
|
93
|
+
|
|
94
|
+
If you use the bundled ligation frequency data from Potapov *et al.*, please cite:
|
|
95
|
+
|
|
96
|
+
> Potapov V, *et al.* (2018). Comprehensive Profiling of Four Base Overhang
|
|
97
|
+
> Ligation Fidelity by T4 DNA Ligase and Application to DNA Assembly.
|
|
98
|
+
> *ACS Synthetic Biology*, 7(11), 2665–2674.
|
|
99
|
+
> https://doi.org/10.1021/acssynbio.8b00333
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "seqsplit"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Splits DNA sequences at optimal ligation sites for synthesis"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy>=1.24",
|
|
14
|
+
"pandas>=2.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
# For extended analysis / plotting utilities
|
|
19
|
+
analysis = [
|
|
20
|
+
"matplotlib>=3.7",
|
|
21
|
+
"statsmodels>=0.14",
|
|
22
|
+
"scipy>=1.10",
|
|
23
|
+
]
|
|
24
|
+
dev = [
|
|
25
|
+
"pytest>=7.0",
|
|
26
|
+
"pytest-cov",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
seqsplit = "seqsplit.cli:main"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["."]
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.package-data]
|
|
36
|
+
"seqsplit.data" = ["*.csv"]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Python API for seqsplit.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from .tables import LigationTable
|
|
12
|
+
from .sequence import get_all_possible_idx_matrices
|
|
13
|
+
from .search import run_beam_search
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def split_fna(
    fna_path: str,
    table: "LigationTable",
    *,
    max_oligo_len: int = 250,
    region_len: int = 20,
    overhang_len: int = 4,
    beam_width: int = 100,
    mode: str = "greedy",
    rollout_samples: int = 100,
    heuristic_percentile: float = 100.0,
    seed: int = 42,
    verbose: bool = True,
) -> list[dict[str, Any]]:
    """
    Split all sequences in a FASTA/FNA file at optimal ligation sites.

    Parameters
    ----------
    fna_path : str
        Path to input FASTA/FNA file.
    table : LigationTable
        Loaded ligation frequency table. Obtain via
        :func:`~seqsplit.tables.load_ligation_table` or
        :func:`~seqsplit.tables.load_builtin_table`.
    max_oligo_len : int
        Maximum allowed oligo/fragment length in nt.
    region_len : int
        Width of each candidate overhang region.
        ``region_len - overhang_len + 1`` candidate overhangs are evaluated
        per region, so ``region_len`` must be at least ``overhang_len``.
    overhang_len : int
        Overhang length in nt. Must match the ligation table.
    beam_width : int
        Number of partial paths kept alive in the beam.
    mode : {'greedy', 'rollout'}
        'greedy' scores by current-prefix fidelity only (fast, recommended).
        'rollout' uses random completions as a lookahead heuristic.
    rollout_samples : int
        Number of random rollouts used to evaluate each candidate (rollout
        mode only; the search is given 0 rollouts in greedy mode).
    heuristic_percentile : float
        Percentile of rollout scores used as the heuristic to guide the search.
        100 → max (default), 98 → more pessimistic heuristic, etc.
    seed : int
        NumPy random seed for reproducibility.
    verbose : bool
        Print per-sequence progress.

    Returns
    -------
    list of dicts, one per input sequence. On success a dict has keys:
        * ``header``          – header string of sequence from FASTA/FNA
        * ``seq_len``         – sequence length in nt
        * ``num_fragments``   – number of fragments produced
        * ``overhangs``       – list of overhang DNA strings
        * ``oh_row_indices``  – list of ligation-table row indices
        * ``oh_start_coords`` – list of 0-indexed overhang start positions
        * ``log_fidelity``    – best log-fidelity
        * ``fidelity``        – best fidelity (exp of log_fidelity)
        * ``runtime_s``       – wall-clock time for this sequence

    When the search reports no solution for a sequence, its dict contains
    only ``header`` and ``error`` (set to ``"no solution found"``).

    Raises
    ------
    ValueError
        If ``mode`` is not 'greedy' or 'rollout', if ``overhang_len`` does
        not match ``table.overhang_len``, or if ``region_len`` is smaller
        than ``overhang_len``.
    """
    if mode not in ("greedy", "rollout"):
        raise ValueError(f"mode must be 'greedy' or 'rollout', got '{mode}'.")
    if overhang_len != table.overhang_len:
        raise ValueError(
            f"overhang_len={overhang_len} does not match the loaded table "
            f"(table.overhang_len={table.overhang_len})."
        )

    # Number of overhang start positions that fit inside one region.
    branching_factor = region_len - overhang_len + 1
    if branching_factor < 1:
        # Fail early with a clear message instead of handing a zero or
        # negative branching factor to the matrix builder and beam search.
        raise ValueError(
            f"region_len={region_len} must be >= overhang_len={overhang_len} "
            "so that each region contains at least one candidate overhang."
        )

    # Greedy mode is expressed to the search as "zero rollouts".
    n_rollouts = rollout_samples if mode == "rollout" else 0
    # One generator is created here and passed to every per-sequence search.
    rng = np.random.default_rng(seed)

    all_matrices = get_all_possible_idx_matrices(
        fna_path,
        table.kmer_enc_to_row_idx,
        oh_region_len=region_len,
        overhang_len=overhang_len,
        max_oligo_len=max_oligo_len,
    )

    results: list[dict[str, Any]] = []
    for header, (mtrx, region_starts, seq_len) in all_matrices.items():
        oh_list, log_fid, runtime, oh_coords = run_beam_search(
            possible_idx_matrix=mtrx,
            branching_factor=branching_factor,
            total_num_regions=mtrx.shape[0],
            beam_width=beam_width,
            rollout_samples=n_rollouts,
            region_starts=region_starts,
            rng=rng,
            table=table,
            overhang_len=overhang_len,
            heuristic_percentile=heuristic_percentile,
            verbose=verbose,
        )

        if oh_list is None:
            # Record the failure and keep going with the remaining sequences.
            results.append({"header": header, "error": "no solution found"})
            continue

        # Translate ligation-table row indices back into overhang strings.
        oh_strings = [table.row_overhangs[idx] for idx in oh_list]
        results.append(
            {
                "header": header,
                "seq_len": seq_len,
                # One more fragment than there are overhang junctions.
                "num_fragments": len(oh_strings) + 1,
                "overhangs": oh_strings,
                "oh_row_indices": oh_list,
                "oh_start_coords": oh_coords,
                "log_fidelity": float(log_fid),
                "fidelity": float(np.exp(log_fid)),
                "runtime_s": float(runtime),
            }
        )

    return results
|