seqsplit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seqsplit/tables.py ADDED
@@ -0,0 +1,211 @@
1
+ """
2
+ Loading and management of ligation frequency tables.
3
+
4
+ A ligation table is an N×N matrix of overhang-pair ligation counts, where N is
5
+ the number of distinct k-mer overhangs (4^k for k-nt overhangs). See
6
+ docs/ligation_table_format.md for the full CSV specification.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import importlib.resources
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+
18
+
19
# Lookup table indexed by ASCII code point: A -> 0, C -> 1, G -> 2, T -> 3.
# All other entries stay at 0, so callers must validate the alphabet themselves.
BASE_TO_INT = np.zeros(256, dtype=np.uint8)
BASE_TO_INT[[ord(base) for base in "ACGT"]] = (0, 1, 2, 3)
24
+
25
+
26
+ # ---------------------------------------------------------------------------------------
27
+ # Bundled table registry
28
+ # ---------------------------------------------------------------------------------------
29
+ # To add a new bundled table: drop the CSV into seqsplit/data/ and add an entry here.
30
+
31
# Maps each bundled table name to its CSV file name. Every bundled file is
# named "<table name>_4nt_oh_ligation_freqs.csv".
BUILTIN_TABLES: dict[str, str] = {
    table_name: f"{table_name}_4nt_oh_ligation_freqs.csv"
    for table_name in (
        "potapov2018_T4_01h_25C",
        "potapov2018_T4_01h_37C",
        "potapov2018_T4_18h_25C",
        "potapov2018_T4_18h_37C",
        "potapov2018_T7_18h_25C",
        "potapov2018_T7_18h_37C",
    )
}
39
+
40
+
41
@dataclass
class LigationTable:
    """
    Container for one ligation frequency table and its precomputed lookups.

    Attributes
    ----------
    name : str
        Human-readable table name (e.g. 'potapov2018_T4_18h_25C').
    overhang_len : int
        Number of nucleotides per overhang (e.g. 4 for 4-nt overhangs).
    lig_freqs_mtrx : np.ndarray, shape (N, N), dtype uint16
        Raw ligation counts, indexed as [row overhang, column overhang].
        N is expected to equal 4^overhang_len.
    row_overhangs : list[str]
        Overhang sequence for each row index.
    col_overhangs : list[str]
        Overhang sequence for each column index.
    oh_to_row_idx : dict[str, int]
        Overhang sequence → row index.
    oh_to_col_idx : dict[str, int]
        Overhang sequence → column index.
    rev_comp_row_idx_map : np.ndarray, shape (N,), dtype int32
        Entry i is the row index of the reverse complement of row overhang i.
    row_col_idx_map : np.ndarray, shape (N,), dtype int32
        Entry i is the column index of row overhang i.
    kmer_enc_to_row_idx : np.ndarray, shape (4^overhang_len,), dtype int32
        Integer k-mer encoding → row index in lig_freqs_mtrx.
    """

    # Identification
    name: str
    overhang_len: int

    # Raw data and axis labels
    lig_freqs_mtrx: np.ndarray
    row_overhangs: list[str]
    col_overhangs: list[str]

    # Derived lookups
    oh_to_row_idx: dict[str, int]
    oh_to_col_idx: dict[str, int]
    rev_comp_row_idx_map: np.ndarray
    row_col_idx_map: np.ndarray
    kmer_enc_to_row_idx: np.ndarray
79
+
80
+
81
def _get_rev_comp(seq: str) -> str:
    """
    Return the reverse complement of a DNA sequence.

    The bases A/C/G/T are replaced by T/G/C/A and the order is reversed.
    Any other character (including lowercase bases) is kept unchanged.
    """
    complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
    return "".join(complement.get(base, base) for base in reversed(seq))
83
+
84
+
85
def load_ligation_table(
    path: str | Path,
    name: str | None = None,
    overhang_len: int | None = None,
) -> LigationTable:
    """
    Load a ligation frequency table from a CSV file.

    Parameters
    ----------
    path : str or Path
        Path to the CSV file. See docs/ligation_table_format.md for format.
    name : str, optional
        Table name; defaults to the file stem.
    overhang_len : int, optional
        Expected overhang length. If given and it does not match the length
        of the overhangs in the table, a ValueError is raised. If omitted,
        the length is inferred from the table.

    Returns
    -------
    LigationTable
        The count matrix plus all derived lookup arrays. Its ``overhang_len``
        is always the length found in the table, never ``None``.

    Raises
    ------
    ValueError
        If the CSV has no 'Overhang' column or no rows, if a row overhang is
        not a non-empty string over A/C/G/T, if row overhangs differ in
        length or are duplicated, if ``overhang_len`` disagrees with the
        table, if a reverse complement or matching column is missing, or if
        the rows do not cover all 4^k k-mers.
    """
    path = Path(path)
    if name is None:
        name = path.stem

    df = pd.read_csv(path)

    if "Overhang" not in df.columns:
        raise ValueError(
            f"Ligation table CSV must contain an 'Overhang' column "
            f"(see docs/ligation_table_format.md). Got columns: {df.columns.tolist()}"
        )

    oh_index_df = df.set_index("Overhang")
    lig_freqs_mtrx = oh_index_df.to_numpy().astype("uint16")

    row_overhangs: list[str] = oh_index_df.index.tolist()
    col_overhangs: list[str] = oh_index_df.columns.tolist()

    # Without this guard an empty table fails with IndexError on the
    # length inference below.
    if not row_overhangs:
        raise ValueError(
            f"Ligation table '{path.name}' contains no overhang rows "
            f"(see docs/ligation_table_format.md)."
        )

    # BASE_TO_INT maps every non-ACGT byte to 0, so an unchecked label such
    # as 'ANGT' would silently be encoded as if it were 'AAGT'.
    valid_bases = set("ACGT")
    invalid = [
        oh
        for oh in row_overhangs
        if not isinstance(oh, str) or not oh or set(oh) - valid_bases
    ]
    if invalid:
        raise ValueError(
            f"Row overhangs must be non-empty strings over the alphabet ACGT. "
            f"Invalid entries (first 5): {invalid[:5]}"
        )

    # Infer and validate overhang length
    inferred_len = len(row_overhangs[0])
    if not all(len(oh) == inferred_len for oh in row_overhangs):
        raise ValueError("All row overhangs must have the same length.")

    if overhang_len is not None and overhang_len != inferred_len:
        raise ValueError(
            f"--overhang-len={overhang_len} does not match the overhang length "
            f"in '{path.name}' (table has {inferred_len}-nt overhangs). "
        )

    # Index dicts
    oh_to_row_idx = {oh: i for i, oh in enumerate(row_overhangs)}
    oh_to_col_idx = {oh: j for j, oh in enumerate(col_overhangs)}

    if len(oh_to_row_idx) != len(row_overhangs):
        raise ValueError(
            f"Ligation table '{path.name}' contains duplicate row overhangs."
        )

    # Reverse-complement lookup maps (indexed by row index)
    n_rows = len(row_overhangs)
    rev_comp_row_idx_map = np.zeros(n_rows, dtype=np.int32)
    row_col_idx_map = np.zeros(n_rows, dtype=np.int32)

    for oh, idx in oh_to_row_idx.items():
        rc = _get_rev_comp(oh)
        if rc not in oh_to_row_idx:
            raise ValueError(
                f"Reverse complement '{rc}' of overhang '{oh}' is not present "
                f"in the table rows. The table must contain all 4^{inferred_len} "
                f"k-mers (see docs/ligation_table_format.md)."
            )
        if oh not in oh_to_col_idx:
            raise ValueError(
                f"Row overhang '{oh}' has no matching column in '{path.name}' "
                f"(see docs/ligation_table_format.md)."
            )
        rev_comp_row_idx_map[idx] = oh_to_row_idx[rc]
        row_col_idx_map[idx] = oh_to_col_idx[oh]

    # k-mer integer encoding → row index.
    # Use the inferred length: `overhang_len` is None when the caller did not
    # supply an expected value.
    n_kmers = 4 ** inferred_len
    if len(oh_to_row_idx) != n_kmers:
        # Any gap would leave uninitialised slots in the np.empty array below.
        raise ValueError(
            f"Ligation table '{path.name}' has {len(oh_to_row_idx)} distinct row "
            f"overhangs; expected all 4^{inferred_len} = {n_kmers} k-mers "
            f"(see docs/ligation_table_format.md)."
        )

    kmer_enc_to_row_idx = np.empty(n_kmers, dtype=np.int32)
    for kmer_str, idx in oh_to_row_idx.items():
        # Pack two bits per base, first base in the most significant position.
        val = 0
        for base in kmer_str:
            val = (val << 2) | int(BASE_TO_INT[ord(base)])
        kmer_enc_to_row_idx[val] = idx

    return LigationTable(
        name=name,
        overhang_len=inferred_len,
        lig_freqs_mtrx=lig_freqs_mtrx,
        row_overhangs=row_overhangs,
        col_overhangs=col_overhangs,
        oh_to_row_idx=oh_to_row_idx,
        oh_to_col_idx=oh_to_col_idx,
        rev_comp_row_idx_map=rev_comp_row_idx_map,
        row_col_idx_map=row_col_idx_map,
        kmer_enc_to_row_idx=kmer_enc_to_row_idx,
    )
177
+
178
+
179
def list_builtin_tables() -> list[str]:
    """Return the names of the bundled ligation tables, in registry order."""
    return [*BUILTIN_TABLES]
182
+
183
+
184
def load_builtin_table(name: str, overhang_len: int | None = None) -> LigationTable:
    """
    Load a bundled ligation table by name.

    Parameters
    ----------
    name : str
        One of the names returned by :func:`list_builtin_tables`.
    overhang_len : int, optional
        Passed through to :func:`load_ligation_table` for validation.

    Returns
    -------
    LigationTable

    Raises
    ------
    ValueError
        If ``name`` is not a key of ``BUILTIN_TABLES``. Errors raised by
        :func:`load_ligation_table` while parsing the CSV propagate unchanged.
    FileNotFoundError
        If the CSV registered for ``name`` cannot be located in the
        ``seqsplit.data`` package.
    """
    if name not in BUILTIN_TABLES:
        raise ValueError(
            f"Unknown built-in table '{name}'. "
            f"Available tables: {list(BUILTIN_TABLES.keys())}. "
            f"Use --table-path to supply a custom CSV."
        )

    filename = BUILTIN_TABLES[name]
    not_found_msg = (
        f"Built-in table '{name}' ('{filename}') was not found in the package data. "
        f"Copy the CSV into seqsplit/data/ and reinstall, or use --table-path."
    )

    # Only resource resolution is guarded. Wrapping the load itself would
    # misreport a TypeError or FileNotFoundError raised while parsing the CSV
    # as a missing package file.
    try:
        resource = importlib.resources.files("seqsplit.data").joinpath(filename)
        is_present = resource.is_file()
    except (FileNotFoundError, TypeError) as err:
        raise FileNotFoundError(not_found_msg) from err
    if not is_present:
        raise FileNotFoundError(not_found_msg)

    # as_file yields a real filesystem path for the duration of the block,
    # so the table must be read inside it.
    with importlib.resources.as_file(resource) as csv_path:
        return load_ligation_table(csv_path, name=name, overhang_len=overhang_len)
@@ -0,0 +1,116 @@
1
+ Metadata-Version: 2.4
2
+ Name: seqsplit
3
+ Version: 0.1.0
4
+ Summary: Splits DNA sequences at optimal ligation sites for synthesis
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy>=1.24
9
+ Requires-Dist: pandas>=2.0
10
+ Provides-Extra: analysis
11
+ Requires-Dist: matplotlib>=3.7; extra == "analysis"
12
+ Requires-Dist: statsmodels>=0.14; extra == "analysis"
13
+ Requires-Dist: scipy>=1.10; extra == "analysis"
14
+ Provides-Extra: dev
15
+ Requires-Dist: pytest>=7.0; extra == "dev"
16
+ Requires-Dist: pytest-cov; extra == "dev"
17
+
18
+ # seqsplit
19
+
20
+ **Splitting DNA sequences at optimal ligation sites for synthesis.**
21
+
22
+ `seqsplit` takes a set of long sequences and splits them into fragments by
23
+ selecting overhang positions that maximize assembly fidelity. These overhangs
24
+ are selected using a beam search guided by an empirical ligation frequency
25
+ table (e.g. from Potapov *et al.*, 2018).
26
+
27
+ ---
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install seqsplit
33
+ ```
34
+
35
+ **Requirements:** Python ≥ 3.10, NumPy ≥ 1.24, pandas ≥ 2.0.
36
+
37
+ ---
38
+
39
+ ## Quick start
40
+
41
+ ### Command line
42
+
43
+ ```bash
44
+ # Beam search guided by greedy heuristic (fast, good default)
45
+ seqsplit my_sequences.fna --table potapov2018_T4_18h_25C
46
+
47
+ # Beam search guided by rollout-based heuristic (slower, generally not recommended)
48
+ seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
49
+ --mode rollout --rollout-samples 50
50
+
51
+ # More pessimistic rollout heuristic (take fidelity at 98th percentile of rollout sample instead of max)
52
+ seqsplit my_sequences.fna --table potapov2018_T4_18h_25C \
53
+ --mode rollout --rollout-samples 100 --heuristic-percentile 98
54
+
55
+ # Use a custom ligation table (see docs/ligation_table_format.md for the CSV format)
56
+ seqsplit my_sequences.fna --table-path my_conditions.csv
57
+
58
+ # List bundled ligation frequency tables
59
+ seqsplit --list-tables
60
+ ```
61
+
62
+ Results are written to `my_sequences.seqsplit_results.csv` by default (use `-o`
63
+ to change).
64
+
65
+ ---
66
+
67
+ ## Parameters
68
+
69
+ | CLI flag | API argument | Default | Description |
70
+ |---|---|---|---|
71
+ | *(positional)* | `fna_path` | — | Input FASTA / FNA file of sequences to split |
72
+ | `--table NAME` | — | — | Bundled ligation table name |
73
+ | `--table-path CSV` | `table` | — | Custom ligation table CSV |
74
+ | `--max-oligo-len` | `max_oligo_len` | 250 | Max oligo/fragment length (nt) |
75
+ | `--region-len` | `region_len` | 20 | Candidate overhang region width (nt) |
76
+ | `--overhang-len` | `overhang_len` | 4 | Overhang length (nt); must match selected/provided ligation frequencies table |
77
+ | `--beam-width` | `beam_width` | 100 | Beam width for the search |
78
+ | `--seed` | `seed` | 42 | Random seed |
79
+ | `--mode` | `mode` | `greedy` | `greedy` or `rollout` |
80
+ | `--rollout-samples` | `rollout_samples` | 100 | Number of random path completions per candidate (rollout mode only) |
81
+ | `--heuristic-percentile` | `heuristic_percentile` | 100 | Percentile of rollout fidelity scores to use as heuristic (rollout mode only) |
82
+
83
+ ### Search modes
84
+
85
+ **`greedy`** (default)
86
+ Each candidate prefix is scored by the fidelity of its overhangs alone.
87
+ There is no lookahead component.
88
+
89
+ **`rollout`**
90
+ Each candidate prefix is randomly extended to a complete path
91
+ `--rollout-samples` times; the heuristic score is the maximum (or N-th
92
+ percentile) of those complete-path fidelities.
93
+
94
+ ### Output CSV columns
95
+
96
+ | Column | Description |
97
+ |--------|-------------|
98
+ | `seq_header` | FASTA header |
99
+ | `seq_len_nt` | Sequence length (nt) |
100
+ | `num_fragments` | Number of fragments produced |
101
+ | `best_log_fidelity` | Log of assembly fidelity |
102
+ | `best_fidelity` | Assembly fidelity (0–1) |
103
+ | `overhangs` | List of selected overhang sequences |
104
+ | `overhang_start_coords` | List of 0-indexed overhang start positions |
105
+ | `runtime_s` | Wall-clock time per sequence (seconds) |
106
+
107
+ ---
108
+
109
+ ## Citation
110
+
111
+ If you use the bundled ligation frequency data from Potapov *et al.*, please cite:
112
+
113
+ > Potapov V, *et al.* (2018). Comprehensive Profiling of Four Base Overhang
114
+ > Ligation Fidelity by T4 DNA Ligase and Application to DNA Assembly.
115
+ > *ACS Synthetic Biology*, 7(11), 2665–2674.
116
+ > https://doi.org/10.1021/acssynbio.8b00333
@@ -0,0 +1,18 @@
1
+ seqsplit/__init__.py,sha256=YJyBiWii-9vl36aNVOsD5OEasP3_RtFeyXObUlYHleE,236
2
+ seqsplit/api.py,sha256=PPTteYpOQR9Exk7kcyRbmOF9B_F3dtv2Bn3gvbSEhL4,4570
3
+ seqsplit/cli.py,sha256=5_DDZS3g29_CF-v8dHO9gZ65ms9wSObcf0ywPh6d-II,12063
4
+ seqsplit/search.py,sha256=3vxH8igjpg32wxnpKyRXvkXRzweE8W812s85IrHS9vo,11606
5
+ seqsplit/sequence.py,sha256=oNqx_8tDSOqe5uWqdps8Ceam8HoFc3k_6TuXdJyhUiA,5934
6
+ seqsplit/tables.py,sha256=KNJEmv871pFYy1AHGffIWdZoq_AkY2SpB9Gcp99LiiQ,7411
7
+ seqsplit/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ seqsplit/data/potapov2018_T4_01h_25C_4nt_oh_ligation_freqs.csv,sha256=YhOlDTNZ3u-diKd9uj42pg_ct6ku9xnzdWfG0fZnqeU,137144
9
+ seqsplit/data/potapov2018_T4_01h_37C_4nt_oh_ligation_freqs.csv,sha256=8EoP4XnLH8H5IyY1faCfjna3lWoAWtWRSGeNTnS8jPM,135274
10
+ seqsplit/data/potapov2018_T4_18h_25C_4nt_oh_ligation_freqs.csv,sha256=lyQDIsCNcCtL-kjvqDB4qZ7AZQEAPIkI9aqDnMoQTGw,137187
11
+ seqsplit/data/potapov2018_T4_18h_37C_4nt_oh_ligation_freqs.csv,sha256=RAOdUSM-M65gXeQcsbQwEwAQUJTHfr4UxHEhfT0RtK8,135354
12
+ seqsplit/data/potapov2018_T7_18h_25C_4nt_oh_ligation_freqs.csv,sha256=WMwZ_xnWXvh_eceTSLeJ3T7hXWkKLF8PkmerEpu_4k8,135112
13
+ seqsplit/data/potapov2018_T7_18h_37C_4nt_oh_ligation_freqs.csv,sha256=3sZKeTFamiZlYtf58jmcIilLPZRPcAkuCAtEt-nqO-E,135126
14
+ seqsplit-0.1.0.dist-info/METADATA,sha256=SqrBEtuGA7o6go8oJVtpnc6rQlhepWTHJcQTm11-GhA,4075
15
+ seqsplit-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
16
+ seqsplit-0.1.0.dist-info/entry_points.txt,sha256=BRLSzfUnJIlm6ytkKiOq6KYJB_bwXGZek5ft4kWMrPU,47
17
+ seqsplit-0.1.0.dist-info/top_level.txt,sha256=_RB0BNav3i-yjHci2fX6ThUNydWY3NfwEO-xe5OvAM0,9
18
+ seqsplit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ seqsplit = seqsplit.cli:main
@@ -0,0 +1 @@
1
+ seqsplit