kmer-learn 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kmer_learn-0.1.0/LICENSE +1 -0
- kmer_learn-0.1.0/MANIFEST.in +7 -0
- kmer_learn-0.1.0/PKG-INFO +187 -0
- kmer_learn-0.1.0/README.md +145 -0
- kmer_learn-0.1.0/examples/01_basic_kernel_matrix.ipynb +73 -0
- kmer_learn-0.1.0/examples/02_distance_metrics_and_kernels.ipynb +355 -0
- kmer_learn-0.1.0/examples/03_svc_with_kernel.ipynb +111 -0
- kmer_learn-0.1.0/examples/04_clustering_sequences.ipynb +55 -0
- kmer_learn-0.1.0/examples/05_score_long_sequence.ipynb +114 -0
- kmer_learn-0.1.0/examples/06_weighted_kernel.ipynb +84 -0
- kmer_learn-0.1.0/examples/07_transform_and_comparison.ipynb +148 -0
- kmer_learn-0.1.0/examples/08_windowed_3d_tensors.ipynb +87 -0
- kmer_learn-0.1.0/examples/09_spectrum_encoder_and_differential.ipynb +148 -0
- kmer_learn-0.1.0/examples/10_gappy_encoder.ipynb +127 -0
- kmer_learn-0.1.0/examples/11_mismatch_encoder.ipynb +127 -0
- kmer_learn-0.1.0/examples/12_shuffler_and_chunker.ipynb +181 -0
- kmer_learn-0.1.0/kmer/__init__.py +19 -0
- kmer_learn-0.1.0/kmer/_common.h +227 -0
- kmer_learn-0.1.0/kmer/distance/__init__.py +25 -0
- kmer_learn-0.1.0/kmer/distance/_base.py +175 -0
- kmer_learn-0.1.0/kmer/distance/alignment.py +275 -0
- kmer_learn-0.1.0/kmer/distance/edit.py +98 -0
- kmer_learn-0.1.0/kmer/encoders/__init__.py +18 -0
- kmer_learn-0.1.0/kmer/encoders/_native/__init__.py +0 -0
- kmer_learn-0.1.0/kmer/encoders/_native/_common.h +10 -0
- kmer_learn-0.1.0/kmer/encoders/_native/_gappy.c +377 -0
- kmer_learn-0.1.0/kmer/encoders/_native/_gappy_pylib.c +280 -0
- kmer_learn-0.1.0/kmer/encoders/_native/_mismatch.c +202 -0
- kmer_learn-0.1.0/kmer/encoders/_native/_mismatch_pylib.c +137 -0
- kmer_learn-0.1.0/kmer/encoders/_native/_spectrum.c +169 -0
- kmer_learn-0.1.0/kmer/encoders/_native/_spectrum_pylib.c +146 -0
- kmer_learn-0.1.0/kmer/encoders/gappy.py +199 -0
- kmer_learn-0.1.0/kmer/encoders/mismatch.py +99 -0
- kmer_learn-0.1.0/kmer/encoders/spectrum.py +142 -0
- kmer_learn-0.1.0/kmer/kernels/__init__.py +28 -0
- kmer_learn-0.1.0/kmer/kernels/_native/__init__.py +9 -0
- kmer_learn-0.1.0/kmer/kernels/_native/_gkmkern_pylib.c +1307 -0
- kmer_learn-0.1.0/kmer/kernels/_native/gkmkern.c +2603 -0
- kmer_learn-0.1.0/kmer/kernels/_native/gkmkern.h +311 -0
- kmer_learn-0.1.0/kmer/kernels/gkmkernel.py +941 -0
- kmer_learn-0.1.0/kmer/models/__init__.py +21 -0
- kmer_learn-0.1.0/kmer/models/differential.py +193 -0
- kmer_learn-0.1.0/kmer/models/knn.py +64 -0
- kmer_learn-0.1.0/kmer/models/svm.py +146 -0
- kmer_learn-0.1.0/kmer/perturb/__init__.py +21 -0
- kmer_learn-0.1.0/kmer/perturb/_base.py +63 -0
- kmer_learn-0.1.0/kmer/perturb/_native/__init__.py +5 -0
- kmer_learn-0.1.0/kmer/perturb/_native/_chunker.c +321 -0
- kmer_learn-0.1.0/kmer/perturb/_native/_chunker_pylib.c +362 -0
- kmer_learn-0.1.0/kmer/perturb/_native/_common.h +30 -0
- kmer_learn-0.1.0/kmer/perturb/_native/_shuffler.c +674 -0
- kmer_learn-0.1.0/kmer/perturb/_native/_shuffler_pylib.c +391 -0
- kmer_learn-0.1.0/kmer/perturb/chunker.py +98 -0
- kmer_learn-0.1.0/kmer/perturb/shuffler.py +92 -0
- kmer_learn-0.1.0/kmer/tests/__init__.py +0 -0
- kmer_learn-0.1.0/kmer/tests/brute_force_reference.py +155 -0
- kmer_learn-0.1.0/kmer/tests/gkm/__init__.py +0 -0
- kmer_learn-0.1.0/kmer/tests/gkm/test_gkmkern.py +622 -0
- kmer_learn-0.1.0/kmer/tests/test_background_model.py +87 -0
- kmer_learn-0.1.0/kmer/tests/test_chunker.py +296 -0
- kmer_learn-0.1.0/kmer/tests/test_differential.py +197 -0
- kmer_learn-0.1.0/kmer/tests/test_distances.py +292 -0
- kmer_learn-0.1.0/kmer/tests/test_encoders.py +342 -0
- kmer_learn-0.1.0/kmer/tests/test_gkm_reference.py +205 -0
- kmer_learn-0.1.0/kmer/tests/test_mismatch.py +118 -0
- kmer_learn-0.1.0/kmer/tests/test_models.py +137 -0
- kmer_learn-0.1.0/kmer/tests/test_shuffler.py +281 -0
- kmer_learn-0.1.0/kmer/utils/__init__.py +23 -0
- kmer_learn-0.1.0/kmer/utils/seq.py +86 -0
- kmer_learn-0.1.0/kmer_learn.egg-info/PKG-INFO +187 -0
- kmer_learn-0.1.0/kmer_learn.egg-info/SOURCES.txt +75 -0
- kmer_learn-0.1.0/kmer_learn.egg-info/dependency_links.txt +1 -0
- kmer_learn-0.1.0/kmer_learn.egg-info/requires.txt +22 -0
- kmer_learn-0.1.0/kmer_learn.egg-info/top_level.txt +1 -0
- kmer_learn-0.1.0/pyproject.toml +74 -0
- kmer_learn-0.1.0/setup.cfg +4 -0
- kmer_learn-0.1.0/setup.py +89 -0
kmer_learn-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
TODO: Add license here.
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kmer-learn
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A modern Python toolkit for classical sequence machine learning
|
|
5
|
+
Author: Arsenii Zinkevich
|
|
6
|
+
Project-URL: Homepage, https://github.com/synbioml/kmer-learn
|
|
7
|
+
Project-URL: Repository, https://github.com/synbioml/kmer-learn
|
|
8
|
+
Project-URL: Issues, https://github.com/synbioml/kmer-learn/issues
|
|
9
|
+
Keywords: bioinformatics,machine-learning,sequence-analysis,kmer,svm,kernel,genomics
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: C
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy>=1.20
|
|
24
|
+
Requires-Dist: scipy>=1.7
|
|
25
|
+
Requires-Dist: scikit-learn>=1.0
|
|
26
|
+
Provides-Extra: fast
|
|
27
|
+
Requires-Dist: rapidfuzz>=3.0; extra == "fast"
|
|
28
|
+
Requires-Dist: parasail>=1.3; extra == "fast"
|
|
29
|
+
Provides-Extra: io
|
|
30
|
+
Requires-Dist: pyfastx>=0.8; extra == "io"
|
|
31
|
+
Provides-Extra: test
|
|
32
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
35
|
+
Requires-Dist: rapidfuzz>=3.0; extra == "dev"
|
|
36
|
+
Requires-Dist: parasail>=1.3; extra == "dev"
|
|
37
|
+
Requires-Dist: jupyter; extra == "dev"
|
|
38
|
+
Requires-Dist: matplotlib; extra == "dev"
|
|
39
|
+
Requires-Dist: nbformat; extra == "dev"
|
|
40
|
+
Requires-Dist: nbclient; extra == "dev"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# kmer-learn
|
|
44
|
+
|
|
45
|
+
Classical machine learning primitives for nucleotide sequences — kernels, encoders, distances, models, and sequence perturbation, all in one composable Python package.
|
|
46
|
+
|
|
47
|
+
> **⚠ API stability:** This package is in early development (v0.x). The public API is **not yet stable** — breaking changes may be introduced between minor versions until v1.0. Pin your dependency to an exact version (e.g. `kmer-learn==0.1.0`) if reproducibility matters.
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install kmer-learn
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
From source (requires a C compiler):
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://github.com/synbioml/kmer-learn.git
|
|
59
|
+
cd kmer-learn
|
|
60
|
+
pip install -e .
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Optional backends (strongly recommended for speed):
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install rapidfuzz parasail # 10-100x faster edit distances and alignments
|
|
67
|
+
pip install pyfastx # fast FASTA parsing for examples
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Features
|
|
71
|
+
|
|
72
|
+
### Kernels
|
|
73
|
+
- **GKMKernel** family — gapped k-mer kernels from gkmSVM / LS-GKM (Ghandi et al. 2014; Lee 2016). C-backed, OpenMP-parallel, with full/estimated/truncated mismatch schemes, 6 post-transforms (RBF, poly, sigmoid, exponential, …), 5 positional weighting kernels (triangular, Epanechnikov, Gaussian, Laplacian, Cauchy), reverse-complement indexing, sliding-window scan, and 3D windowed tensors.
|
|
74
|
+
- **DistanceKernel** — turns any distance into a kernel via a post-transform.
|
|
75
|
+
|
|
76
|
+
### Encoders (CSR output)
|
|
77
|
+
- **SpectrumEncoder** — plain k-mer counts via rolling hash (k ≤ 12).
|
|
78
|
+
- **GappyEncoder** — gappy k-mer counts with explicit masks (`"*--*"`) or gap ranges (`L=6, g_min=2, g_max=3`).
|
|
79
|
+
- **MismatchEncoder** — mismatch-tolerant k-mer counts (Leslie, Eskin, Noble 2004).
|
|
80
|
+
|
|
81
|
+
All encoders support `canonical_rc=True` for reverse-complement collapsing.
|
|
82
|
+
|
|
83
|
+
### Distances
|
|
84
|
+
- **Hamming**, **Levenshtein** (rapidfuzz backend + Python fallback).
|
|
85
|
+
- **NeedlemanWunsch**, **SmithWaterman** (parasail backend + Python fallback), with custom substitution matrices (NUC4.4, BLOSUM62, …).
|
|
86
|
+
|
|
87
|
+
### Models
|
|
88
|
+
- **DifferentialKmerScorer** — Multinomial Naive Bayes on k-mer features, with auto-generated negatives via Shuffler/Chunker.
|
|
89
|
+
- **KernelSVM** — SVM with a precomputed kernel (works with GKMKernel and DistanceKernel).
|
|
90
|
+
- **LinearSVM** — Linear SVM on encoder features.
|
|
91
|
+
- **KNNClassifier** — k-Nearest Neighbors with a sequence distance.
|
|
92
|
+
|
|
93
|
+
### Sequence perturbation
|
|
94
|
+
- **KmerShuffler** — k-mer-preserving shuffle via random Eulerian paths in the De Bruijn graph. Three endpoint modes (preserve / free / crop). Philox4×32-10 RNG, reproducible across `n_jobs`.
|
|
95
|
+
- **Chunker** — block-level perturbation: split into chunks of size `[min, max]`, optionally reverse-complement each, shuffle, concatenate. Five residual-handling modes, two algorithms (random / backtrack).
|
|
96
|
+
- **BaseBackgroundModel** — ABC for custom background models.
|
|
97
|
+
|
|
98
|
+
### Utilities
|
|
99
|
+
- `kmer.utils` — bit-packed k-mer helpers (`kmer_to_code`, `code_to_kmer`, `reverse_complement`, `canonical_code`).
|
|
100
|
+
|
|
101
|
+
## Quick start
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from kmer.kernels import GKMKernel
|
|
105
|
+
from kmer.models import KernelSVM
|
|
106
|
+
|
|
107
|
+
# Train a gkm-SVM
|
|
108
|
+
clf = KernelSVM(GKMKernel(L=10, k=6, d=3, kernel_type="truncated", use_rc=True), C=1.0)
|
|
109
|
+
clf.fit(positives + negatives, [1]*len(positives) + [0]*len(negatives))
|
|
110
|
+
preds = clf.predict(test_seqs)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from kmer.encoders import SpectrumEncoder
|
|
115
|
+
from kmer.models import DifferentialKmerScorer
|
|
116
|
+
from kmer.perturb import KmerShuffler
|
|
117
|
+
|
|
118
|
+
# Differential k-mer scoring with dinucleotide-shuffled background
|
|
119
|
+
scorer = DifferentialKmerScorer(
|
|
120
|
+
featurizer=SpectrumEncoder(k=6, canonical_rc=True),
|
|
121
|
+
background=KmerShuffler(k=2, seed=42),
|
|
122
|
+
)
|
|
123
|
+
scorer.fit(positives)
|
|
124
|
+
top_motifs = scorer.kmer_scores_.sort_values(ascending=False).head(20)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from kmer.distance import Levenshtein, DistanceKernel
|
|
129
|
+
from kmer.models import KNNClassifier
|
|
130
|
+
|
|
131
|
+
# KNN with edit distance
|
|
132
|
+
clf = KNNClassifier(Levenshtein(), n_neighbors=5)
|
|
133
|
+
clf.fit(train_seqs, y_train)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Examples (Vignettes)
|
|
137
|
+
|
|
138
|
+
The `examples/` directory (top-level, next to `kmer/`) contains a series of cross-linked Jupyter notebooks. Each notebook starts with a vignette index linking to all others.
|
|
139
|
+
|
|
140
|
+
| # | Notebook | Topic |
|
|
141
|
+
|---|----------|-------|
|
|
142
|
+
| 01 | `01_basic_kernel_matrix.ipynb` | GKMKernel: build, inspect, verify invariants |
|
|
143
|
+
| 02 | `02_distance_metrics_and_kernels.ipynb` | Distance metrics (Hamming, Levenshtein, NW, SW) + DistanceKernel (RBF, PSD, KernelSVM) |
|
|
144
|
+
| 03 | `03_svc_with_kernel.ipynb` | Train a gkm-SVM with KernelSVM |
|
|
145
|
+
| 04 | `04_clustering_sequences.ipynb` | Hierarchical clustering with kernel distances |
|
|
146
|
+
| 05 | `05_score_long_sequence.ipynb` | Sliding-window scan of a long sequence |
|
|
147
|
+
| 06 | `06_weighted_kernel.ipynb` | WGKMKernel positional weighting (centered motif) |
|
|
148
|
+
| 07 | `07_transform_and_comparison.ipynb` | All 3 schemes × 6 transforms, GKM vs WGKM |
|
|
149
|
+
| 08 | `08_windowed_3d_tensors.ipynb` | WindowedGKMKernel 3D output (line plot) |
|
|
150
|
+
| 09 | `09_spectrum_encoder_and_differential.ipynb` | SpectrumEncoder + DifferentialKmerScorer |
|
|
151
|
+
| 10 | `10_gappy_encoder.ipynb` | GappyEncoder with masks, gap ranges, RC collapse |
|
|
152
|
+
| 11 | `11_mismatch_encoder.ipynb` | MismatchEncoder and comparison to spectrum |
|
|
153
|
+
| 12 | `12_shuffler_and_chunker.ipynb` | KmerShuffler + Chunker for negative-set generation |
|
|
154
|
+
|
|
155
|
+
## Citation
|
|
156
|
+
|
|
157
|
+
An article describing this package is in preparation. Until it is published, please cite the package as:
|
|
158
|
+
|
|
159
|
+
> *kmer-learn: Classical machine learning primitives for nucleotide sequences.* (in preparation).
|
|
160
|
+
|
|
161
|
+
In the meantime, if you use the package in your research, please cite the relevant foundational works listed below.
|
|
162
|
+
|
|
163
|
+
## References
|
|
164
|
+
|
|
165
|
+
The package builds on the following foundational works:
|
|
166
|
+
|
|
167
|
+
- **gkmSVM** — Ghandi M, Lee D, Mohammad-Noori M, Beer MA. *Enhanced regulatory sequence prediction using gapped k-mer features.* PLoS Comput Biol. 2014;10(7):e1003711.
|
|
168
|
+
- **LS-GKM** — Lee D. *LS-GKM: a new gkm-SVM for large-scale datasets.* Bioinformatics. 2016;32(14):2196–8.
|
|
169
|
+
- **Mismatch kernel** — Leslie CS, Eskin E, Cohen A, Weston J, Noble WS. *Mismatch string kernels for discriminative protein classification.* Bioinformatics. 2004;20 Suppl 1:i467–76.
|
|
170
|
+
- **Spectrum / gappy kernel** — Leslie CS, Eskin E, Weston J, Noble WS. *The spectrum kernel: a string kernel for SVM protein classification.* Pacific Symposium on Biocomputing. 2002:564–75.
|
|
171
|
+
- **Dinucleotide shuffle** — Clote P. *Efficient calculation of the number of native states of a protein.* (2003, unpublished note); Altschul SF, Erickson BW. *Significance of nucleotide sequence alignments: a method for random sequence permutation.* Bull Math Biol. 1985;47(4):541–51.
|
|
172
|
+
- **Philox4×32 RNG** — Salmon JK, Moraes MA, Dror RO, Shaw DE. *Parallel random numbers: as easy as 1, 2, 3.* SC '11.
|
|
173
|
+
- **Multinomial Naive Bayes** — Manning CD, Raghavan P, Schütze H. *Introduction to Information Retrieval.* Cambridge University Press, 2008.
|
|
174
|
+
- **Needleman-Wunsch** — Needleman SB, Wunsch CD. *A general method applicable to the search for similarities in the amino acid sequence of two proteins.* J Mol Biol. 1970;48(3):443–53.
|
|
175
|
+
- **Smith-Waterman** — Smith TF, Waterman MS. *Identification of common molecular subsequences.* J Mol Biol. 1981;147(1):195–7.
|
|
176
|
+
- **NUC4.4 matrix** — NCBI standard DNA scoring matrix.
|
|
177
|
+
|
|
178
|
+
Third-party libraries used as core dependencies or optional backends:
|
|
179
|
+
|
|
180
|
+
- **[rapidfuzz](https://github.com/maxbachmann/RapidFuzz)** — fast Levenshtein and Hamming distances.
|
|
181
|
+
- **[parasail](https://github.com/jeffdaily/parasail)** — SIMD-accelerated sequence alignment (Daily, 2016).
|
|
182
|
+
- **[scikit-learn](https://scikit-learn.org)** — SVM, Naive Bayes, KNN.
|
|
183
|
+
- **[NumPy](https://numpy.org) / [SciPy](https://scipy.org)** — array and sparse-matrix infrastructure.
|
|
184
|
+
|
|
185
|
+
## License
|
|
186
|
+
|
|
187
|
+
To be specified.
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# kmer-learn
|
|
2
|
+
|
|
3
|
+
Classical machine learning primitives for nucleotide sequences — kernels, encoders, distances, models, and sequence perturbation, all in one composable Python package.
|
|
4
|
+
|
|
5
|
+
> **⚠ API stability:** This package is in early development (v0.x). The public API is **not yet stable** — breaking changes may be introduced between minor versions until v1.0. Pin your dependency to an exact version (e.g. `kmer-learn==0.1.0`) if reproducibility matters.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install kmer-learn
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
From source (requires a C compiler):
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
git clone https://github.com/synbioml/kmer-learn.git
|
|
17
|
+
cd kmer-learn
|
|
18
|
+
pip install -e .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Optional backends (strongly recommended for speed):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install rapidfuzz parasail # 10-100x faster edit distances and alignments
|
|
25
|
+
pip install pyfastx # fast FASTA parsing for examples
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
### Kernels
|
|
31
|
+
- **GKMKernel** family — gapped k-mer kernels from gkmSVM / LS-GKM (Ghandi et al. 2014; Lee 2016). C-backed, OpenMP-parallel, with full/estimated/truncated mismatch schemes, 6 post-transforms (RBF, poly, sigmoid, exponential, …), 5 positional weighting kernels (triangular, Epanechnikov, Gaussian, Laplacian, Cauchy), reverse-complement indexing, sliding-window scan, and 3D windowed tensors.
|
|
32
|
+
- **DistanceKernel** — turns any distance into a kernel via a post-transform.
|
|
33
|
+
|
|
34
|
+
### Encoders (CSR output)
|
|
35
|
+
- **SpectrumEncoder** — plain k-mer counts via rolling hash (k ≤ 12).
|
|
36
|
+
- **GappyEncoder** — gappy k-mer counts with explicit masks (`"*--*"`) or gap ranges (`L=6, g_min=2, g_max=3`).
|
|
37
|
+
- **MismatchEncoder** — mismatch-tolerant k-mer counts (Leslie, Eskin, Noble 2004).
|
|
38
|
+
|
|
39
|
+
All encoders support `canonical_rc=True` for reverse-complement collapsing.
|
|
40
|
+
|
|
41
|
+
### Distances
|
|
42
|
+
- **Hamming**, **Levenshtein** (rapidfuzz backend + Python fallback).
|
|
43
|
+
- **NeedlemanWunsch**, **SmithWaterman** (parasail backend + Python fallback), with custom substitution matrices (NUC4.4, BLOSUM62, …).
|
|
44
|
+
|
|
45
|
+
### Models
|
|
46
|
+
- **DifferentialKmerScorer** — Multinomial Naive Bayes on k-mer features, with auto-generated negatives via Shuffler/Chunker.
|
|
47
|
+
- **KernelSVM** — SVM with a precomputed kernel (works with GKMKernel and DistanceKernel).
|
|
48
|
+
- **LinearSVM** — Linear SVM on encoder features.
|
|
49
|
+
- **KNNClassifier** — k-Nearest Neighbors with a sequence distance.
|
|
50
|
+
|
|
51
|
+
### Sequence perturbation
|
|
52
|
+
- **KmerShuffler** — k-mer-preserving shuffle via random Eulerian paths in the De Bruijn graph. Three endpoint modes (preserve / free / crop). Philox4×32-10 RNG, reproducible across `n_jobs`.
|
|
53
|
+
- **Chunker** — block-level perturbation: split into chunks of size `[min, max]`, optionally reverse-complement each, shuffle, concatenate. Five residual-handling modes, two algorithms (random / backtrack).
|
|
54
|
+
- **BaseBackgroundModel** — ABC for custom background models.
|
|
55
|
+
|
|
56
|
+
### Utilities
|
|
57
|
+
- `kmer.utils` — bit-packed k-mer helpers (`kmer_to_code`, `code_to_kmer`, `reverse_complement`, `canonical_code`).
|
|
58
|
+
|
|
59
|
+
## Quick start
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from kmer.kernels import GKMKernel
|
|
63
|
+
from kmer.models import KernelSVM
|
|
64
|
+
|
|
65
|
+
# Train a gkm-SVM
|
|
66
|
+
clf = KernelSVM(GKMKernel(L=10, k=6, d=3, kernel_type="truncated", use_rc=True), C=1.0)
|
|
67
|
+
clf.fit(positives + negatives, [1]*len(positives) + [0]*len(negatives))
|
|
68
|
+
preds = clf.predict(test_seqs)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from kmer.encoders import SpectrumEncoder
|
|
73
|
+
from kmer.models import DifferentialKmerScorer
|
|
74
|
+
from kmer.perturb import KmerShuffler
|
|
75
|
+
|
|
76
|
+
# Differential k-mer scoring with dinucleotide-shuffled background
|
|
77
|
+
scorer = DifferentialKmerScorer(
|
|
78
|
+
featurizer=SpectrumEncoder(k=6, canonical_rc=True),
|
|
79
|
+
background=KmerShuffler(k=2, seed=42),
|
|
80
|
+
)
|
|
81
|
+
scorer.fit(positives)
|
|
82
|
+
top_motifs = scorer.kmer_scores_.sort_values(ascending=False).head(20)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from kmer.distance import Levenshtein, DistanceKernel
|
|
87
|
+
from kmer.models import KNNClassifier
|
|
88
|
+
|
|
89
|
+
# KNN with edit distance
|
|
90
|
+
clf = KNNClassifier(Levenshtein(), n_neighbors=5)
|
|
91
|
+
clf.fit(train_seqs, y_train)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Examples (Vignettes)
|
|
95
|
+
|
|
96
|
+
The `examples/` directory (top-level, next to `kmer/`) contains a series of cross-linked Jupyter notebooks. Each notebook starts with a vignette index linking to all others.
|
|
97
|
+
|
|
98
|
+
| # | Notebook | Topic |
|
|
99
|
+
|---|----------|-------|
|
|
100
|
+
| 01 | `01_basic_kernel_matrix.ipynb` | GKMKernel: build, inspect, verify invariants |
|
|
101
|
+
| 02 | `02_distance_metrics_and_kernels.ipynb` | Distance metrics (Hamming, Levenshtein, NW, SW) + DistanceKernel (RBF, PSD, KernelSVM) |
|
|
102
|
+
| 03 | `03_svc_with_kernel.ipynb` | Train a gkm-SVM with KernelSVM |
|
|
103
|
+
| 04 | `04_clustering_sequences.ipynb` | Hierarchical clustering with kernel distances |
|
|
104
|
+
| 05 | `05_score_long_sequence.ipynb` | Sliding-window scan of a long sequence |
|
|
105
|
+
| 06 | `06_weighted_kernel.ipynb` | WGKMKernel positional weighting (centered motif) |
|
|
106
|
+
| 07 | `07_transform_and_comparison.ipynb` | All 3 schemes × 6 transforms, GKM vs WGKM |
|
|
107
|
+
| 08 | `08_windowed_3d_tensors.ipynb` | WindowedGKMKernel 3D output (line plot) |
|
|
108
|
+
| 09 | `09_spectrum_encoder_and_differential.ipynb` | SpectrumEncoder + DifferentialKmerScorer |
|
|
109
|
+
| 10 | `10_gappy_encoder.ipynb` | GappyEncoder with masks, gap ranges, RC collapse |
|
|
110
|
+
| 11 | `11_mismatch_encoder.ipynb` | MismatchEncoder and comparison to spectrum |
|
|
111
|
+
| 12 | `12_shuffler_and_chunker.ipynb` | KmerShuffler + Chunker for negative-set generation |
|
|
112
|
+
|
|
113
|
+
## Citation
|
|
114
|
+
|
|
115
|
+
An article describing this package is in preparation. Until it is published, please cite the package as:
|
|
116
|
+
|
|
117
|
+
> *kmer-learn: Classical machine learning primitives for nucleotide sequences.* (in preparation).
|
|
118
|
+
|
|
119
|
+
In the meantime, if you use the package in your research, please cite the relevant foundational works listed below.
|
|
120
|
+
|
|
121
|
+
## References
|
|
122
|
+
|
|
123
|
+
The package builds on the following foundational works:
|
|
124
|
+
|
|
125
|
+
- **gkmSVM** — Ghandi M, Lee D, Mohammad-Noori M, Beer MA. *Enhanced regulatory sequence prediction using gapped k-mer features.* PLoS Comput Biol. 2014;10(7):e1003711.
|
|
126
|
+
- **LS-GKM** — Lee D. *LS-GKM: a new gkm-SVM for large-scale datasets.* Bioinformatics. 2016;32(14):2196–8.
|
|
127
|
+
- **Mismatch kernel** — Leslie CS, Eskin E, Cohen A, Weston J, Noble WS. *Mismatch string kernels for discriminative protein classification.* Bioinformatics. 2004;20 Suppl 1:i467–76.
|
|
128
|
+
- **Spectrum / gappy kernel** — Leslie CS, Eskin E, Weston J, Noble WS. *The spectrum kernel: a string kernel for SVM protein classification.* Pacific Symposium on Biocomputing. 2002:564–75.
|
|
129
|
+
- **Dinucleotide shuffle** — Clote P. *Efficient calculation of the number of native states of a protein.* (2003, unpublished note); Altschul SF, Erickson BW. *Significance of nucleotide sequence alignments: a method for random sequence permutation.* Bull Math Biol. 1985;47(4):541–51.
|
|
130
|
+
- **Philox4×32 RNG** — Salmon JK, Moraes MA, Dror RO, Shaw DE. *Parallel random numbers: as easy as 1, 2, 3.* SC '11.
|
|
131
|
+
- **Multinomial Naive Bayes** — Manning CD, Raghavan P, Schütze H. *Introduction to Information Retrieval.* Cambridge University Press, 2008.
|
|
132
|
+
- **Needleman-Wunsch** — Needleman SB, Wunsch CD. *A general method applicable to the search for similarities in the amino acid sequence of two proteins.* J Mol Biol. 1970;48(3):443–53.
|
|
133
|
+
- **Smith-Waterman** — Smith TF, Waterman MS. *Identification of common molecular subsequences.* J Mol Biol. 1981;147(1):195–7.
|
|
134
|
+
- **NUC4.4 matrix** — NCBI standard DNA scoring matrix.
|
|
135
|
+
|
|
136
|
+
Third-party libraries used as core dependencies or optional backends:
|
|
137
|
+
|
|
138
|
+
- **[rapidfuzz](https://github.com/maxbachmann/RapidFuzz)** — fast Levenshtein and Hamming distances.
|
|
139
|
+
- **[parasail](https://github.com/jeffdaily/parasail)** — SIMD-accelerated sequence alignment (Daily, 2016).
|
|
140
|
+
- **[scikit-learn](https://scikit-learn.org)** — SVM, Naive Bayes, KNN.
|
|
141
|
+
- **[NumPy](https://numpy.org) / [SciPy](https://scipy.org)** — array and sparse-matrix infrastructure.
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
To be specified.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
{
|
|
2
|
+
"nbformat": 4,
|
|
3
|
+
"nbformat_minor": 5,
|
|
4
|
+
"metadata": {
|
|
5
|
+
"kernelspec": {
|
|
6
|
+
"display_name": "Python 3",
|
|
7
|
+
"language": "python",
|
|
8
|
+
"name": "python3"
|
|
9
|
+
},
|
|
10
|
+
"language_info": {
|
|
11
|
+
"name": "python",
|
|
12
|
+
"version": "3.13.5",
|
|
13
|
+
"mimetype": "text/x-python",
|
|
14
|
+
"codemirror_mode": {
|
|
15
|
+
"name": "ipython",
|
|
16
|
+
"version": 3
|
|
17
|
+
},
|
|
18
|
+
"pygments_lexer": "ipython3",
|
|
19
|
+
"nbconvert_exporter": "python",
|
|
20
|
+
"file_extension": ".py"
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"cells": [
|
|
24
|
+
{
|
|
25
|
+
"id": "40a8967d",
|
|
26
|
+
"cell_type": "markdown",
|
|
27
|
+
"source": "# 01 \u2014 GKMKernel: Basic Kernel Matrix\n\n**Vignette index:** | `**01**` GKMKernel basics | [`02` Distance metrics & kernels](02_distance_metrics_and_kernels.ipynb) | [`03` SVM with kernel](03_svc_with_kernel.ipynb) | [`04` Clustering](04_clustering_sequences.ipynb) | [`05` Long sequence scoring](05_score_long_sequence.ipynb) | [`06` Weighted (WGKM) kernel](06_weighted_kernel.ipynb) | [`07` Transforms & comparison](07_transform_and_comparison.ipynb) | [`08` Windowed 3D tensors](08_windowed_3d_tensors.ipynb) | [`09` Spectrum encoder & NB](09_spectrum_encoder_and_differential.ipynb) | [`10` Gappy encoder](10_gappy_encoder.ipynb) | [`11` Mismatch encoder](11_mismatch_encoder.ipynb) | [`12` Shuffler & chunker](12_shuffler_and_chunker.ipynb)\n\nThis vignette shows how to build a gkm kernel matrix and verify its basic invariants (symmetry, unit diagonal, PSD).",
|
|
28
|
+
"metadata": {}
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"id": "0401a422",
|
|
32
|
+
"cell_type": "code",
|
|
33
|
+
"metadata": {
|
|
34
|
+
"execution": {
|
|
35
|
+
"iopub.status.busy": "2026-06-21T16:14:02.435770Z",
|
|
36
|
+
"iopub.execute_input": "2026-06-21T16:14:02.435946Z",
|
|
37
|
+
"shell.execute_reply": "2026-06-21T16:14:02.509236Z",
|
|
38
|
+
"iopub.status.idle": "2026-06-21T16:14:02.510335Z"
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
"execution_count": 1,
|
|
42
|
+
"source": "import numpy as np\nfrom kmer.kernels import GKMKernel\n\nseqs = [\n \"ACGTACGTACGTACGTACGT\",\n \"TTTTAAAAGGGGCCCCAAAA\",\n \"ACGTTGCATGCATGCATGCA\",\n \"CCCCGGGGTTTTAAAACCCC\",\n \"ATATGCGCATATGCGCATAT\",\n \"GAATTCGAATTCGAATTCGA\",\n]\n\nkern = GKMKernel(L=10, k=6, d=3, kernel_type=\"truncated\", use_rc=True)\nkern.set_references(seqs)\nK = np.asarray(kern.kernel())\nprint(\"Shape:\", K.shape)\nprint(\"Symmetric:\", np.allclose(K, K.T))\nprint(\"Unit diagonal:\", np.allclose(np.diag(K), 1.0))\nprint(\"Min eigenvalue:\", np.linalg.eigvalsh(K).min(), \"(PSD if >= 0)\")",
|
|
43
|
+
"outputs": [
|
|
44
|
+
{
|
|
45
|
+
"output_type": "stream",
|
|
46
|
+
"name": "stdout",
|
|
47
|
+
"text": "Shape: (6, 6)\nSymmetric: True\nUnit diagonal: True\nMin eigenvalue: 0.98676708653555 (PSD if >= 0)\n"
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"id": "81e9494f",
|
|
53
|
+
"cell_type": "code",
|
|
54
|
+
"metadata": {
|
|
55
|
+
"execution": {
|
|
56
|
+
"iopub.status.busy": "2026-06-21T16:14:02.511866Z",
|
|
57
|
+
"iopub.execute_input": "2026-06-21T16:14:02.512072Z",
|
|
58
|
+
"iopub.status.idle": "2026-06-21T16:14:02.529055Z",
|
|
59
|
+
"shell.execute_reply": "2026-06-21T16:14:02.528680Z"
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"execution_count": 2,
|
|
63
|
+
"source": "# Cross-kernel: query sequences vs reference set\nqueries = [\"ACGTACGTACGTACGTACGTAC\", \"CCCCGGGGTTTTAAAACCCCAG\"]\nkern.set_references(seqs)\nKq = np.asarray(kern.kernel(X_query=queries))\nprint(\"Cross-kernel shape:\", Kq.shape)",
|
|
64
|
+
"outputs": [
|
|
65
|
+
{
|
|
66
|
+
"output_type": "stream",
|
|
67
|
+
"name": "stdout",
|
|
68
|
+
"text": "Cross-kernel shape: (2, 6)\n"
|
|
69
|
+
}
|
|
70
|
+
]
|
|
71
|
+
}
|
|
72
|
+
]
|
|
73
|
+
}
|