kmer-learn 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. kmer_learn-0.1.0/LICENSE +1 -0
  2. kmer_learn-0.1.0/MANIFEST.in +7 -0
  3. kmer_learn-0.1.0/PKG-INFO +187 -0
  4. kmer_learn-0.1.0/README.md +145 -0
  5. kmer_learn-0.1.0/examples/01_basic_kernel_matrix.ipynb +73 -0
  6. kmer_learn-0.1.0/examples/02_distance_metrics_and_kernels.ipynb +355 -0
  7. kmer_learn-0.1.0/examples/03_svc_with_kernel.ipynb +111 -0
  8. kmer_learn-0.1.0/examples/04_clustering_sequences.ipynb +55 -0
  9. kmer_learn-0.1.0/examples/05_score_long_sequence.ipynb +114 -0
  10. kmer_learn-0.1.0/examples/06_weighted_kernel.ipynb +84 -0
  11. kmer_learn-0.1.0/examples/07_transform_and_comparison.ipynb +148 -0
  12. kmer_learn-0.1.0/examples/08_windowed_3d_tensors.ipynb +87 -0
  13. kmer_learn-0.1.0/examples/09_spectrum_encoder_and_differential.ipynb +148 -0
  14. kmer_learn-0.1.0/examples/10_gappy_encoder.ipynb +127 -0
  15. kmer_learn-0.1.0/examples/11_mismatch_encoder.ipynb +127 -0
  16. kmer_learn-0.1.0/examples/12_shuffler_and_chunker.ipynb +181 -0
  17. kmer_learn-0.1.0/kmer/__init__.py +19 -0
  18. kmer_learn-0.1.0/kmer/_common.h +227 -0
  19. kmer_learn-0.1.0/kmer/distance/__init__.py +25 -0
  20. kmer_learn-0.1.0/kmer/distance/_base.py +175 -0
  21. kmer_learn-0.1.0/kmer/distance/alignment.py +275 -0
  22. kmer_learn-0.1.0/kmer/distance/edit.py +98 -0
  23. kmer_learn-0.1.0/kmer/encoders/__init__.py +18 -0
  24. kmer_learn-0.1.0/kmer/encoders/_native/__init__.py +0 -0
  25. kmer_learn-0.1.0/kmer/encoders/_native/_common.h +10 -0
  26. kmer_learn-0.1.0/kmer/encoders/_native/_gappy.c +377 -0
  27. kmer_learn-0.1.0/kmer/encoders/_native/_gappy_pylib.c +280 -0
  28. kmer_learn-0.1.0/kmer/encoders/_native/_mismatch.c +202 -0
  29. kmer_learn-0.1.0/kmer/encoders/_native/_mismatch_pylib.c +137 -0
  30. kmer_learn-0.1.0/kmer/encoders/_native/_spectrum.c +169 -0
  31. kmer_learn-0.1.0/kmer/encoders/_native/_spectrum_pylib.c +146 -0
  32. kmer_learn-0.1.0/kmer/encoders/gappy.py +199 -0
  33. kmer_learn-0.1.0/kmer/encoders/mismatch.py +99 -0
  34. kmer_learn-0.1.0/kmer/encoders/spectrum.py +142 -0
  35. kmer_learn-0.1.0/kmer/kernels/__init__.py +28 -0
  36. kmer_learn-0.1.0/kmer/kernels/_native/__init__.py +9 -0
  37. kmer_learn-0.1.0/kmer/kernels/_native/_gkmkern_pylib.c +1307 -0
  38. kmer_learn-0.1.0/kmer/kernels/_native/gkmkern.c +2603 -0
  39. kmer_learn-0.1.0/kmer/kernels/_native/gkmkern.h +311 -0
  40. kmer_learn-0.1.0/kmer/kernels/gkmkernel.py +941 -0
  41. kmer_learn-0.1.0/kmer/models/__init__.py +21 -0
  42. kmer_learn-0.1.0/kmer/models/differential.py +193 -0
  43. kmer_learn-0.1.0/kmer/models/knn.py +64 -0
  44. kmer_learn-0.1.0/kmer/models/svm.py +146 -0
  45. kmer_learn-0.1.0/kmer/perturb/__init__.py +21 -0
  46. kmer_learn-0.1.0/kmer/perturb/_base.py +63 -0
  47. kmer_learn-0.1.0/kmer/perturb/_native/__init__.py +5 -0
  48. kmer_learn-0.1.0/kmer/perturb/_native/_chunker.c +321 -0
  49. kmer_learn-0.1.0/kmer/perturb/_native/_chunker_pylib.c +362 -0
  50. kmer_learn-0.1.0/kmer/perturb/_native/_common.h +30 -0
  51. kmer_learn-0.1.0/kmer/perturb/_native/_shuffler.c +674 -0
  52. kmer_learn-0.1.0/kmer/perturb/_native/_shuffler_pylib.c +391 -0
  53. kmer_learn-0.1.0/kmer/perturb/chunker.py +98 -0
  54. kmer_learn-0.1.0/kmer/perturb/shuffler.py +92 -0
  55. kmer_learn-0.1.0/kmer/tests/__init__.py +0 -0
  56. kmer_learn-0.1.0/kmer/tests/brute_force_reference.py +155 -0
  57. kmer_learn-0.1.0/kmer/tests/gkm/__init__.py +0 -0
  58. kmer_learn-0.1.0/kmer/tests/gkm/test_gkmkern.py +622 -0
  59. kmer_learn-0.1.0/kmer/tests/test_background_model.py +87 -0
  60. kmer_learn-0.1.0/kmer/tests/test_chunker.py +296 -0
  61. kmer_learn-0.1.0/kmer/tests/test_differential.py +197 -0
  62. kmer_learn-0.1.0/kmer/tests/test_distances.py +292 -0
  63. kmer_learn-0.1.0/kmer/tests/test_encoders.py +342 -0
  64. kmer_learn-0.1.0/kmer/tests/test_gkm_reference.py +205 -0
  65. kmer_learn-0.1.0/kmer/tests/test_mismatch.py +118 -0
  66. kmer_learn-0.1.0/kmer/tests/test_models.py +137 -0
  67. kmer_learn-0.1.0/kmer/tests/test_shuffler.py +281 -0
  68. kmer_learn-0.1.0/kmer/utils/__init__.py +23 -0
  69. kmer_learn-0.1.0/kmer/utils/seq.py +86 -0
  70. kmer_learn-0.1.0/kmer_learn.egg-info/PKG-INFO +187 -0
  71. kmer_learn-0.1.0/kmer_learn.egg-info/SOURCES.txt +75 -0
  72. kmer_learn-0.1.0/kmer_learn.egg-info/dependency_links.txt +1 -0
  73. kmer_learn-0.1.0/kmer_learn.egg-info/requires.txt +22 -0
  74. kmer_learn-0.1.0/kmer_learn.egg-info/top_level.txt +1 -0
  75. kmer_learn-0.1.0/pyproject.toml +74 -0
  76. kmer_learn-0.1.0/setup.cfg +4 -0
  77. kmer_learn-0.1.0/setup.py +89 -0
@@ -0,0 +1 @@
1
+ TODO: Add license here.
@@ -0,0 +1,7 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include setup.py
5
+ recursive-include kmer *.h
6
+ recursive-include kmer *.c
7
+ recursive-include examples *.ipynb
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.4
2
+ Name: kmer-learn
3
+ Version: 0.1.0
4
+ Summary: A modern Python toolkit for classical sequence machine learning
5
+ Author: Arsenii Zinkevich
6
+ Project-URL: Homepage, https://github.com/synbioml/kmer-learn
7
+ Project-URL: Repository, https://github.com/synbioml/kmer-learn
8
+ Project-URL: Issues, https://github.com/synbioml/kmer-learn/issues
9
+ Keywords: bioinformatics,machine-learning,sequence-analysis,kmer,svm,kernel,genomics
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: C
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy>=1.20
24
+ Requires-Dist: scipy>=1.7
25
+ Requires-Dist: scikit-learn>=1.0
26
+ Provides-Extra: fast
27
+ Requires-Dist: rapidfuzz>=3.0; extra == "fast"
28
+ Requires-Dist: parasail>=1.3; extra == "fast"
29
+ Provides-Extra: io
30
+ Requires-Dist: pyfastx>=0.8; extra == "io"
31
+ Provides-Extra: test
32
+ Requires-Dist: pytest>=7.0; extra == "test"
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7.0; extra == "dev"
35
+ Requires-Dist: rapidfuzz>=3.0; extra == "dev"
36
+ Requires-Dist: parasail>=1.3; extra == "dev"
37
+ Requires-Dist: jupyter; extra == "dev"
38
+ Requires-Dist: matplotlib; extra == "dev"
39
+ Requires-Dist: nbformat; extra == "dev"
40
+ Requires-Dist: nbclient; extra == "dev"
41
+ Dynamic: license-file
42
+
43
+ # kmer-learn
44
+
45
+ Classical machine learning primitives for nucleotide sequences — kernels, encoders, distances, models, and sequence perturbation, all in one composable Python package.
46
+
47
+ > **⚠ API stability:** This package is in early development (v0.x). The public API is **not yet stable** — breaking changes may be introduced between minor versions until v1.0. Pin your dependency to an exact version (e.g. `kmer-learn==0.1.0`) if reproducibility matters.
48
+
49
+ ## Install
50
+
51
+ ```bash
52
+ pip install kmer-learn
53
+ ```
54
+
55
+ From source (requires a C compiler):
56
+
57
+ ```bash
58
+ git clone https://github.com/synbioml/kmer-learn.git
59
+ cd kmer-learn
60
+ pip install -e .
61
+ ```
62
+
63
+ Optional backends (strongly recommended for speed):
64
+
65
+ ```bash
66
+ pip install rapidfuzz parasail # 10-100x faster edit distances and alignments
67
+ pip install pyfastx # fast FASTA parsing for examples
68
+ ```
69
+
70
+ ## Features
71
+
72
+ ### Kernels
73
+ - **GKMKernel** family — gapped k-mer kernels from gkmSVM / LS-GKM (Ghandi et al. 2014; Lee 2016). C-backed, OpenMP-parallel, with full/estimated/truncated mismatch schemes, 6 post-transforms (RBF, poly, sigmoid, exponential, …), 5 positional weighting kernels (triangular, Epanechnikov, Gaussian, Laplacian, Cauchy), reverse-complement indexing, sliding-window scan, and 3D windowed tensors.
74
+ - **DistanceKernel** — turns any distance into a kernel via a post-transform.
75
+
76
+ ### Encoders (CSR output)
77
+ - **SpectrumEncoder** — plain k-mer counts via rolling hash (k ≤ 12).
78
+ - **GappyEncoder** — gappy k-mer counts with explicit masks (`"*--*"`) or gap ranges (`L=6, g_min=2, g_max=3`).
79
+ - **MismatchEncoder** — mismatch-tolerant k-mer counts (Leslie, Eskin, Noble 2004).
80
+
81
+ All encoders support `canonical_rc=True` for reverse-complement collapsing.
82
+
83
+ ### Distances
84
+ - **Hamming**, **Levenshtein** (rapidfuzz backend + Python fallback).
85
+ - **NeedlemanWunsch**, **SmithWaterman** (parasail backend + Python fallback), with custom substitution matrices (NUC4.4, BLOSUM62, …).
86
+
87
+ ### Models
88
+ - **DifferentialKmerScorer** — Multinomial Naive Bayes on k-mer features, with auto-generated negatives via Shuffler/Chunker.
89
+ - **KernelSVM** — SVM with a precomputed kernel (works with GKMKernel and DistanceKernel).
90
+ - **LinearSVM** — Linear SVM on encoder features.
91
+ - **KNNClassifier** — k-Nearest Neighbors with a sequence distance.
92
+
93
+ ### Sequence perturbation
94
+ - **KmerShuffler** — k-mer-preserving shuffle via random Eulerian paths in the De Bruijn graph. Three endpoint modes (preserve / free / crop). Philox4×32-10 RNG, reproducible across `n_jobs`.
95
+ - **Chunker** — block-level perturbation: split into chunks of size `[min, max]`, optionally reverse-complement each, shuffle, concatenate. Five residual-handling modes, two algorithms (random / backtrack).
96
+ - **BaseBackgroundModel** — ABC for custom background models.
97
+
98
+ ### Utilities
99
+ - `kmer.utils` — bit-packed k-mer helpers (`kmer_to_code`, `code_to_kmer`, `reverse_complement`, `canonical_code`).
100
+
101
+ ## Quick start
102
+
103
+ ```python
104
+ from kmer.kernels import GKMKernel
105
+ from kmer.models import KernelSVM
106
+
107
+ # Train a gkm-SVM
108
+ clf = KernelSVM(GKMKernel(L=10, k=6, d=3, kernel_type="truncated", use_rc=True), C=1.0)
109
+ clf.fit(positives + negatives, [1]*len(positives) + [0]*len(negatives))
110
+ preds = clf.predict(test_seqs)
111
+ ```
112
+
113
+ ```python
114
+ from kmer.encoders import SpectrumEncoder
115
+ from kmer.models import DifferentialKmerScorer
116
+ from kmer.perturb import KmerShuffler
117
+
118
+ # Differential k-mer scoring with dinucleotide-shuffled background
119
+ scorer = DifferentialKmerScorer(
120
+ featurizer=SpectrumEncoder(k=6, canonical_rc=True),
121
+ background=KmerShuffler(k=2, seed=42),
122
+ )
123
+ scorer.fit(positives)
124
+ top_motifs = scorer.kmer_scores_.sort_values(ascending=False).head(20)
125
+ ```
126
+
127
+ ```python
128
+ from kmer.distance import Levenshtein, DistanceKernel
129
+ from kmer.models import KNNClassifier
130
+
131
+ # KNN with edit distance
132
+ clf = KNNClassifier(Levenshtein(), n_neighbors=5)
133
+ clf.fit(train_seqs, y_train)
134
+ ```
135
+
136
+ ## Examples (Vignettes)
137
+
138
+ The `examples/` directory (top-level, next to `kmer/`) contains a series of cross-linked Jupyter notebooks. Each notebook starts with a vignette index linking to all others.
139
+
140
+ | # | Notebook | Topic |
141
+ |---|----------|-------|
142
+ | 01 | `01_basic_kernel_matrix.ipynb` | GKMKernel: build, inspect, verify invariants |
143
+ | 02 | `02_distance_metrics_and_kernels.ipynb` | Distance metrics (Hamming, Levenshtein, NW, SW) + DistanceKernel (RBF, PSD, KernelSVM) |
144
+ | 03 | `03_svc_with_kernel.ipynb` | Train a gkm-SVM with KernelSVM |
145
+ | 04 | `04_clustering_sequences.ipynb` | Hierarchical clustering with kernel distances |
146
+ | 05 | `05_score_long_sequence.ipynb` | Sliding-window scan of a long sequence |
147
+ | 06 | `06_weighted_kernel.ipynb` | WGKMKernel positional weighting (centered motif) |
148
+ | 07 | `07_transform_and_comparison.ipynb` | All 3 schemes × 6 transforms, GKM vs WGKM |
149
+ | 08 | `08_windowed_3d_tensors.ipynb` | WindowedGKMKernel 3D output (line plot) |
150
+ | 09 | `09_spectrum_encoder_and_differential.ipynb` | SpectrumEncoder + DifferentialKmerScorer |
151
+ | 10 | `10_gappy_encoder.ipynb` | GappyEncoder with masks, gap ranges, RC collapse |
152
+ | 11 | `11_mismatch_encoder.ipynb` | MismatchEncoder and comparison to spectrum |
153
+ | 12 | `12_shuffler_and_chunker.ipynb` | KmerShuffler + Chunker for negative-set generation |
154
+
155
+ ## Citation
156
+
157
+ An article describing this package is in preparation. Until it is published, please cite the package as:
158
+
159
+ > *kmer-learn: Classical machine learning primitives for nucleotide sequences.* (in preparation).
160
+
161
+ In the meantime, if you use the package in your research, please also cite the relevant foundational works listed below.
162
+
163
+ ## References
164
+
165
+ The package builds on the following foundational works:
166
+
167
+ - **gkmSVM** — Ghandi M, Lee D, Mohammad-Noori M, Beer MA. *Enhanced regulatory sequence prediction using gapped k-mer features.* PLoS Comput Biol. 2014;10(7):e1003711.
168
+ - **LS-GKM** — Lee D. *LS-GKM: a new gkm-SVM for large-scale datasets.* Bioinformatics. 2016;32(14):2196–8.
169
+ - **Mismatch kernel** — Leslie CS, Eskin E, Cohen A, Weston J, Noble WS. *Mismatch string kernels for discriminative protein classification.* Bioinformatics. 2004;20 Suppl 1:i467–76.
170
+ - **Spectrum / gappy kernel** — Leslie CS, Eskin E, Weston J, Noble WS. *The spectrum kernel: a string kernel for SVM protein classification.* Pacific Symposium on Biocomputing. 2002:564–75.
171
+ - **Dinucleotide shuffle** — Clote P. *Efficient calculation of the number of native states of a protein.* (2003, unpublished note); Altschul SF, Erickson BW. *Significance of nucleotide sequence alignments: a method for random sequence permutation.* Bull Math Biol. 1985;47(4):541–51.
172
+ - **Philox4×32 RNG** — Salmon JK, Moraes MA, Dror RO, Shaw DE. *Parallel random numbers: as easy as 1, 2, 3.* SC '11.
173
+ - **Multinomial Naive Bayes** — Manning CD, Raghavan P, Schütze H. *Introduction to Information Retrieval.* Cambridge University Press, 2008.
174
+ - **Needleman-Wunsch** — Needleman SB, Wunsch CD. *A general method applicable to the search for similarities in the amino acid sequence of two proteins.* J Mol Biol. 1970;48(3):443–53.
175
+ - **Smith-Waterman** — Smith TF, Waterman MS. *Identification of common molecular subsequences.* J Mol Biol. 1981;147(1):195–7.
176
+ - **NUC4.4 matrix** — NCBI standard DNA scoring matrix.
177
+
178
+ Third-party libraries used as core dependencies (scikit-learn, NumPy, SciPy) or optional backends (rapidfuzz, parasail):
179
+
180
+ - **[rapidfuzz](https://github.com/maxbachmann/RapidFuzz)** — fast Levenshtein and Hamming distances.
181
+ - **[parasail](https://github.com/jeffdaily/parasail)** — SIMD-accelerated sequence alignment (Daily, 2016).
182
+ - **[scikit-learn](https://scikit-learn.org)** — SVM, Naive Bayes, KNN.
183
+ - **[NumPy](https://numpy.org) / [SciPy](https://scipy.org)** — array and sparse-matrix infrastructure.
184
+
185
+ ## License
186
+
187
+ To be specified.
@@ -0,0 +1,145 @@
1
+ # kmer-learn
2
+
3
+ Classical machine learning primitives for nucleotide sequences — kernels, encoders, distances, models, and sequence perturbation, all in one composable Python package.
4
+
5
+ > **⚠ API stability:** This package is in early development (v0.x). The public API is **not yet stable** — breaking changes may be introduced between minor versions until v1.0. Pin your dependency to an exact version (e.g. `kmer-learn==0.1.0`) if reproducibility matters.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install kmer-learn
11
+ ```
12
+
13
+ From source (requires a C compiler):
14
+
15
+ ```bash
16
+ git clone https://github.com/synbioml/kmer-learn.git
17
+ cd kmer-learn
18
+ pip install -e .
19
+ ```
20
+
21
+ Optional backends (strongly recommended for speed):
22
+
23
+ ```bash
24
+ pip install rapidfuzz parasail # 10-100x faster edit distances and alignments
25
+ pip install pyfastx # fast FASTA parsing for examples
26
+ ```
27
+
28
+ ## Features
29
+
30
+ ### Kernels
31
+ - **GKMKernel** family — gapped k-mer kernels from gkmSVM / LS-GKM (Ghandi et al. 2014; Lee 2016). C-backed, OpenMP-parallel, with full/estimated/truncated mismatch schemes, 6 post-transforms (RBF, poly, sigmoid, exponential, …), 5 positional weighting kernels (triangular, Epanechnikov, Gaussian, Laplacian, Cauchy), reverse-complement indexing, sliding-window scan, and 3D windowed tensors.
32
+ - **DistanceKernel** — turns any distance into a kernel via a post-transform.
33
+
34
+ ### Encoders (CSR output)
35
+ - **SpectrumEncoder** — plain k-mer counts via rolling hash (k ≤ 12).
36
+ - **GappyEncoder** — gappy k-mer counts with explicit masks (`"*--*"`) or gap ranges (`L=6, g_min=2, g_max=3`).
37
+ - **MismatchEncoder** — mismatch-tolerant k-mer counts (Leslie, Eskin, Noble 2004).
38
+
39
+ All encoders support `canonical_rc=True` for reverse-complement collapsing.
40
+
41
+ ### Distances
42
+ - **Hamming**, **Levenshtein** (rapidfuzz backend + Python fallback).
43
+ - **NeedlemanWunsch**, **SmithWaterman** (parasail backend + Python fallback), with custom substitution matrices (NUC4.4, BLOSUM62, …).
44
+
45
+ ### Models
46
+ - **DifferentialKmerScorer** — Multinomial Naive Bayes on k-mer features, with auto-generated negatives via Shuffler/Chunker.
47
+ - **KernelSVM** — SVM with a precomputed kernel (works with GKMKernel and DistanceKernel).
48
+ - **LinearSVM** — Linear SVM on encoder features.
49
+ - **KNNClassifier** — k-Nearest Neighbors with a sequence distance.
50
+
51
+ ### Sequence perturbation
52
+ - **KmerShuffler** — k-mer-preserving shuffle via random Eulerian paths in the De Bruijn graph. Three endpoint modes (preserve / free / crop). Philox4×32-10 RNG, reproducible across `n_jobs`.
53
+ - **Chunker** — block-level perturbation: split into chunks of size `[min, max]`, optionally reverse-complement each, shuffle, concatenate. Five residual-handling modes, two algorithms (random / backtrack).
54
+ - **BaseBackgroundModel** — ABC for custom background models.
55
+
56
+ ### Utilities
57
+ - `kmer.utils` — bit-packed k-mer helpers (`kmer_to_code`, `code_to_kmer`, `reverse_complement`, `canonical_code`).
58
+
59
+ ## Quick start
60
+
61
+ ```python
62
+ from kmer.kernels import GKMKernel
63
+ from kmer.models import KernelSVM
64
+
65
+ # Train a gkm-SVM
66
+ clf = KernelSVM(GKMKernel(L=10, k=6, d=3, kernel_type="truncated", use_rc=True), C=1.0)
67
+ clf.fit(positives + negatives, [1]*len(positives) + [0]*len(negatives))
68
+ preds = clf.predict(test_seqs)
69
+ ```
70
+
71
+ ```python
72
+ from kmer.encoders import SpectrumEncoder
73
+ from kmer.models import DifferentialKmerScorer
74
+ from kmer.perturb import KmerShuffler
75
+
76
+ # Differential k-mer scoring with dinucleotide-shuffled background
77
+ scorer = DifferentialKmerScorer(
78
+ featurizer=SpectrumEncoder(k=6, canonical_rc=True),
79
+ background=KmerShuffler(k=2, seed=42),
80
+ )
81
+ scorer.fit(positives)
82
+ top_motifs = scorer.kmer_scores_.sort_values(ascending=False).head(20)
83
+ ```
84
+
85
+ ```python
86
+ from kmer.distance import Levenshtein, DistanceKernel
87
+ from kmer.models import KNNClassifier
88
+
89
+ # KNN with edit distance
90
+ clf = KNNClassifier(Levenshtein(), n_neighbors=5)
91
+ clf.fit(train_seqs, y_train)
92
+ ```
93
+
94
+ ## Examples (Vignettes)
95
+
96
+ The `examples/` directory (top-level, next to `kmer/`) contains a series of cross-linked Jupyter notebooks. Each notebook starts with a vignette index linking to all others.
97
+
98
+ | # | Notebook | Topic |
99
+ |---|----------|-------|
100
+ | 01 | `01_basic_kernel_matrix.ipynb` | GKMKernel: build, inspect, verify invariants |
101
+ | 02 | `02_distance_metrics_and_kernels.ipynb` | Distance metrics (Hamming, Levenshtein, NW, SW) + DistanceKernel (RBF, PSD, KernelSVM) |
102
+ | 03 | `03_svc_with_kernel.ipynb` | Train a gkm-SVM with KernelSVM |
103
+ | 04 | `04_clustering_sequences.ipynb` | Hierarchical clustering with kernel distances |
104
+ | 05 | `05_score_long_sequence.ipynb` | Sliding-window scan of a long sequence |
105
+ | 06 | `06_weighted_kernel.ipynb` | WGKMKernel positional weighting (centered motif) |
106
+ | 07 | `07_transform_and_comparison.ipynb` | All 3 schemes × 6 transforms, GKM vs WGKM |
107
+ | 08 | `08_windowed_3d_tensors.ipynb` | WindowedGKMKernel 3D output (line plot) |
108
+ | 09 | `09_spectrum_encoder_and_differential.ipynb` | SpectrumEncoder + DifferentialKmerScorer |
109
+ | 10 | `10_gappy_encoder.ipynb` | GappyEncoder with masks, gap ranges, RC collapse |
110
+ | 11 | `11_mismatch_encoder.ipynb` | MismatchEncoder and comparison to spectrum |
111
+ | 12 | `12_shuffler_and_chunker.ipynb` | KmerShuffler + Chunker for negative-set generation |
112
+
113
+ ## Citation
114
+
115
+ An article describing this package is in preparation. Until it is published, please cite the package as:
116
+
117
+ > *kmer-learn: Classical machine learning primitives for nucleotide sequences.* (in preparation).
118
+
119
+ In the meantime, if you use the package in your research, please also cite the relevant foundational works listed below.
120
+
121
+ ## References
122
+
123
+ The package builds on the following foundational works:
124
+
125
+ - **gkmSVM** — Ghandi M, Lee D, Mohammad-Noori M, Beer MA. *Enhanced regulatory sequence prediction using gapped k-mer features.* PLoS Comput Biol. 2014;10(7):e1003711.
126
+ - **LS-GKM** — Lee D. *LS-GKM: a new gkm-SVM for large-scale datasets.* Bioinformatics. 2016;32(14):2196–8.
127
+ - **Mismatch kernel** — Leslie CS, Eskin E, Cohen A, Weston J, Noble WS. *Mismatch string kernels for discriminative protein classification.* Bioinformatics. 2004;20 Suppl 1:i467–76.
128
+ - **Spectrum / gappy kernel** — Leslie CS, Eskin E, Weston J, Noble WS. *The spectrum kernel: a string kernel for SVM protein classification.* Pacific Symposium on Biocomputing. 2002:564–75.
129
+ - **Dinucleotide shuffle** — Clote P. *Efficient calculation of the number of native states of a protein.* (2003, unpublished note); Altschul SF, Erickson BW. *Significance of nucleotide sequence alignments: a method for random sequence permutation.* Bull Math Biol. 1985;47(4):541–51.
130
+ - **Philox4×32 RNG** — Salmon JK, Moraes MA, Dror RO, Shaw DE. *Parallel random numbers: as easy as 1, 2, 3.* SC '11.
131
+ - **Multinomial Naive Bayes** — Manning CD, Raghavan P, Schütze H. *Introduction to Information Retrieval.* Cambridge University Press, 2008.
132
+ - **Needleman-Wunsch** — Needleman SB, Wunsch CD. *A general method applicable to the search for similarities in the amino acid sequence of two proteins.* J Mol Biol. 1970;48(3):443–53.
133
+ - **Smith-Waterman** — Smith TF, Waterman MS. *Identification of common molecular subsequences.* J Mol Biol. 1981;147(1):195–7.
134
+ - **NUC4.4 matrix** — NCBI standard DNA scoring matrix.
135
+
136
+ Third-party libraries used as core dependencies (scikit-learn, NumPy, SciPy) or optional backends (rapidfuzz, parasail):
137
+
138
+ - **[rapidfuzz](https://github.com/maxbachmann/RapidFuzz)** — fast Levenshtein and Hamming distances.
139
+ - **[parasail](https://github.com/jeffdaily/parasail)** — SIMD-accelerated sequence alignment (Daily, 2016).
140
+ - **[scikit-learn](https://scikit-learn.org)** — SVM, Naive Bayes, KNN.
141
+ - **[NumPy](https://numpy.org) / [SciPy](https://scipy.org)** — array and sparse-matrix infrastructure.
142
+
143
+ ## License
144
+
145
+ To be specified.
@@ -0,0 +1,73 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 5,
4
+ "metadata": {
5
+ "kernelspec": {
6
+ "display_name": "Python 3",
7
+ "language": "python",
8
+ "name": "python3"
9
+ },
10
+ "language_info": {
11
+ "name": "python",
12
+ "version": "3.13.5",
13
+ "mimetype": "text/x-python",
14
+ "codemirror_mode": {
15
+ "name": "ipython",
16
+ "version": 3
17
+ },
18
+ "pygments_lexer": "ipython3",
19
+ "nbconvert_exporter": "python",
20
+ "file_extension": ".py"
21
+ }
22
+ },
23
+ "cells": [
24
+ {
25
+ "id": "40a8967d",
26
+ "cell_type": "markdown",
27
+ "source": "# 01 \u2014 GKMKernel: Basic Kernel Matrix\n\n**Vignette index:** | `**01**` GKMKernel basics | [`02` Distance metrics & kernels](02_distance_metrics_and_kernels.ipynb) | [`03` SVM with kernel](03_svc_with_kernel.ipynb) | [`04` Clustering](04_clustering_sequences.ipynb) | [`05` Long sequence scoring](05_score_long_sequence.ipynb) | [`06` Weighted (WGKM) kernel](06_weighted_kernel.ipynb) | [`07` Transforms & comparison](07_transform_and_comparison.ipynb) | [`08` Windowed 3D tensors](08_windowed_3d_tensors.ipynb) | [`09` Spectrum encoder & NB](09_spectrum_encoder_and_differential.ipynb) | [`10` Gappy encoder](10_gappy_encoder.ipynb) | [`11` Mismatch encoder](11_mismatch_encoder.ipynb) | [`12` Shuffler & chunker](12_shuffler_and_chunker.ipynb)\n\nThis vignette shows how to build a gkm kernel matrix and verify its basic invariants (symmetry, unit diagonal, PSD).",
28
+ "metadata": {}
29
+ },
30
+ {
31
+ "id": "0401a422",
32
+ "cell_type": "code",
33
+ "metadata": {
34
+ "execution": {
35
+ "iopub.status.busy": "2026-06-21T16:14:02.435770Z",
36
+ "iopub.execute_input": "2026-06-21T16:14:02.435946Z",
37
+ "shell.execute_reply": "2026-06-21T16:14:02.509236Z",
38
+ "iopub.status.idle": "2026-06-21T16:14:02.510335Z"
39
+ }
40
+ },
41
+ "execution_count": 1,
42
+ "source": "import numpy as np\nfrom kmer.kernels import GKMKernel\n\nseqs = [\n \"ACGTACGTACGTACGTACGT\",\n \"TTTTAAAAGGGGCCCCAAAA\",\n \"ACGTTGCATGCATGCATGCA\",\n \"CCCCGGGGTTTTAAAACCCC\",\n \"ATATGCGCATATGCGCATAT\",\n \"GAATTCGAATTCGAATTCGA\",\n]\n\nkern = GKMKernel(L=10, k=6, d=3, kernel_type=\"truncated\", use_rc=True)\nkern.set_references(seqs)\nK = np.asarray(kern.kernel())\nprint(\"Shape:\", K.shape)\nprint(\"Symmetric:\", np.allclose(K, K.T))\nprint(\"Unit diagonal:\", np.allclose(np.diag(K), 1.0))\nprint(\"Min eigenvalue:\", np.linalg.eigvalsh(K).min(), \"(PSD if >= 0)\")",
43
+ "outputs": [
44
+ {
45
+ "output_type": "stream",
46
+ "name": "stdout",
47
+ "text": "Shape: (6, 6)\nSymmetric: True\nUnit diagonal: True\nMin eigenvalue: 0.98676708653555 (PSD if >= 0)\n"
48
+ }
49
+ ]
50
+ },
51
+ {
52
+ "id": "81e9494f",
53
+ "cell_type": "code",
54
+ "metadata": {
55
+ "execution": {
56
+ "iopub.status.busy": "2026-06-21T16:14:02.511866Z",
57
+ "iopub.execute_input": "2026-06-21T16:14:02.512072Z",
58
+ "iopub.status.idle": "2026-06-21T16:14:02.529055Z",
59
+ "shell.execute_reply": "2026-06-21T16:14:02.528680Z"
60
+ }
61
+ },
62
+ "execution_count": 2,
63
+ "source": "# Cross-kernel: query sequences vs reference set\nqueries = [\"ACGTACGTACGTACGTACGTAC\", \"CCCCGGGGTTTTAAAACCCCAG\"]\nkern.set_references(seqs)\nKq = np.asarray(kern.kernel(X_query=queries))\nprint(\"Cross-kernel shape:\", Kq.shape)",
64
+ "outputs": [
65
+ {
66
+ "output_type": "stream",
67
+ "name": "stdout",
68
+ "text": "Cross-kernel shape: (2, 6)\n"
69
+ }
70
+ ]
71
+ }
72
+ ]
73
+ }