fiberhmm 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fiberhmm-2.0.0/LICENSE +21 -0
- fiberhmm-2.0.0/PKG-INFO +328 -0
- fiberhmm-2.0.0/README.md +283 -0
- fiberhmm-2.0.0/apply_model.py +6 -0
- fiberhmm-2.0.0/extract_tags.py +6 -0
- fiberhmm-2.0.0/fiberhmm/__init__.py +10 -0
- fiberhmm-2.0.0/fiberhmm/cli/__init__.py +2 -0
- fiberhmm-2.0.0/fiberhmm/cli/apply.py +298 -0
- fiberhmm-2.0.0/fiberhmm/cli/common.py +128 -0
- fiberhmm-2.0.0/fiberhmm/cli/extract_tags.py +661 -0
- fiberhmm-2.0.0/fiberhmm/cli/generate_probs.py +499 -0
- fiberhmm-2.0.0/fiberhmm/cli/train.py +1023 -0
- fiberhmm-2.0.0/fiberhmm/cli/utils.py +894 -0
- fiberhmm-2.0.0/fiberhmm/core/__init__.py +5 -0
- fiberhmm-2.0.0/fiberhmm/core/bam_reader.py +849 -0
- fiberhmm-2.0.0/fiberhmm/core/hmm.py +1089 -0
- fiberhmm-2.0.0/fiberhmm/core/model_io.py +267 -0
- fiberhmm-2.0.0/fiberhmm/inference/__init__.py +29 -0
- fiberhmm-2.0.0/fiberhmm/inference/bam_output.py +581 -0
- fiberhmm-2.0.0/fiberhmm/inference/engine.py +418 -0
- fiberhmm-2.0.0/fiberhmm/inference/parallel.py +1528 -0
- fiberhmm-2.0.0/fiberhmm/inference/stats.py +433 -0
- fiberhmm-2.0.0/fiberhmm/probabilities/__init__.py +4 -0
- fiberhmm-2.0.0/fiberhmm/probabilities/context_counter.py +589 -0
- fiberhmm-2.0.0/fiberhmm/probabilities/stats.py +310 -0
- fiberhmm-2.0.0/fiberhmm/probabilities/utils.py +90 -0
- fiberhmm-2.0.0/fiberhmm.egg-info/PKG-INFO +328 -0
- fiberhmm-2.0.0/fiberhmm.egg-info/SOURCES.txt +42 -0
- fiberhmm-2.0.0/fiberhmm.egg-info/dependency_links.txt +1 -0
- fiberhmm-2.0.0/fiberhmm.egg-info/entry_points.txt +6 -0
- fiberhmm-2.0.0/fiberhmm.egg-info/requires.txt +21 -0
- fiberhmm-2.0.0/fiberhmm.egg-info/top_level.txt +6 -0
- fiberhmm-2.0.0/fiberhmm_utils.py +6 -0
- fiberhmm-2.0.0/generate_probs.py +6 -0
- fiberhmm-2.0.0/pyproject.toml +130 -0
- fiberhmm-2.0.0/setup.cfg +4 -0
- fiberhmm-2.0.0/tests/test_bam_reader.py +352 -0
- fiberhmm-2.0.0/tests/test_cli_common.py +211 -0
- fiberhmm-2.0.0/tests/test_hmm.py +433 -0
- fiberhmm-2.0.0/tests/test_inference_engine.py +161 -0
- fiberhmm-2.0.0/tests/test_inference_parallel.py +59 -0
- fiberhmm-2.0.0/tests/test_model_io.py +174 -0
- fiberhmm-2.0.0/tests/test_package_consistency.py +152 -0
- fiberhmm-2.0.0/train_model.py +6 -0
fiberhmm-2.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 FiberHMM Authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
fiberhmm-2.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fiberhmm
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Hidden Markov Model for calling chromatin footprints from fiber-seq and DAF-seq data
|
|
5
|
+
Author: FiberHMM Authors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/fiberseq/FiberHMM
|
|
8
|
+
Project-URL: Documentation, https://github.com/fiberseq/FiberHMM#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/fiberseq/FiberHMM
|
|
10
|
+
Project-URL: Issues, https://github.com/fiberseq/FiberHMM/issues
|
|
11
|
+
Keywords: bioinformatics,chromatin,fiber-seq,DAF-seq,hidden-markov-model,nucleosome,footprinting,epigenetics,single-molecule
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: MacOS
|
|
17
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
24
|
+
Requires-Python: >=3.9
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: numpy>=1.20
|
|
28
|
+
Requires-Dist: scipy>=1.7
|
|
29
|
+
Requires-Dist: pandas>=1.3
|
|
30
|
+
Requires-Dist: pysam>=0.19
|
|
31
|
+
Requires-Dist: tqdm>=4.60
|
|
32
|
+
Provides-Extra: numba
|
|
33
|
+
Requires-Dist: numba>=0.55; extra == "numba"
|
|
34
|
+
Provides-Extra: plots
|
|
35
|
+
Requires-Dist: matplotlib>=3.5; extra == "plots"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: numba>=0.55; extra == "all"
|
|
38
|
+
Requires-Dist: matplotlib>=3.5; extra == "all"
|
|
39
|
+
Provides-Extra: dev
|
|
40
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
42
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
43
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# FiberHMM
|
|
47
|
+
|
|
48
|
+
Hidden Markov Model toolkit for calling chromatin footprints from fiber-seq and DAF-seq single-molecule data.
|
|
49
|
+
|
|
50
|
+
FiberHMM identifies nucleosome-protected regions (footprints) and accessible regions (methylase-sensitive patches, MSPs) from single-molecule DNA modification data, including m6A methylation (fiber-seq) and deamination marks (DAF-seq).
|
|
51
|
+
|
|
52
|
+
## Key Features
|
|
53
|
+
|
|
54
|
+
- **No genome context files** -- hexamer context computed directly from read sequences
|
|
55
|
+
- **Fibertools-compatible output** -- tagged BAM with `ns`/`nl`/`as`/`al` tags, ready for downstream tools
|
|
56
|
+
- **Native HMM implementation** -- no hmmlearn dependency; Numba JIT optional for ~10x speedup
|
|
57
|
+
- **Region-parallel processing** -- scales linearly with cores for large genomes
|
|
58
|
+
- **Multi-platform** -- supports PacBio fiber-seq, Nanopore fiber-seq, and DAF-seq
|
|
59
|
+
- **Legacy model support** -- loads old hmmlearn-trained pickle/NPZ models seamlessly
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
### Using pip
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install fiberhmm
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### From source
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/fiberseq/FiberHMM.git
|
|
73
|
+
cd FiberHMM
|
|
74
|
+
pip install -e .
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Optional dependencies
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install numba # ~10x faster HMM computation
|
|
81
|
+
pip install matplotlib # --stats visualization
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
For bigBed output, install [`bedToBigBed`](https://hgdownload.soe.ucsc.edu/admin/exe/) from UCSC tools.
|
|
85
|
+
|
|
86
|
+
## Quick Start
|
|
87
|
+
|
|
88
|
+
### 1. Generate emission probabilities
|
|
89
|
+
|
|
90
|
+
Requires accessible (naked DNA) and inaccessible (native chromatin) control BAMs:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
python generate_probs.py \
|
|
94
|
+
-a accessible_control.bam \
|
|
95
|
+
-u inaccessible_control.bam \
|
|
96
|
+
-o probs/ \
|
|
97
|
+
--stats
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 2. Train HMM
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
python train_model.py \
|
|
104
|
+
-i sample.bam \
|
|
105
|
+
-p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
|
|
106
|
+
-o models/ \
|
|
107
|
+
--stats
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### 3. Call footprints
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
python apply_model.py \
|
|
114
|
+
-i experiment.bam \
|
|
115
|
+
-m models/best-model.json \
|
|
116
|
+
-o output/ \
|
|
117
|
+
-c 8 \
|
|
118
|
+
--scores
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 4. Extract to BED12/bigBed
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
python extract_tags.py -i output/experiment_footprints.bam
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Pre-trained Models
|
|
128
|
+
|
|
129
|
+
FiberHMM ships with pre-trained models in `models/` ready for immediate use:
|
|
130
|
+
|
|
131
|
+
| Model | File | Enzyme | Platform | Mode |
|
|
132
|
+
|-------|------|--------|----------|------|
|
|
133
|
+
| **Hia5 PacBio** | `hia5_pacbio.json` | Hia5 (m6A) | PacBio | `pacbio-fiber` |
|
|
134
|
+
| **Hia5 Nanopore** | `hia5_nanopore.json` | Hia5 (m6A) | Nanopore | `nanopore-fiber` |
|
|
135
|
+
| **DddA PacBio** | `ddda_pacbio.json` | DddA (deamination) | PacBio | `daf` |
|
|
136
|
+
| **DddB Nanopore** | `dddb_nanopore.json` | DddB (deamination) | Nanopore | `daf` |
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
# Example: call footprints with a pre-trained model
|
|
140
|
+
python apply_model.py -i experiment.bam -m models/hia5_pacbio.json -o output/ -c 8
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Analysis Modes
|
|
144
|
+
|
|
145
|
+
| Mode | Flag | Description | Target bases |
|
|
146
|
+
|------|------|-------------|--------------|
|
|
147
|
+
| **PacBio fiber-seq** | `--mode pacbio-fiber` | Default. m6A at A and T (both strands) | A, T (with RC) |
|
|
148
|
+
| **Nanopore fiber-seq** | `--mode nanopore-fiber` | m6A at A only (single strand) | A only |
|
|
149
|
+
| **DAF-seq** | `--mode daf` | Deamination at C/G (strand-specific) | C or G |
|
|
150
|
+
|
|
151
|
+
All scripts accept `--mode`; context size `-k` (3-10) determines the k-mer context table size.
|
|
152
|
+
|
|
153
|
+
**Note on mode selection:** The `pacbio-fiber` vs `nanopore-fiber` distinction only matters for Hia5 (m6A), where PacBio detects modifications on both strands while Nanopore detects only one. For deaminase-based methods (DddA, DddB), `--mode daf` is always used regardless of sequencing platform -- the chemistry is inherently strand-specific. Accuracy may differ between platforms, but the mode is the same.
|
|
154
|
+
|
|
155
|
+
## Output
|
|
156
|
+
|
|
157
|
+
### BAM Tags
|
|
158
|
+
|
|
159
|
+
`apply_model.py` adds footprint tags compatible with the fibertools ecosystem:
|
|
160
|
+
|
|
161
|
+
| Tag | Type | Description |
|
|
162
|
+
|-----|------|-------------|
|
|
163
|
+
| `ns` | B,I | Nucleosome/footprint starts (0-based query coords) |
|
|
164
|
+
| `nl` | B,I | Nucleosome/footprint lengths |
|
|
165
|
+
| `as` | B,I | Accessible/MSP starts |
|
|
166
|
+
| `al` | B,I | Accessible/MSP lengths |
|
|
167
|
+
| `nq` | B,C | Footprint quality scores (0-255, with `--scores`) |
|
|
168
|
+
| `aq` | B,C | MSP quality scores (0-255, with `--scores`) |
|
|
169
|
+
|
|
170
|
+
### BED12/bigBed Extraction
|
|
171
|
+
|
|
172
|
+
Use `extract_tags.py` to extract features from tagged BAMs for browser visualization:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
# Extract all feature types to bigBed
|
|
176
|
+
python extract_tags.py -i output/sample_footprints.bam
|
|
177
|
+
|
|
178
|
+
# Extract only footprints
|
|
179
|
+
python extract_tags.py -i output/sample_footprints.bam --footprint
|
|
180
|
+
|
|
181
|
+
# Keep BED files alongside bigBed
|
|
182
|
+
python extract_tags.py -i output/sample_footprints.bam --keep-bed
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Scripts Reference
|
|
186
|
+
|
|
187
|
+
### generate_probs.py / `fiberhmm-probs`
|
|
188
|
+
|
|
189
|
+
Generate emission probability tables from accessible and inaccessible control BAMs.
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
python generate_probs.py \
|
|
193
|
+
-a accessible_control.bam \
|
|
194
|
+
-u inaccessible_control.bam \
|
|
195
|
+
-o probs/ \
|
|
196
|
+
--mode pacbio-fiber \
|
|
197
|
+
-k 3 4 5 6 \
|
|
198
|
+
--stats
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### train_model.py / `fiberhmm-train`
|
|
202
|
+
|
|
203
|
+
Train the HMM on BAM data using precomputed emission probabilities.
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
python train_model.py \
|
|
207
|
+
-i sample.bam \
|
|
208
|
+
-p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
|
|
209
|
+
-o models/ \
|
|
210
|
+
-k 3 \
|
|
211
|
+
--stats
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
Output:
|
|
215
|
+
```
|
|
216
|
+
models/
|
|
217
|
+
├── best-model.json # Primary format (recommended)
|
|
218
|
+
├── best-model.npz # NumPy format
|
|
219
|
+
├── all_models.json # All training iterations
|
|
220
|
+
├── training-reads.tsv # Read IDs used for training
|
|
221
|
+
├── config.json # Training parameters
|
|
222
|
+
└── plots/ # (with --stats)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### apply_model.py / `fiberhmm-apply`
|
|
226
|
+
|
|
227
|
+
Apply a trained model to call footprints. Supports region-parallel processing.
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
python apply_model.py \
|
|
231
|
+
-i experiment.bam \
|
|
232
|
+
-m models/best-model.json \
|
|
233
|
+
-o output/ \
|
|
234
|
+
-c 8 \
|
|
235
|
+
--scores
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
Key options:
|
|
239
|
+
|
|
240
|
+
| Flag | Default | Description |
|
|
241
|
+
|------|---------|-------------|
|
|
242
|
+
| `-c/--cores` | 1 | CPU cores (0 = auto-detect) |
|
|
243
|
+
| `--region-size` | 10000000 | Region size for parallel chunks |
|
|
244
|
+
| `--skip-scaffolds` | false | Skip scaffold/contig chromosomes |
|
|
245
|
+
| `--chroms` | all | Process only these chromosomes |
|
|
246
|
+
| `--scores` | false | Compute per-footprint confidence scores |
|
|
247
|
+
| `-q/--min-mapq` | 20 | Min mapping quality |
|
|
248
|
+
| `-e/--edge-trim` | 10 | Edge masking (bp) |
|
|
249
|
+
|
|
250
|
+
### extract_tags.py / `fiberhmm-extract`
|
|
251
|
+
|
|
252
|
+
Extract footprint/MSP/m6A/m5C features from tagged BAMs to BED12/bigBed.
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
python extract_tags.py -i output/sample_footprints.bam -o output/ -c 8
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### fiberhmm-utils
|
|
259
|
+
|
|
260
|
+
Consolidated utility for model and probability management. Four subcommands:
|
|
261
|
+
|
|
262
|
+
**convert** -- Convert legacy pickle/NPZ models to JSON:
|
|
263
|
+
```bash
|
|
264
|
+
fiberhmm-utils convert old_model.pickle new_model.json
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
**inspect** -- Print model metadata, parameters, and emission statistics:
|
|
268
|
+
```bash
|
|
269
|
+
fiberhmm-utils inspect model.json
|
|
270
|
+
fiberhmm-utils inspect model.json --full # full emission table
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
**transfer** -- Transfer emission probabilities between modalities (e.g., fiber-seq to DAF-seq) using accessibility priors from a matched cell type:
|
|
274
|
+
```bash
|
|
275
|
+
fiberhmm-utils transfer \
|
|
276
|
+
--target daf_sample.bam \
|
|
277
|
+
--reference-bam fiberseq_footprints.bam \
|
|
278
|
+
-o daf_probs/ \
|
|
279
|
+
--mode daf \
|
|
280
|
+
--stats
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
**adjust** -- Scale emission probabilities in a model (clamped to [0, 1]):
|
|
284
|
+
```bash
|
|
285
|
+
fiberhmm-utils adjust model.json --state accessible --scale 1.1 -o adjusted.json
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Fibertools Integration
|
|
289
|
+
|
|
290
|
+
FiberHMM produces BAM output with the same tag conventions used by [fibertools](https://github.com/fiberseq/fibertools-rs):
|
|
291
|
+
|
|
292
|
+
- `ns`/`nl` for nucleosome footprints
|
|
293
|
+
- `as`/`al` for methylase-sensitive patches (MSPs)
|
|
294
|
+
- `nq`/`aq` for quality scores
|
|
295
|
+
|
|
296
|
+
This means FiberHMM output can be used directly with any tool in the fibertools ecosystem, including `ft extract`, downstream analysis pipelines, and genome browsers that support fibertools-style BAM tags.
|
|
297
|
+
|
|
298
|
+
## FiberBrowser
|
|
299
|
+
|
|
300
|
+
FiberBrowser, a dedicated genome browser for single-molecule chromatin data, is coming soon.
|
|
301
|
+
|
|
302
|
+
## Performance Tips
|
|
303
|
+
|
|
304
|
+
1. **Use multiple cores**: `-c 8` (or more) for parallel processing
|
|
305
|
+
2. **Adjust region size for small genomes**:
|
|
306
|
+
- Yeast (~12 MB): `--region-size 500000`
|
|
307
|
+
- Drosophila (~140 MB): `--region-size 2000000`
|
|
308
|
+
- Human/mammalian: default 10 MB is fine
|
|
309
|
+
3. **Skip scaffolds**: `--skip-scaffolds` to avoid thousands of small contigs
|
|
310
|
+
4. **Install numba**: `pip install numba` for ~10x faster HMM training
|
|
311
|
+
|
|
312
|
+
## Model Formats
|
|
313
|
+
|
|
314
|
+
| Format | Extension | Description |
|
|
315
|
+
|--------|-----------|-------------|
|
|
316
|
+
| **JSON** | `.json` | Primary format -- portable, human-readable |
|
|
317
|
+
| NPZ | `.npz` | NumPy archive -- supported for loading |
|
|
318
|
+
| Pickle | `.pickle` | Legacy format -- supported for loading |
|
|
319
|
+
|
|
320
|
+
New models are always saved in JSON. Convert legacy models with:
|
|
321
|
+
|
|
322
|
+
```bash
|
|
323
|
+
fiberhmm-utils convert old_model.pickle new_model.json
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
## License
|
|
327
|
+
|
|
328
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
fiberhmm-2.0.0/README.md
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# FiberHMM
|
|
2
|
+
|
|
3
|
+
Hidden Markov Model toolkit for calling chromatin footprints from fiber-seq and DAF-seq single-molecule data.
|
|
4
|
+
|
|
5
|
+
FiberHMM identifies nucleosome-protected regions (footprints) and accessible regions (methylase-sensitive patches, MSPs) from single-molecule DNA modification data, including m6A methylation (fiber-seq) and deamination marks (DAF-seq).
|
|
6
|
+
|
|
7
|
+
## Key Features
|
|
8
|
+
|
|
9
|
+
- **No genome context files** -- k-mer sequence context computed directly from read sequences
|
|
10
|
+
- **Fibertools-compatible output** -- tagged BAM with `ns`/`nl`/`as`/`al` tags, ready for downstream tools
|
|
11
|
+
- **Native HMM implementation** -- no hmmlearn dependency; Numba JIT optional for ~10x speedup
|
|
12
|
+
- **Region-parallel processing** -- scales linearly with cores for large genomes
|
|
13
|
+
- **Multi-platform** -- supports PacBio fiber-seq, Nanopore fiber-seq, and DAF-seq
|
|
14
|
+
- **Legacy model support** -- loads old hmmlearn-trained pickle/NPZ models seamlessly
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
### Using pip
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install fiberhmm
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### From source
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
git clone https://github.com/fiberseq/FiberHMM.git
|
|
28
|
+
cd FiberHMM
|
|
29
|
+
pip install -e .
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Optional dependencies
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install numba # ~10x faster HMM computation
|
|
36
|
+
pip install matplotlib # --stats visualization
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
For bigBed output, install [`bedToBigBed`](https://hgdownload.soe.ucsc.edu/admin/exe/) from UCSC tools.
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
### 1. Generate emission probabilities
|
|
44
|
+
|
|
45
|
+
Requires accessible (naked DNA) and inaccessible (native chromatin) control BAMs:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
python generate_probs.py \
|
|
49
|
+
-a accessible_control.bam \
|
|
50
|
+
-u inaccessible_control.bam \
|
|
51
|
+
-o probs/ \
|
|
52
|
+
--stats
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### 2. Train HMM
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
python train_model.py \
|
|
59
|
+
-i sample.bam \
|
|
60
|
+
-p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
|
|
61
|
+
-o models/ \
|
|
62
|
+
--stats
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 3. Call footprints
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
python apply_model.py \
|
|
69
|
+
-i experiment.bam \
|
|
70
|
+
-m models/best-model.json \
|
|
71
|
+
-o output/ \
|
|
72
|
+
-c 8 \
|
|
73
|
+
--scores
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 4. Extract to BED12/bigBed
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
python extract_tags.py -i output/experiment_footprints.bam
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Pre-trained Models
|
|
83
|
+
|
|
84
|
+
FiberHMM ships with pre-trained models in `models/` ready for immediate use:
|
|
85
|
+
|
|
86
|
+
| Model | File | Enzyme | Platform | Mode |
|
|
87
|
+
|-------|------|--------|----------|------|
|
|
88
|
+
| **Hia5 PacBio** | `hia5_pacbio.json` | Hia5 (m6A) | PacBio | `pacbio-fiber` |
|
|
89
|
+
| **Hia5 Nanopore** | `hia5_nanopore.json` | Hia5 (m6A) | Nanopore | `nanopore-fiber` |
|
|
90
|
+
| **DddA PacBio** | `ddda_pacbio.json` | DddA (deamination) | PacBio | `daf` |
|
|
91
|
+
| **DddB Nanopore** | `dddb_nanopore.json` | DddB (deamination) | Nanopore | `daf` |
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Example: call footprints with a pre-trained model
|
|
95
|
+
python apply_model.py -i experiment.bam -m models/hia5_pacbio.json -o output/ -c 8
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Analysis Modes
|
|
99
|
+
|
|
100
|
+
| Mode | Flag | Description | Target bases |
|
|
101
|
+
|------|------|-------------|--------------|
|
|
102
|
+
| **PacBio fiber-seq** | `--mode pacbio-fiber` | Default. m6A at A and T (both strands) | A, T (with RC) |
|
|
103
|
+
| **Nanopore fiber-seq** | `--mode nanopore-fiber` | m6A at A only (single strand) | A only |
|
|
104
|
+
| **DAF-seq** | `--mode daf` | Deamination at C/G (strand-specific) | C or G |
|
|
105
|
+
|
|
106
|
+
All scripts accept `--mode`; context size `-k` (3-10) determines the k-mer context table size.
|
|
107
|
+
|
|
108
|
+
**Note on mode selection:** The `pacbio-fiber` vs `nanopore-fiber` distinction only matters for Hia5 (m6A), where PacBio detects modifications on both strands while Nanopore detects only one. For deaminase-based methods (DddA, DddB), `--mode daf` is always used regardless of sequencing platform -- the chemistry is inherently strand-specific. Accuracy may differ between platforms, but the mode is the same.
|
|
109
|
+
|
|
110
|
+
## Output
|
|
111
|
+
|
|
112
|
+
### BAM Tags
|
|
113
|
+
|
|
114
|
+
`apply_model.py` adds footprint tags compatible with the fibertools ecosystem:
|
|
115
|
+
|
|
116
|
+
| Tag | Type | Description |
|
|
117
|
+
|-----|------|-------------|
|
|
118
|
+
| `ns` | B,I | Nucleosome/footprint starts (0-based query coords) |
|
|
119
|
+
| `nl` | B,I | Nucleosome/footprint lengths |
|
|
120
|
+
| `as` | B,I | Accessible/MSP starts |
|
|
121
|
+
| `al` | B,I | Accessible/MSP lengths |
|
|
122
|
+
| `nq` | B,C | Footprint quality scores (0-255, with `--scores`) |
|
|
123
|
+
| `aq` | B,C | MSP quality scores (0-255, with `--scores`) |
|
|
124
|
+
|
|
125
|
+
### BED12/bigBed Extraction
|
|
126
|
+
|
|
127
|
+
Use `extract_tags.py` to extract features from tagged BAMs for browser visualization:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
# Extract all feature types to bigBed
|
|
131
|
+
python extract_tags.py -i output/sample_footprints.bam
|
|
132
|
+
|
|
133
|
+
# Extract only footprints
|
|
134
|
+
python extract_tags.py -i output/sample_footprints.bam --footprint
|
|
135
|
+
|
|
136
|
+
# Keep BED files alongside bigBed
|
|
137
|
+
python extract_tags.py -i output/sample_footprints.bam --keep-bed
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Scripts Reference
|
|
141
|
+
|
|
142
|
+
### generate_probs.py / `fiberhmm-probs`
|
|
143
|
+
|
|
144
|
+
Generate emission probability tables from accessible and inaccessible control BAMs.
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
python generate_probs.py \
|
|
148
|
+
-a accessible_control.bam \
|
|
149
|
+
-u inaccessible_control.bam \
|
|
150
|
+
-o probs/ \
|
|
151
|
+
--mode pacbio-fiber \
|
|
152
|
+
-k 3 4 5 6 \
|
|
153
|
+
--stats
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### train_model.py / `fiberhmm-train`
|
|
157
|
+
|
|
158
|
+
Train the HMM on BAM data using precomputed emission probabilities.
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
python train_model.py \
|
|
162
|
+
-i sample.bam \
|
|
163
|
+
-p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
|
|
164
|
+
-o models/ \
|
|
165
|
+
-k 3 \
|
|
166
|
+
--stats
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Output:
|
|
170
|
+
```
|
|
171
|
+
models/
|
|
172
|
+
├── best-model.json # Primary format (recommended)
|
|
173
|
+
├── best-model.npz # NumPy format
|
|
174
|
+
├── all_models.json # All training iterations
|
|
175
|
+
├── training-reads.tsv # Read IDs used for training
|
|
176
|
+
├── config.json # Training parameters
|
|
177
|
+
└── plots/ # (with --stats)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### apply_model.py / `fiberhmm-apply`
|
|
181
|
+
|
|
182
|
+
Apply a trained model to call footprints. Supports region-parallel processing.
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
python apply_model.py \
|
|
186
|
+
-i experiment.bam \
|
|
187
|
+
-m models/best-model.json \
|
|
188
|
+
-o output/ \
|
|
189
|
+
-c 8 \
|
|
190
|
+
--scores
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Key options:
|
|
194
|
+
|
|
195
|
+
| Flag | Default | Description |
|
|
196
|
+
|------|---------|-------------|
|
|
197
|
+
| `-c/--cores` | 1 | CPU cores (0 = auto-detect) |
|
|
198
|
+
| `--region-size` | 10000000 | Region size for parallel chunks |
|
|
199
|
+
| `--skip-scaffolds` | false | Skip scaffold/contig chromosomes |
|
|
200
|
+
| `--chroms` | all | Process only these chromosomes |
|
|
201
|
+
| `--scores` | false | Compute per-footprint confidence scores |
|
|
202
|
+
| `-q/--min-mapq` | 20 | Min mapping quality |
|
|
203
|
+
| `-e/--edge-trim` | 10 | Edge masking (bp) |
|
|
204
|
+
|
|
205
|
+
### extract_tags.py / `fiberhmm-extract`
|
|
206
|
+
|
|
207
|
+
Extract footprint/MSP/m6A/m5C features from tagged BAMs to BED12/bigBed.
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
python extract_tags.py -i output/sample_footprints.bam -o output/ -c 8
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### fiberhmm-utils
|
|
214
|
+
|
|
215
|
+
Consolidated utility for model and probability management. Four subcommands:
|
|
216
|
+
|
|
217
|
+
**convert** -- Convert legacy pickle/NPZ models to JSON:
|
|
218
|
+
```bash
|
|
219
|
+
fiberhmm-utils convert old_model.pickle new_model.json
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**inspect** -- Print model metadata, parameters, and emission statistics:
|
|
223
|
+
```bash
|
|
224
|
+
fiberhmm-utils inspect model.json
|
|
225
|
+
fiberhmm-utils inspect model.json --full # full emission table
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
**transfer** -- Transfer emission probabilities between modalities (e.g., fiber-seq to DAF-seq) using accessibility priors from a matched cell type:
|
|
229
|
+
```bash
|
|
230
|
+
fiberhmm-utils transfer \
|
|
231
|
+
--target daf_sample.bam \
|
|
232
|
+
--reference-bam fiberseq_footprints.bam \
|
|
233
|
+
-o daf_probs/ \
|
|
234
|
+
--mode daf \
|
|
235
|
+
--stats
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
**adjust** -- Scale emission probabilities in a model (clamped to [0, 1]):
|
|
239
|
+
```bash
|
|
240
|
+
fiberhmm-utils adjust model.json --state accessible --scale 1.1 -o adjusted.json
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## Fibertools Integration
|
|
244
|
+
|
|
245
|
+
FiberHMM produces BAM output with the same tag conventions used by [fibertools](https://github.com/fiberseq/fibertools-rs):
|
|
246
|
+
|
|
247
|
+
- `ns`/`nl` for nucleosome footprints
|
|
248
|
+
- `as`/`al` for methylase-sensitive patches (MSPs)
|
|
249
|
+
- `nq`/`aq` for quality scores
|
|
250
|
+
|
|
251
|
+
This means FiberHMM output can be used directly with any tool in the fibertools ecosystem, including `ft extract`, downstream analysis pipelines, and genome browsers that support fibertools-style BAM tags.
|
|
252
|
+
|
|
253
|
+
## FiberBrowser
|
|
254
|
+
|
|
255
|
+
FiberBrowser, a dedicated genome browser for single-molecule chromatin data, is coming soon.
|
|
256
|
+
|
|
257
|
+
## Performance Tips
|
|
258
|
+
|
|
259
|
+
1. **Use multiple cores**: `-c 8` (or more) for parallel processing
|
|
260
|
+
2. **Adjust region size for small genomes**:
|
|
261
|
+
- Yeast (~12 MB): `--region-size 500000`
|
|
262
|
+
- Drosophila (~140 MB): `--region-size 2000000`
|
|
263
|
+
- Human/mammalian: default 10 MB is fine
|
|
264
|
+
3. **Skip scaffolds**: `--skip-scaffolds` to avoid thousands of small contigs
|
|
265
|
+
4. **Install numba**: `pip install numba` for ~10x faster HMM training
|
|
266
|
+
|
|
267
|
+
## Model Formats
|
|
268
|
+
|
|
269
|
+
| Format | Extension | Description |
|
|
270
|
+
|--------|-----------|-------------|
|
|
271
|
+
| **JSON** | `.json` | Primary format -- portable, human-readable |
|
|
272
|
+
| NPZ | `.npz` | NumPy archive -- supported for loading |
|
|
273
|
+
| Pickle | `.pickle` | Legacy format -- supported for loading |
|
|
274
|
+
|
|
275
|
+
New models are always saved in JSON. Convert legacy models with:
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
fiberhmm-utils convert old_model.pickle new_model.json
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
## License
|
|
282
|
+
|
|
283
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
"""
FiberHMM - Hidden Markov Model toolkit for chromatin footprint calling
from fiber-seq and DAF-seq single-molecule data.
"""

__version__ = "2.0.0"

from fiberhmm.core.hmm import FiberHMM
from fiberhmm.core.bam_reader import ContextEncoder, FiberRead, read_bam
from fiberhmm.core.model_io import load_model, save_model, load_model_with_metadata

# Explicit public API: restricts `from fiberhmm import *` and documentation
# tooling to the names this package deliberately re-exports.
__all__ = [
    "__version__",
    "FiberHMM",
    "ContextEncoder",
    "FiberRead",
    "read_bam",
    "load_model",
    "save_model",
    "load_model_with_metadata",
]