fiberhmm 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. fiberhmm-2.0.0/LICENSE +21 -0
  2. fiberhmm-2.0.0/PKG-INFO +328 -0
  3. fiberhmm-2.0.0/README.md +283 -0
  4. fiberhmm-2.0.0/apply_model.py +6 -0
  5. fiberhmm-2.0.0/extract_tags.py +6 -0
  6. fiberhmm-2.0.0/fiberhmm/__init__.py +10 -0
  7. fiberhmm-2.0.0/fiberhmm/cli/__init__.py +2 -0
  8. fiberhmm-2.0.0/fiberhmm/cli/apply.py +298 -0
  9. fiberhmm-2.0.0/fiberhmm/cli/common.py +128 -0
  10. fiberhmm-2.0.0/fiberhmm/cli/extract_tags.py +661 -0
  11. fiberhmm-2.0.0/fiberhmm/cli/generate_probs.py +499 -0
  12. fiberhmm-2.0.0/fiberhmm/cli/train.py +1023 -0
  13. fiberhmm-2.0.0/fiberhmm/cli/utils.py +894 -0
  14. fiberhmm-2.0.0/fiberhmm/core/__init__.py +5 -0
  15. fiberhmm-2.0.0/fiberhmm/core/bam_reader.py +849 -0
  16. fiberhmm-2.0.0/fiberhmm/core/hmm.py +1089 -0
  17. fiberhmm-2.0.0/fiberhmm/core/model_io.py +267 -0
  18. fiberhmm-2.0.0/fiberhmm/inference/__init__.py +29 -0
  19. fiberhmm-2.0.0/fiberhmm/inference/bam_output.py +581 -0
  20. fiberhmm-2.0.0/fiberhmm/inference/engine.py +418 -0
  21. fiberhmm-2.0.0/fiberhmm/inference/parallel.py +1528 -0
  22. fiberhmm-2.0.0/fiberhmm/inference/stats.py +433 -0
  23. fiberhmm-2.0.0/fiberhmm/probabilities/__init__.py +4 -0
  24. fiberhmm-2.0.0/fiberhmm/probabilities/context_counter.py +589 -0
  25. fiberhmm-2.0.0/fiberhmm/probabilities/stats.py +310 -0
  26. fiberhmm-2.0.0/fiberhmm/probabilities/utils.py +90 -0
  27. fiberhmm-2.0.0/fiberhmm.egg-info/PKG-INFO +328 -0
  28. fiberhmm-2.0.0/fiberhmm.egg-info/SOURCES.txt +42 -0
  29. fiberhmm-2.0.0/fiberhmm.egg-info/dependency_links.txt +1 -0
  30. fiberhmm-2.0.0/fiberhmm.egg-info/entry_points.txt +6 -0
  31. fiberhmm-2.0.0/fiberhmm.egg-info/requires.txt +21 -0
  32. fiberhmm-2.0.0/fiberhmm.egg-info/top_level.txt +6 -0
  33. fiberhmm-2.0.0/fiberhmm_utils.py +6 -0
  34. fiberhmm-2.0.0/generate_probs.py +6 -0
  35. fiberhmm-2.0.0/pyproject.toml +130 -0
  36. fiberhmm-2.0.0/setup.cfg +4 -0
  37. fiberhmm-2.0.0/tests/test_bam_reader.py +352 -0
  38. fiberhmm-2.0.0/tests/test_cli_common.py +211 -0
  39. fiberhmm-2.0.0/tests/test_hmm.py +433 -0
  40. fiberhmm-2.0.0/tests/test_inference_engine.py +161 -0
  41. fiberhmm-2.0.0/tests/test_inference_parallel.py +59 -0
  42. fiberhmm-2.0.0/tests/test_model_io.py +174 -0
  43. fiberhmm-2.0.0/tests/test_package_consistency.py +152 -0
  44. fiberhmm-2.0.0/train_model.py +6 -0
fiberhmm-2.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024-2026 FiberHMM Authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,328 @@
1
+ Metadata-Version: 2.4
2
+ Name: fiberhmm
3
+ Version: 2.0.0
4
+ Summary: Hidden Markov Model for calling chromatin footprints from fiber-seq and DAF-seq data
5
+ Author: FiberHMM Authors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/fiberseq/FiberHMM
8
+ Project-URL: Documentation, https://github.com/fiberseq/FiberHMM#readme
9
+ Project-URL: Repository, https://github.com/fiberseq/FiberHMM
10
+ Project-URL: Issues, https://github.com/fiberseq/FiberHMM/issues
11
+ Keywords: bioinformatics,chromatin,fiber-seq,DAF-seq,hidden-markov-model,nucleosome,footprinting,epigenetics,single-molecule
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: MacOS
17
+ Classifier: Operating System :: POSIX :: Linux
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: numpy>=1.20
28
+ Requires-Dist: scipy>=1.7
29
+ Requires-Dist: pandas>=1.3
30
+ Requires-Dist: pysam>=0.19
31
+ Requires-Dist: tqdm>=4.60
32
+ Provides-Extra: numba
33
+ Requires-Dist: numba>=0.55; extra == "numba"
34
+ Provides-Extra: plots
35
+ Requires-Dist: matplotlib>=3.5; extra == "plots"
36
+ Provides-Extra: all
37
+ Requires-Dist: numba>=0.55; extra == "all"
38
+ Requires-Dist: matplotlib>=3.5; extra == "all"
39
+ Provides-Extra: dev
40
+ Requires-Dist: pytest>=7.0; extra == "dev"
41
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
42
+ Requires-Dist: black>=23.0; extra == "dev"
43
+ Requires-Dist: ruff>=0.1; extra == "dev"
44
+ Dynamic: license-file
45
+
46
+ # FiberHMM
47
+
48
+ Hidden Markov Model toolkit for calling chromatin footprints from fiber-seq and DAF-seq single-molecule data.
49
+
50
+ FiberHMM identifies nucleosome-protected regions (footprints) and accessible regions (methylase-sensitive patches, MSPs) from single-molecule DNA modification data, including m6A methylation (fiber-seq) and deamination marks (DAF-seq).
51
+
52
+ ## Key Features
53
+
54
+ - **No genome context files** -- sequence context computed directly from read sequences
55
+ - **Fibertools-compatible output** -- tagged BAM with `ns`/`nl`/`as`/`al` tags, ready for downstream tools
56
+ - **Native HMM implementation** -- no hmmlearn dependency; Numba JIT optional for ~10x speedup
57
+ - **Region-parallel processing** -- scales linearly with cores for large genomes
58
+ - **Multi-platform** -- supports PacBio fiber-seq, Nanopore fiber-seq, and DAF-seq
59
+ - **Legacy model support** -- loads old hmmlearn-trained pickle/NPZ models seamlessly
60
+
61
+ ## Installation
62
+
63
+ ### Using pip
64
+
65
+ ```bash
66
+ pip install fiberhmm
67
+ ```
68
+
69
+ ### From source
70
+
71
+ ```bash
72
+ git clone https://github.com/fiberseq/FiberHMM.git
73
+ cd FiberHMM
74
+ pip install -e .
75
+ ```
76
+
77
+ ### Optional dependencies
78
+
79
+ ```bash
80
+ pip install numba # ~10x faster HMM computation
81
+ pip install matplotlib # --stats visualization
82
+ ```
83
+
84
+ For bigBed output, install [`bedToBigBed`](https://hgdownload.soe.ucsc.edu/admin/exe/) from UCSC tools.
85
+
86
+ ## Quick Start
87
+
88
+ ### 1. Generate emission probabilities
89
+
90
+ Requires accessible (naked DNA) and inaccessible (native chromatin) control BAMs:
91
+
92
+ ```bash
93
+ python generate_probs.py \
94
+ -a accessible_control.bam \
95
+ -u inaccessible_control.bam \
96
+ -o probs/ \
97
+ --stats
98
+ ```
99
+
100
+ ### 2. Train HMM
101
+
102
+ ```bash
103
+ python train_model.py \
104
+ -i sample.bam \
105
+ -p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
106
+ -o models/ \
107
+ --stats
108
+ ```
109
+
110
+ ### 3. Call footprints
111
+
112
+ ```bash
113
+ python apply_model.py \
114
+ -i experiment.bam \
115
+ -m models/best-model.json \
116
+ -o output/ \
117
+ -c 8 \
118
+ --scores
119
+ ```
120
+
121
+ ### 4. Extract to BED12/bigBed
122
+
123
+ ```bash
124
+ python extract_tags.py -i output/experiment_footprints.bam
125
+ ```
126
+
127
+ ## Pre-trained Models
128
+
129
+ FiberHMM ships with pre-trained models in `models/` ready for immediate use:
130
+
131
+ | Model | File | Enzyme | Platform | Mode |
132
+ |-------|------|--------|----------|------|
133
+ | **Hia5 PacBio** | `hia5_pacbio.json` | Hia5 (m6A) | PacBio | `pacbio-fiber` |
134
+ | **Hia5 Nanopore** | `hia5_nanopore.json` | Hia5 (m6A) | Nanopore | `nanopore-fiber` |
135
+ | **DddA PacBio** | `ddda_pacbio.json` | DddA (deamination) | PacBio | `daf` |
136
+ | **DddB Nanopore** | `dddb_nanopore.json` | DddB (deamination) | Nanopore | `daf` |
137
+
138
+ ```bash
139
+ # Example: call footprints with a pre-trained model
140
+ python apply_model.py -i experiment.bam -m models/hia5_pacbio.json -o output/ -c 8
141
+ ```
142
+
143
+ ## Analysis Modes
144
+
145
+ | Mode | Flag | Description | Target bases |
146
+ |------|------|-------------|--------------|
147
+ | **PacBio fiber-seq** | `--mode pacbio-fiber` | Default. m6A at A and T (both strands) | A, T (with RC) |
148
+ | **Nanopore fiber-seq** | `--mode nanopore-fiber` | m6A at A only (single strand) | A only |
149
+ | **DAF-seq** | `--mode daf` | Deamination at C/G (strand-specific) | C or G |
150
+
151
+ All scripts accept `--mode`; context size `-k` (3-10) determines the context table size.
152
+
153
+ **Note on mode selection:** The `pacbio-fiber` vs `nanopore-fiber` distinction only matters for Hia5 (m6A), where PacBio detects modifications on both strands while Nanopore detects only one. For deaminase-based methods (DddA, DddB), `--mode daf` is always used regardless of sequencing platform -- the chemistry is inherently strand-specific. Accuracy may differ between platforms, but the mode is the same.
154
+
155
+ ## Output
156
+
157
+ ### BAM Tags
158
+
159
+ `apply_model.py` adds footprint tags compatible with the fibertools ecosystem:
160
+
161
+ | Tag | Type | Description |
162
+ |-----|------|-------------|
163
+ | `ns` | B,I | Nucleosome/footprint starts (0-based query coords) |
164
+ | `nl` | B,I | Nucleosome/footprint lengths |
165
+ | `as` | B,I | Accessible/MSP starts |
166
+ | `al` | B,I | Accessible/MSP lengths |
167
+ | `nq` | B,C | Footprint quality scores (0-255, with `--scores`) |
168
+ | `aq` | B,C | MSP quality scores (0-255, with `--scores`) |
169
+
170
+ ### BED12/bigBed Extraction
171
+
172
+ Use `extract_tags.py` to extract features from tagged BAMs for browser visualization:
173
+
174
+ ```bash
175
+ # Extract all feature types to bigBed
176
+ python extract_tags.py -i output/sample_footprints.bam
177
+
178
+ # Extract only footprints
179
+ python extract_tags.py -i output/sample_footprints.bam --footprint
180
+
181
+ # Keep BED files alongside bigBed
182
+ python extract_tags.py -i output/sample_footprints.bam --keep-bed
183
+ ```
184
+
185
+ ## Scripts Reference
186
+
187
+ ### generate_probs.py / `fiberhmm-probs`
188
+
189
+ Generate emission probability tables from accessible and inaccessible control BAMs.
190
+
191
+ ```bash
192
+ python generate_probs.py \
193
+ -a accessible_control.bam \
194
+ -u inaccessible_control.bam \
195
+ -o probs/ \
196
+ --mode pacbio-fiber \
197
+ -k 3 4 5 6 \
198
+ --stats
199
+ ```
200
+
201
+ ### train_model.py / `fiberhmm-train`
202
+
203
+ Train the HMM on BAM data using precomputed emission probabilities.
204
+
205
+ ```bash
206
+ python train_model.py \
207
+ -i sample.bam \
208
+ -p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
209
+ -o models/ \
210
+ -k 3 \
211
+ --stats
212
+ ```
213
+
214
+ Output:
215
+ ```
216
+ models/
217
+ ├── best-model.json # Primary format (recommended)
218
+ ├── best-model.npz # NumPy format
219
+ ├── all_models.json # All training iterations
220
+ ├── training-reads.tsv # Read IDs used for training
221
+ ├── config.json # Training parameters
222
+ └── plots/ # (with --stats)
223
+ ```
224
+
225
+ ### apply_model.py / `fiberhmm-apply`
226
+
227
+ Apply a trained model to call footprints. Supports region-parallel processing.
228
+
229
+ ```bash
230
+ python apply_model.py \
231
+ -i experiment.bam \
232
+ -m models/best-model.json \
233
+ -o output/ \
234
+ -c 8 \
235
+ --scores
236
+ ```
237
+
238
+ Key options:
239
+
240
+ | Flag | Default | Description |
241
+ |------|---------|-------------|
242
+ | `-c/--cores` | 1 | CPU cores (0 = auto-detect) |
243
+ | `--region-size` | 10000000 | Region size for parallel chunks |
244
+ | `--skip-scaffolds` | false | Skip scaffold/contig chromosomes |
245
+ | `--chroms` | all | Process only these chromosomes |
246
+ | `--scores` | false | Compute per-footprint confidence scores |
247
+ | `-q/--min-mapq` | 20 | Min mapping quality |
248
+ | `-e/--edge-trim` | 10 | Edge masking (bp) |
249
+
250
+ ### extract_tags.py / `fiberhmm-extract`
251
+
252
+ Extract footprint/MSP/m6A/m5C features from tagged BAMs to BED12/bigBed.
253
+
254
+ ```bash
255
+ python extract_tags.py -i output/sample_footprints.bam -o output/ -c 8
256
+ ```
257
+
258
+ ### fiberhmm-utils
259
+
260
+ Consolidated utility for model and probability management. Four subcommands:
261
+
262
+ **convert** -- Convert legacy pickle/NPZ models to JSON:
263
+ ```bash
264
+ fiberhmm-utils convert old_model.pickle new_model.json
265
+ ```
266
+
267
+ **inspect** -- Print model metadata, parameters, and emission statistics:
268
+ ```bash
269
+ fiberhmm-utils inspect model.json
270
+ fiberhmm-utils inspect model.json --full # full emission table
271
+ ```
272
+
273
+ **transfer** -- Transfer emission probabilities between modalities (e.g., fiber-seq to DAF-seq) using accessibility priors from a matched cell type:
274
+ ```bash
275
+ fiberhmm-utils transfer \
276
+ --target daf_sample.bam \
277
+ --reference-bam fiberseq_footprints.bam \
278
+ -o daf_probs/ \
279
+ --mode daf \
280
+ --stats
281
+ ```
282
+
283
+ **adjust** -- Scale emission probabilities in a model (clamped to [0, 1]):
284
+ ```bash
285
+ fiberhmm-utils adjust model.json --state accessible --scale 1.1 -o adjusted.json
286
+ ```
287
+
288
+ ## Fibertools Integration
289
+
290
+ FiberHMM produces BAM output with the same tag conventions used by [fibertools](https://github.com/fiberseq/fibertools-rs):
291
+
292
+ - `ns`/`nl` for nucleosome footprints
293
+ - `as`/`al` for methylase-sensitive patches (MSPs)
294
+ - `nq`/`aq` for quality scores
295
+
296
+ This means FiberHMM output can be used directly with any tool in the fibertools ecosystem, including `ft extract`, downstream analysis pipelines, and genome browsers that support fibertools-style BAM tags.
297
+
298
+ ## FiberBrowser
299
+
300
+ FiberBrowser, a dedicated genome browser for single-molecule chromatin data, is coming soon.
301
+
302
+ ## Performance Tips
303
+
304
+ 1. **Use multiple cores**: `-c 8` (or more) for parallel processing
305
+ 2. **Adjust region size for small genomes**:
306
+ - Yeast (~12 MB): `--region-size 500000`
307
+ - Drosophila (~140 MB): `--region-size 2000000`
308
+ - Human/mammalian: default 10 MB is fine
309
+ 3. **Skip scaffolds**: `--skip-scaffolds` to avoid thousands of small contigs
310
+ 4. **Install numba**: `pip install numba` for ~10x faster HMM training
311
+
312
+ ## Model Formats
313
+
314
+ | Format | Extension | Description |
315
+ |--------|-----------|-------------|
316
+ | **JSON** | `.json` | Primary format -- portable, human-readable |
317
+ | NPZ | `.npz` | NumPy archive -- supported for loading |
318
+ | Pickle | `.pickle` | Legacy format -- supported for loading |
319
+
320
+ New models are always saved in JSON. Convert legacy models with:
321
+
322
+ ```bash
323
+ fiberhmm-utils convert old_model.pickle new_model.json
324
+ ```
325
+
326
+ ## License
327
+
328
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,283 @@
1
+ # FiberHMM
2
+
3
+ Hidden Markov Model toolkit for calling chromatin footprints from fiber-seq and DAF-seq single-molecule data.
4
+
5
+ FiberHMM identifies nucleosome-protected regions (footprints) and accessible regions (methylase-sensitive patches, MSPs) from single-molecule DNA modification data, including m6A methylation (fiber-seq) and deamination marks (DAF-seq).
6
+
7
+ ## Key Features
8
+
9
+ - **No genome context files** -- hexamer context computed directly from read sequences
10
+ - **Fibertools-compatible output** -- tagged BAM with `ns`/`nl`/`as`/`al` tags, ready for downstream tools
11
+ - **Native HMM implementation** -- no hmmlearn dependency; Numba JIT optional for ~10x speedup
12
+ - **Region-parallel processing** -- scales linearly with cores for large genomes
13
+ - **Multi-platform** -- supports PacBio fiber-seq, Nanopore fiber-seq, and DAF-seq
14
+ - **Legacy model support** -- loads old hmmlearn-trained pickle/NPZ models seamlessly
15
+
16
+ ## Installation
17
+
18
+ ### Using pip
19
+
20
+ ```bash
21
+ pip install fiberhmm
22
+ ```
23
+
24
+ ### From source
25
+
26
+ ```bash
27
+ git clone https://github.com/fiberseq/FiberHMM.git
28
+ cd FiberHMM
29
+ pip install -e .
30
+ ```
31
+
32
+ ### Optional dependencies
33
+
34
+ ```bash
35
+ pip install numba # ~10x faster HMM computation
36
+ pip install matplotlib # --stats visualization
37
+ ```
38
+
39
+ For bigBed output, install [`bedToBigBed`](https://hgdownload.soe.ucsc.edu/admin/exe/) from UCSC tools.
40
+
41
+ ## Quick Start
42
+
43
+ ### 1. Generate emission probabilities
44
+
45
+ Requires accessible (naked DNA) and inaccessible (native chromatin) control BAMs:
46
+
47
+ ```bash
48
+ python generate_probs.py \
49
+ -a accessible_control.bam \
50
+ -u inaccessible_control.bam \
51
+ -o probs/ \
52
+ --stats
53
+ ```
54
+
55
+ ### 2. Train HMM
56
+
57
+ ```bash
58
+ python train_model.py \
59
+ -i sample.bam \
60
+ -p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
61
+ -o models/ \
62
+ --stats
63
+ ```
64
+
65
+ ### 3. Call footprints
66
+
67
+ ```bash
68
+ python apply_model.py \
69
+ -i experiment.bam \
70
+ -m models/best-model.json \
71
+ -o output/ \
72
+ -c 8 \
73
+ --scores
74
+ ```
75
+
76
+ ### 4. Extract to BED12/bigBed
77
+
78
+ ```bash
79
+ python extract_tags.py -i output/experiment_footprints.bam
80
+ ```
81
+
82
+ ## Pre-trained Models
83
+
84
+ FiberHMM ships with pre-trained models in `models/` ready for immediate use:
85
+
86
+ | Model | File | Enzyme | Platform | Mode |
87
+ |-------|------|--------|----------|------|
88
+ | **Hia5 PacBio** | `hia5_pacbio.json` | Hia5 (m6A) | PacBio | `pacbio-fiber` |
89
+ | **Hia5 Nanopore** | `hia5_nanopore.json` | Hia5 (m6A) | Nanopore | `nanopore-fiber` |
90
+ | **DddA PacBio** | `ddda_pacbio.json` | DddA (deamination) | PacBio | `daf` |
91
+ | **DddB Nanopore** | `dddb_nanopore.json` | DddB (deamination) | Nanopore | `daf` |
92
+
93
+ ```bash
94
+ # Example: call footprints with a pre-trained model
95
+ python apply_model.py -i experiment.bam -m models/hia5_pacbio.json -o output/ -c 8
96
+ ```
97
+
98
+ ## Analysis Modes
99
+
100
+ | Mode | Flag | Description | Target bases |
101
+ |------|------|-------------|--------------|
102
+ | **PacBio fiber-seq** | `--mode pacbio-fiber` | Default. m6A at A and T (both strands) | A, T (with RC) |
103
+ | **Nanopore fiber-seq** | `--mode nanopore-fiber` | m6A at A only (single strand) | A only |
104
+ | **DAF-seq** | `--mode daf` | Deamination at C/G (strand-specific) | C or G |
105
+
106
+ All scripts accept `--mode`; context size `-k` (3-10) determines the hexamer table size.
107
+
108
+ **Note on mode selection:** The `pacbio-fiber` vs `nanopore-fiber` distinction only matters for Hia5 (m6A), where PacBio detects modifications on both strands while Nanopore detects only one. For deaminase-based methods (DddA, DddB), `--mode daf` is always used regardless of sequencing platform -- the chemistry is inherently strand-specific. Accuracy may differ between platforms, but the mode is the same.
109
+
110
+ ## Output
111
+
112
+ ### BAM Tags
113
+
114
+ `apply_model.py` adds footprint tags compatible with the fibertools ecosystem:
115
+
116
+ | Tag | Type | Description |
117
+ |-----|------|-------------|
118
+ | `ns` | B,I | Nucleosome/footprint starts (0-based query coords) |
119
+ | `nl` | B,I | Nucleosome/footprint lengths |
120
+ | `as` | B,I | Accessible/MSP starts |
121
+ | `al` | B,I | Accessible/MSP lengths |
122
+ | `nq` | B,C | Footprint quality scores (0-255, with `--scores`) |
123
+ | `aq` | B,C | MSP quality scores (0-255, with `--scores`) |
124
+
125
+ ### BED12/bigBed Extraction
126
+
127
+ Use `extract_tags.py` to extract features from tagged BAMs for browser visualization:
128
+
129
+ ```bash
130
+ # Extract all feature types to bigBed
131
+ python extract_tags.py -i output/sample_footprints.bam
132
+
133
+ # Extract only footprints
134
+ python extract_tags.py -i output/sample_footprints.bam --footprint
135
+
136
+ # Keep BED files alongside bigBed
137
+ python extract_tags.py -i output/sample_footprints.bam --keep-bed
138
+ ```
139
+
140
+ ## Scripts Reference
141
+
142
+ ### generate_probs.py / `fiberhmm-probs`
143
+
144
+ Generate emission probability tables from accessible and inaccessible control BAMs.
145
+
146
+ ```bash
147
+ python generate_probs.py \
148
+ -a accessible_control.bam \
149
+ -u inaccessible_control.bam \
150
+ -o probs/ \
151
+ --mode pacbio-fiber \
152
+ -k 3 4 5 6 \
153
+ --stats
154
+ ```
155
+
156
+ ### train_model.py / `fiberhmm-train`
157
+
158
+ Train the HMM on BAM data using precomputed emission probabilities.
159
+
160
+ ```bash
161
+ python train_model.py \
162
+ -i sample.bam \
163
+ -p probs/tables/accessible_A_k3.tsv probs/tables/inaccessible_A_k3.tsv \
164
+ -o models/ \
165
+ -k 3 \
166
+ --stats
167
+ ```
168
+
169
+ Output:
170
+ ```
171
+ models/
172
+ ├── best-model.json # Primary format (recommended)
173
+ ├── best-model.npz # NumPy format
174
+ ├── all_models.json # All training iterations
175
+ ├── training-reads.tsv # Read IDs used for training
176
+ ├── config.json # Training parameters
177
+ └── plots/ # (with --stats)
178
+ ```
179
+
180
+ ### apply_model.py / `fiberhmm-apply`
181
+
182
+ Apply a trained model to call footprints. Supports region-parallel processing.
183
+
184
+ ```bash
185
+ python apply_model.py \
186
+ -i experiment.bam \
187
+ -m models/best-model.json \
188
+ -o output/ \
189
+ -c 8 \
190
+ --scores
191
+ ```
192
+
193
+ Key options:
194
+
195
+ | Flag | Default | Description |
196
+ |------|---------|-------------|
197
+ | `-c/--cores` | 1 | CPU cores (0 = auto-detect) |
198
+ | `--region-size` | 10000000 | Region size for parallel chunks |
199
+ | `--skip-scaffolds` | false | Skip scaffold/contig chromosomes |
200
+ | `--chroms` | all | Process only these chromosomes |
201
+ | `--scores` | false | Compute per-footprint confidence scores |
202
+ | `-q/--min-mapq` | 20 | Min mapping quality |
203
+ | `-e/--edge-trim` | 10 | Edge masking (bp) |
204
+
205
+ ### extract_tags.py / `fiberhmm-extract`
206
+
207
+ Extract footprint/MSP/m6A/m5C features from tagged BAMs to BED12/bigBed.
208
+
209
+ ```bash
210
+ python extract_tags.py -i output/sample_footprints.bam -o output/ -c 8
211
+ ```
212
+
213
+ ### fiberhmm-utils
214
+
215
+ Consolidated utility for model and probability management. Four subcommands:
216
+
217
+ **convert** -- Convert legacy pickle/NPZ models to JSON:
218
+ ```bash
219
+ fiberhmm-utils convert old_model.pickle new_model.json
220
+ ```
221
+
222
+ **inspect** -- Print model metadata, parameters, and emission statistics:
223
+ ```bash
224
+ fiberhmm-utils inspect model.json
225
+ fiberhmm-utils inspect model.json --full # full emission table
226
+ ```
227
+
228
+ **transfer** -- Transfer emission probabilities between modalities (e.g., fiber-seq to DAF-seq) using accessibility priors from a matched cell type:
229
+ ```bash
230
+ fiberhmm-utils transfer \
231
+ --target daf_sample.bam \
232
+ --reference-bam fiberseq_footprints.bam \
233
+ -o daf_probs/ \
234
+ --mode daf \
235
+ --stats
236
+ ```
237
+
238
+ **adjust** -- Scale emission probabilities in a model (clamped to [0, 1]):
239
+ ```bash
240
+ fiberhmm-utils adjust model.json --state accessible --scale 1.1 -o adjusted.json
241
+ ```
242
+
243
+ ## Fibertools Integration
244
+
245
+ FiberHMM produces BAM output with the same tag conventions used by [fibertools](https://github.com/fiberseq/fibertools-rs):
246
+
247
+ - `ns`/`nl` for nucleosome footprints
248
+ - `as`/`al` for methylase-sensitive patches (MSPs)
249
+ - `nq`/`aq` for quality scores
250
+
251
+ This means FiberHMM output can be used directly with any tool in the fibertools ecosystem, including `ft extract`, downstream analysis pipelines, and genome browsers that support fibertools-style BAM tags.
252
+
253
+ ## FiberBrowser
254
+
255
+ FiberBrowser, a dedicated genome browser for single-molecule chromatin data, is coming soon.
256
+
257
+ ## Performance Tips
258
+
259
+ 1. **Use multiple cores**: `-c 8` (or more) for parallel processing
260
+ 2. **Adjust region size for small genomes**:
261
+ - Yeast (~12 MB): `--region-size 500000`
262
+ - Drosophila (~140 MB): `--region-size 2000000`
263
+ - Human/mammalian: default 10 MB is fine
264
+ 3. **Skip scaffolds**: `--skip-scaffolds` to avoid thousands of small contigs
265
+ 4. **Install numba**: `pip install numba` for ~10x faster HMM training
266
+
267
+ ## Model Formats
268
+
269
+ | Format | Extension | Description |
270
+ |--------|-----------|-------------|
271
+ | **JSON** | `.json` | Primary format -- portable, human-readable |
272
+ | NPZ | `.npz` | NumPy archive -- supported for loading |
273
+ | Pickle | `.pickle` | Legacy format -- supported for loading |
274
+
275
+ New models are always saved in JSON. Convert legacy models with:
276
+
277
+ ```bash
278
+ fiberhmm-utils convert old_model.pickle new_model.json
279
+ ```
280
+
281
+ ## License
282
+
283
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python3
2
+ """Backward-compatible wrapper. See fiberhmm/cli/apply.py"""
3
+ from fiberhmm.cli.apply import main
4
+
5
+ if __name__ == '__main__':
6
+ main()
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python3
2
+ """Backward-compatible wrapper. See fiberhmm/cli/extract_tags.py"""
3
+ from fiberhmm.cli.extract_tags import main
4
+
5
+ if __name__ == '__main__':
6
+ main()
@@ -0,0 +1,10 @@
1
+ """
2
+ FiberHMM - Hidden Markov Model toolkit for chromatin footprint calling
3
+ from fiber-seq and DAF-seq single-molecule data.
4
+ """
5
+
6
+ __version__ = "2.0.0"
7
+
8
+ from fiberhmm.core.hmm import FiberHMM
9
+ from fiberhmm.core.bam_reader import ContextEncoder, FiberRead, read_bam
10
+ from fiberhmm.core.model_io import load_model, save_model, load_model_with_metadata
@@ -0,0 +1,2 @@
1
+ """Command-line interface entry points."""
2
+ # CLI modules are populated as scripts are moved into the package