bam2tensor 2.2__tar.gz → 2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/constraints.txt +1 -1
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/docs.yml +2 -2
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/labeler.yml +1 -1
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/release.yml +1 -1
- {bam2tensor-2.2 → bam2tensor-2.4}/CLAUDE.md +12 -3
- {bam2tensor-2.2 → bam2tensor-2.4}/PKG-INFO +105 -10
- {bam2tensor-2.2 → bam2tensor-2.4}/README.md +104 -9
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/reference.md +8 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/pyproject.toml +2 -1
- {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/__init__.py +3 -3
- {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/__main__.py +23 -5
- {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/functions.py +46 -18
- bam2tensor-2.4/src/bam2tensor/inspect.py +156 -0
- bam2tensor-2.4/src/bam2tensor/metadata.py +162 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_duplication.py +5 -3
- {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_functions.py +233 -79
- bam2tensor-2.4/tests/test_inspect.py +186 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_main.py +33 -0
- bam2tensor-2.4/tests/test_metadata.py +224 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/uv.lock +104 -108
- {bam2tensor-2.2 → bam2tensor-2.4}/.darglint +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.editorconfig +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.gitattributes +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/dependabot.yml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/labels.yml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.gitignore +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/LICENSE +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/SECURITY.md +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/Makefile +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/conf.py +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/contributing.md +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/index.md +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/license.md +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/make.bat +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/noxfile.py +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/embedding.py +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/tests/__init__.py +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_embedding.py +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_reference.py +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
nox==2026.2.9
|
|
2
|
-
uv==0.
|
|
2
|
+
uv==0.11.2
|
|
@@ -76,7 +76,7 @@ jobs:
|
|
|
76
76
|
repository-url: https://test.pypi.org/legacy/
|
|
77
77
|
|
|
78
78
|
- name: Publish the release notes
|
|
79
|
-
uses: release-drafter/release-drafter@
|
|
79
|
+
uses: release-drafter/release-drafter@v7.1.1
|
|
80
80
|
with:
|
|
81
81
|
publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
|
|
82
82
|
tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
|
|
@@ -40,10 +40,12 @@ uv run mypy src
|
|
|
40
40
|
|
|
41
41
|
```
|
|
42
42
|
src/bam2tensor/
|
|
43
|
-
__init__.py # Package version (2.
|
|
44
|
-
__main__.py # Click CLI entry point
|
|
43
|
+
__init__.py # Package version (2.4)
|
|
44
|
+
__main__.py # Click CLI entry point (bam2tensor command)
|
|
45
|
+
inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
|
|
45
46
|
embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
|
|
46
47
|
functions.py # Core extraction: extract_methylation_data_from_bam()
|
|
48
|
+
metadata.py # .npz metadata read/write (provenance info in output files)
|
|
47
49
|
reference.py # Reference genome download and caching utilities
|
|
48
50
|
|
|
49
51
|
tests/
|
|
@@ -51,6 +53,8 @@ tests/
|
|
|
51
53
|
test_functions.py # Core function tests
|
|
52
54
|
test_embedding.py # Embedding class tests
|
|
53
55
|
test_duplication.py # Read duplication bug tests
|
|
56
|
+
test_inspect.py # Inspect CLI tests
|
|
57
|
+
test_metadata.py # Metadata read/write/round-trip tests
|
|
54
58
|
test_reference.py # Reference download/caching tests
|
|
55
59
|
test.bam, test.bam.bai, test_fasta.fa # Test fixtures
|
|
56
60
|
```
|
|
@@ -110,8 +114,11 @@ xdoctest validates code examples in docstrings. Important rules:
|
|
|
110
114
|
### Data Structure
|
|
111
115
|
- Output: scipy sparse COO matrix saved as .npz
|
|
112
116
|
- Rows = unique reads (primary alignments)
|
|
113
|
-
- Columns = CpG sites
|
|
117
|
+
- Columns = CpG sites (ordered by genomic position, determined by reference genome)
|
|
114
118
|
- Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
|
|
119
|
+
- Each .npz file contains a `metadata.json` entry with provenance info (genome name, version, CpG index CRC32, expected chromosomes). Read via `bam2tensor.metadata.read_npz_metadata()`.
|
|
120
|
+
- Each .npz file contains a `tlen.npy` entry with per-read signed template length (BAM TLEN field) as int32. Read via `bam2tensor.metadata.read_npz_tlen()`. Returns `None` for files from older versions.
|
|
121
|
+
- `extract_methylation_data_from_bam()` returns an `ExtractionResult` NamedTuple with `.matrix` (sparse COO) and `.tlen` (numpy int32 array).
|
|
115
122
|
|
|
116
123
|
### Methylation Strand Detection
|
|
117
124
|
- Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
|
|
@@ -144,6 +151,8 @@ xdoctest validates code examples in docstrings. Important rules:
|
|
|
144
151
|
uv run bam2tensor --input-path input.bam --reference-fasta ref.fa
|
|
145
152
|
# Or with auto-download:
|
|
146
153
|
uv run bam2tensor --input-path input.bam --download-reference hg38
|
|
154
|
+
# Inspect an output file:
|
|
155
|
+
uv run bam2tensor-inspect output.methylation.npz
|
|
147
156
|
```
|
|
148
157
|
|
|
149
158
|
### Reference Genome Downloads
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bam2tensor
|
|
3
|
-
Version: 2.2
|
|
3
|
+
Version: 2.4
|
|
4
4
|
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
5
|
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
6
|
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
@@ -72,7 +72,10 @@ Description-Content-Type: text/markdown
|
|
|
72
72
|
- [Custom Output Directory](#custom-output-directory)
|
|
73
73
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
74
74
|
- [Command-Line Options](#command-line-options)
|
|
75
|
+
- [Inspecting Output Files](#inspecting-output-files)
|
|
75
76
|
- [Output Data Structure](#output-data-structure)
|
|
77
|
+
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
78
|
+
- [Embedded Metadata](#embedded-metadata)
|
|
76
79
|
- [Loading Output Files](#loading-output-files)
|
|
77
80
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
78
81
|
- [Working with Genomic Coordinates](#working-with-genomic-coordinates)
|
|
@@ -95,6 +98,7 @@ Description-Content-Type: text/markdown
|
|
|
95
98
|
- **Batch Processing**: Process multiple BAM files with directory recursion
|
|
96
99
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
97
100
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
101
|
+
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
98
102
|
|
|
99
103
|
## Requirements
|
|
100
104
|
|
|
@@ -285,24 +289,103 @@ Options:
|
|
|
285
289
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
286
290
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
287
291
|
|
|
292
|
+
## Inspecting Output Files
|
|
293
|
+
|
|
294
|
+
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
$ bam2tensor-inspect sample.methylation.npz
|
|
298
|
+
sample.methylation.npz
|
|
299
|
+
Genome: hg38
|
|
300
|
+
Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
|
|
301
|
+
Reads: 1,423,891
|
|
302
|
+
CpG sites: 28,217,448
|
|
303
|
+
Data points: 12,847,322 (sparsity: 99.97%)
|
|
304
|
+
Fragment len: median 167, mean 182, range [50, 600]
|
|
305
|
+
CpG index CRC32: a1b2c3d4
|
|
306
|
+
bam2tensor: v2.4
|
|
307
|
+
File size: 14.2 MB
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
You can pass multiple files at once:
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
$ bam2tensor-inspect *.methylation.npz
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
|
|
317
|
+
|
|
288
318
|
## Output Data Structure
|
|
289
319
|
|
|
290
|
-
bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
|
|
320
|
+
bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
|
|
291
321
|
|
|
292
322
|
| Dimension | Represents |
|
|
293
323
|
|-----------|------------|
|
|
294
|
-
| Rows | Unique reads (primary alignments that pass quality filters) |
|
|
295
|
-
| Columns | CpG sites
|
|
324
|
+
| **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
|
|
325
|
+
| **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
|
|
326
|
+
|
|
327
|
+
The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
|
|
296
328
|
|
|
297
329
|
### Methylation State Values
|
|
298
330
|
|
|
299
331
|
| Value | Meaning |
|
|
300
332
|
|-------|---------|
|
|
301
|
-
| `1` | Methylated (cytosine preserved as C) |
|
|
302
|
-
| `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
|
|
303
|
-
| `-1` | No data (indel, SNV, or
|
|
333
|
+
| `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
|
|
334
|
+
| `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
|
|
335
|
+
| `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
|
|
336
|
+
|
|
337
|
+
Note: The matrix uses SciPy's COO sparse format, which stores every entry that is explicitly written to it, including explicit zeros. Unmethylated sites (value `0`) **are** therefore stored as explicit entries (and are counted by `.nnz`). Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
338
|
+
|
|
339
|
+
### Per-Read Fragment Length (TLEN)
|
|
340
|
+
|
|
341
|
+
Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
|
|
342
|
+
|
|
343
|
+
- One `int32` value per read (row), in the same order as the sparse matrix rows
|
|
344
|
+
- Signed: positive for the leftmost read in a pair, negative for the rightmost
|
|
345
|
+
- Zero for single-end reads or reads with unmapped mates
|
|
346
|
+
- Use `abs(tlen)` to get fragment lengths
|
|
304
347
|
|
|
305
|
-
|
|
348
|
+
```python
|
|
349
|
+
from bam2tensor.metadata import read_npz_tlen
|
|
350
|
+
import numpy as np
|
|
351
|
+
|
|
352
|
+
tlen = read_npz_tlen("sample.methylation.npz")
|
|
353
|
+
if tlen is not None:
|
|
354
|
+
frag_lengths = np.abs(tlen)
|
|
355
|
+
nonzero = frag_lengths[frag_lengths > 0]
|
|
356
|
+
print(f"Median fragment length: {np.median(nonzero):.0f}")
|
|
357
|
+
print(f"Mean fragment length: {np.mean(nonzero):.0f}")
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### Embedded Metadata
|
|
361
|
+
|
|
362
|
+
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
363
|
+
|
|
364
|
+
| Field | Description |
|
|
365
|
+
|-------|-------------|
|
|
366
|
+
| `bam2tensor_version` | Version of bam2tensor that produced the file |
|
|
367
|
+
| `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
|
|
368
|
+
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
369
|
+
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
370
|
+
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
371
|
+
|
|
372
|
+
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
373
|
+
|
|
374
|
+
```python
|
|
375
|
+
from bam2tensor.metadata import read_npz_metadata
|
|
376
|
+
|
|
377
|
+
meta = read_npz_metadata("sample.methylation.npz")
|
|
378
|
+
if meta is not None:
|
|
379
|
+
print(f"Genome: {meta['genome_name']}")
|
|
380
|
+
print(f"CpG sites: {meta['total_cpg_sites']:,}")
|
|
381
|
+
print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files with the same CRC32 can be treated as having identical column semantics (same chromosomes, same CpG positions, same order), so their matrices can be directly stacked or compared; because CRC32 is a 32-bit checksum, a match is strong evidence of an identical mapping rather than a strict guarantee. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives:
|
|
385
|
+
|
|
386
|
+
```bash
|
|
387
|
+
unzip -p sample.methylation.npz metadata.json | python -m json.tool
|
|
388
|
+
```
|
|
306
389
|
|
|
307
390
|
### Loading Output Files
|
|
308
391
|
|
|
@@ -489,10 +572,22 @@ extract_methylation_data_from_bam(
|
|
|
489
572
|
quality_limit: int = 20, # Minimum MAPQ
|
|
490
573
|
verbose: bool = False, # Enable verbose output
|
|
491
574
|
debug: bool = False # Enable debug output
|
|
492
|
-
) ->
|
|
575
|
+
) -> ExtractionResult
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
**Returns:** An `ExtractionResult` named tuple with two fields:
|
|
579
|
+
- `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
|
|
580
|
+
- `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
|
|
581
|
+
|
|
582
|
+
### `bam2tensor.metadata.read_npz_tlen`
|
|
583
|
+
|
|
584
|
+
Read per-read template lengths from a `.methylation.npz` file.
|
|
585
|
+
|
|
586
|
+
```python
|
|
587
|
+
read_npz_tlen(npz_path: str) -> np.ndarray | None
|
|
493
588
|
```
|
|
494
589
|
|
|
495
|
-
**Returns:**
|
|
590
|
+
**Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
|
|
496
591
|
|
|
497
592
|
## Contributing
|
|
498
593
|
|
|
@@ -39,7 +39,10 @@
|
|
|
39
39
|
- [Custom Output Directory](#custom-output-directory)
|
|
40
40
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
41
41
|
- [Command-Line Options](#command-line-options)
|
|
42
|
+
- [Inspecting Output Files](#inspecting-output-files)
|
|
42
43
|
- [Output Data Structure](#output-data-structure)
|
|
44
|
+
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
45
|
+
- [Embedded Metadata](#embedded-metadata)
|
|
43
46
|
- [Loading Output Files](#loading-output-files)
|
|
44
47
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
45
48
|
- [Working with Genomic Coordinates](#working-with-genomic-coordinates)
|
|
@@ -62,6 +65,7 @@
|
|
|
62
65
|
- **Batch Processing**: Process multiple BAM files with directory recursion
|
|
63
66
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
64
67
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
68
|
+
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
65
69
|
|
|
66
70
|
## Requirements
|
|
67
71
|
|
|
@@ -252,24 +256,103 @@ Options:
|
|
|
252
256
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
253
257
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
254
258
|
|
|
259
|
+
## Inspecting Output Files
|
|
260
|
+
|
|
261
|
+
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
$ bam2tensor-inspect sample.methylation.npz
|
|
265
|
+
sample.methylation.npz
|
|
266
|
+
Genome: hg38
|
|
267
|
+
Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
|
|
268
|
+
Reads: 1,423,891
|
|
269
|
+
CpG sites: 28,217,448
|
|
270
|
+
Data points: 12,847,322 (sparsity: 99.97%)
|
|
271
|
+
Fragment len: median 167, mean 182, range [50, 600]
|
|
272
|
+
CpG index CRC32: a1b2c3d4
|
|
273
|
+
bam2tensor: v2.4
|
|
274
|
+
File size: 14.2 MB
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
You can pass multiple files at once:
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
$ bam2tensor-inspect *.methylation.npz
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
|
|
284
|
+
|
|
255
285
|
## Output Data Structure
|
|
256
286
|
|
|
257
|
-
bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
|
|
287
|
+
bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
|
|
258
288
|
|
|
259
289
|
| Dimension | Represents |
|
|
260
290
|
|-----------|------------|
|
|
261
|
-
| Rows | Unique reads (primary alignments that pass quality filters) |
|
|
262
|
-
| Columns | CpG sites
|
|
291
|
+
| **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
|
|
292
|
+
| **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
|
|
293
|
+
|
|
294
|
+
The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
|
|
263
295
|
|
|
264
296
|
### Methylation State Values
|
|
265
297
|
|
|
266
298
|
| Value | Meaning |
|
|
267
299
|
|-------|---------|
|
|
268
|
-
| `1` | Methylated (cytosine preserved as C) |
|
|
269
|
-
| `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
|
|
270
|
-
| `-1` | No data (indel, SNV, or
|
|
300
|
+
| `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
|
|
301
|
+
| `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
|
|
302
|
+
| `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
|
|
303
|
+
|
|
304
|
+
Note: The matrix uses SciPy's COO sparse format, which stores every entry that is explicitly written to it, including explicit zeros. Unmethylated sites (value `0`) **are** therefore stored as explicit entries (and are counted by `.nnz`). Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
305
|
+
|
|
306
|
+
### Per-Read Fragment Length (TLEN)
|
|
307
|
+
|
|
308
|
+
Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
|
|
309
|
+
|
|
310
|
+
- One `int32` value per read (row), in the same order as the sparse matrix rows
|
|
311
|
+
- Signed: positive for the leftmost read in a pair, negative for the rightmost
|
|
312
|
+
- Zero for single-end reads or reads with unmapped mates
|
|
313
|
+
- Use `abs(tlen)` to get fragment lengths
|
|
271
314
|
|
|
272
|
-
|
|
315
|
+
```python
|
|
316
|
+
from bam2tensor.metadata import read_npz_tlen
|
|
317
|
+
import numpy as np
|
|
318
|
+
|
|
319
|
+
tlen = read_npz_tlen("sample.methylation.npz")
|
|
320
|
+
if tlen is not None:
|
|
321
|
+
frag_lengths = np.abs(tlen)
|
|
322
|
+
nonzero = frag_lengths[frag_lengths > 0]
|
|
323
|
+
print(f"Median fragment length: {np.median(nonzero):.0f}")
|
|
324
|
+
print(f"Mean fragment length: {np.mean(nonzero):.0f}")
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### Embedded Metadata
|
|
328
|
+
|
|
329
|
+
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
330
|
+
|
|
331
|
+
| Field | Description |
|
|
332
|
+
|-------|-------------|
|
|
333
|
+
| `bam2tensor_version` | Version of bam2tensor that produced the file |
|
|
334
|
+
| `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
|
|
335
|
+
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
336
|
+
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
337
|
+
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
338
|
+
|
|
339
|
+
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
340
|
+
|
|
341
|
+
```python
|
|
342
|
+
from bam2tensor.metadata import read_npz_metadata
|
|
343
|
+
|
|
344
|
+
meta = read_npz_metadata("sample.methylation.npz")
|
|
345
|
+
if meta is not None:
|
|
346
|
+
print(f"Genome: {meta['genome_name']}")
|
|
347
|
+
print(f"CpG sites: {meta['total_cpg_sites']:,}")
|
|
348
|
+
print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files with the same CRC32 can be treated as having identical column semantics (same chromosomes, same CpG positions, same order), so their matrices can be directly stacked or compared; because CRC32 is a 32-bit checksum, a match is strong evidence of an identical mapping rather than a strict guarantee. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives:
|
|
352
|
+
|
|
353
|
+
```bash
|
|
354
|
+
unzip -p sample.methylation.npz metadata.json | python -m json.tool
|
|
355
|
+
```
|
|
273
356
|
|
|
274
357
|
### Loading Output Files
|
|
275
358
|
|
|
@@ -456,10 +539,22 @@ extract_methylation_data_from_bam(
|
|
|
456
539
|
quality_limit: int = 20, # Minimum MAPQ
|
|
457
540
|
verbose: bool = False, # Enable verbose output
|
|
458
541
|
debug: bool = False # Enable debug output
|
|
459
|
-
) ->
|
|
542
|
+
) -> ExtractionResult
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
**Returns:** An `ExtractionResult` named tuple with two fields:
|
|
546
|
+
- `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
|
|
547
|
+
- `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
|
|
548
|
+
|
|
549
|
+
### `bam2tensor.metadata.read_npz_tlen`
|
|
550
|
+
|
|
551
|
+
Read per-read template lengths from a `.methylation.npz` file.
|
|
552
|
+
|
|
553
|
+
```python
|
|
554
|
+
read_npz_tlen(npz_path: str) -> np.ndarray | None
|
|
460
555
|
```
|
|
461
556
|
|
|
462
|
-
**Returns:**
|
|
557
|
+
**Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
|
|
463
558
|
|
|
464
559
|
## Contributing
|
|
465
560
|
|
|
@@ -32,6 +32,14 @@ bam2tensor.functions module
|
|
|
32
32
|
:show-inheritance:
|
|
33
33
|
:undoc-members:
|
|
34
34
|
|
|
35
|
+
bam2tensor.metadata module
|
|
36
|
+
--------------------------
|
|
37
|
+
|
|
38
|
+
.. automodule:: bam2tensor.metadata
|
|
39
|
+
:members:
|
|
40
|
+
:show-inheritance:
|
|
41
|
+
:undoc-members:
|
|
42
|
+
|
|
35
43
|
bam2tensor.reference module
|
|
36
44
|
---------------------------
|
|
37
45
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.2"
|
|
3
|
+
version = "2.4"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
@@ -38,6 +38,7 @@ Changelog = "https://github.com/mcwdsi/bam2tensor/releases"
|
|
|
38
38
|
|
|
39
39
|
[project.scripts]
|
|
40
40
|
bam2tensor = "bam2tensor.__main__:main"
|
|
41
|
+
bam2tensor-inspect = "bam2tensor.inspect:main"
|
|
41
42
|
|
|
42
43
|
[dependency-groups]
|
|
43
44
|
dev = [
|
|
@@ -30,14 +30,14 @@ Example:
|
|
|
30
30
|
)
|
|
31
31
|
|
|
32
32
|
# Extract methylation data
|
|
33
|
-
|
|
33
|
+
result = extract_methylation_data_from_bam(
|
|
34
34
|
input_bam="/path/to/sample.bam",
|
|
35
35
|
genome_methylation_embedding=embedding,
|
|
36
36
|
)
|
|
37
37
|
|
|
38
38
|
# Save to file
|
|
39
39
|
import scipy.sparse
|
|
40
|
-
scipy.sparse.save_npz("output.npz",
|
|
40
|
+
scipy.sparse.save_npz("output.npz", result.matrix)
|
|
41
41
|
|
|
42
42
|
Output Format:
|
|
43
43
|
The output is a SciPy sparse COO matrix where:
|
|
@@ -50,4 +50,4 @@ See Also:
|
|
|
50
50
|
- https://mcwdsi.github.io/bam2tensor for full documentation
|
|
51
51
|
"""
|
|
52
52
|
|
|
53
|
-
__version__ = "2.2"
|
|
53
|
+
__version__ = "2.4"
|
|
@@ -38,6 +38,11 @@ from bam2tensor.functions import (
|
|
|
38
38
|
detect_aligner,
|
|
39
39
|
extract_methylation_data_from_bam,
|
|
40
40
|
)
|
|
41
|
+
from bam2tensor.metadata import (
|
|
42
|
+
compute_cpg_index_crc32,
|
|
43
|
+
write_npz_metadata,
|
|
44
|
+
write_npz_tlen,
|
|
45
|
+
)
|
|
41
46
|
from bam2tensor.reference import (
|
|
42
47
|
KNOWN_GENOMES,
|
|
43
48
|
download_reference as download_reference_fn,
|
|
@@ -393,10 +398,12 @@ def main(
|
|
|
393
398
|
verbose=verbose,
|
|
394
399
|
)
|
|
395
400
|
n_chroms = len(genome_methylation_embedding.cpg_sites_dict)
|
|
401
|
+
cpg_crc32 = compute_cpg_index_crc32(genome_methylation_embedding)
|
|
396
402
|
print(
|
|
397
403
|
f" Total CpG sites: {genome_methylation_embedding.total_cpg_sites:,}"
|
|
398
404
|
f" across {n_chroms} chromosome(s)"
|
|
399
405
|
)
|
|
406
|
+
print(f" CpG index CRC32: {cpg_crc32}")
|
|
400
407
|
print(f" Index loaded in {_format_elapsed(time.time() - time_start)}")
|
|
401
408
|
|
|
402
409
|
# ── Discover BAM files ──────────────────────────────────────────────
|
|
@@ -437,7 +444,7 @@ def main(
|
|
|
437
444
|
# Extract
|
|
438
445
|
print(" Extracting methylation data...")
|
|
439
446
|
try:
|
|
440
|
-
|
|
447
|
+
extraction_result = extract_methylation_data_from_bam(
|
|
441
448
|
input_bam=input_bam,
|
|
442
449
|
genome_methylation_embedding=genome_methylation_embedding,
|
|
443
450
|
quality_limit=quality_limit,
|
|
@@ -450,16 +457,27 @@ def main(
|
|
|
450
457
|
continue
|
|
451
458
|
|
|
452
459
|
# Matrix stats
|
|
453
|
-
n_reads =
|
|
454
|
-
n_cpgs =
|
|
455
|
-
n_data =
|
|
460
|
+
n_reads = extraction_result.matrix.shape[0]
|
|
461
|
+
n_cpgs = extraction_result.matrix.shape[1]
|
|
462
|
+
n_data = extraction_result.matrix.nnz
|
|
456
463
|
print(
|
|
457
464
|
f" Result: {n_reads:,} reads x {n_cpgs:,} CpG sites"
|
|
458
465
|
f" ({n_data:,} data points)"
|
|
459
466
|
)
|
|
460
467
|
|
|
461
468
|
# Save
|
|
462
|
-
scipy.sparse.save_npz(output_file,
|
|
469
|
+
scipy.sparse.save_npz(output_file, extraction_result.matrix, compressed=True)
|
|
470
|
+
write_npz_tlen(output_file, extraction_result.tlen)
|
|
471
|
+
write_npz_metadata(
|
|
472
|
+
output_file,
|
|
473
|
+
{
|
|
474
|
+
"bam2tensor_version": __version__,
|
|
475
|
+
"genome_name": genome_name,
|
|
476
|
+
"expected_chromosomes": chrom_list,
|
|
477
|
+
"total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
|
|
478
|
+
"cpg_index_crc32": cpg_crc32,
|
|
479
|
+
},
|
|
480
|
+
)
|
|
463
481
|
print(f" Output: {output_file}")
|
|
464
482
|
print(f" Time: {_format_elapsed(time.time() - time_bam)}")
|
|
465
483
|
|
|
@@ -39,15 +39,18 @@ Example:
|
|
|
39
39
|
... )
|
|
40
40
|
>>>
|
|
41
41
|
>>> # Extract methylation data
|
|
42
|
-
>>>
|
|
42
|
+
>>> result = extract_methylation_data_from_bam(
|
|
43
43
|
... input_bam="sample.bam",
|
|
44
44
|
... genome_methylation_embedding=embedding,
|
|
45
45
|
... quality_limit=20,
|
|
46
46
|
... )
|
|
47
47
|
>>>
|
|
48
|
-
>>> print(f"Extracted {matrix.shape[0]} reads, {matrix.nnz} data points")
|
|
48
|
+
>>> print(f"Extracted {result.matrix.shape[0]} reads, {result.matrix.nnz} data points")
|
|
49
49
|
"""
|
|
50
50
|
|
|
51
|
+
from typing import NamedTuple
|
|
52
|
+
|
|
53
|
+
import numpy as np
|
|
51
54
|
import scipy.sparse
|
|
52
55
|
import pysam
|
|
53
56
|
import bisect
|
|
@@ -55,6 +58,23 @@ import bisect
|
|
|
55
58
|
from tqdm import tqdm
|
|
56
59
|
from bam2tensor.embedding import GenomeMethylationEmbedding
|
|
57
60
|
|
|
61
|
+
|
|
62
|
+
class ExtractionResult(NamedTuple):
|
|
63
|
+
"""Result of methylation extraction from a BAM file.
|
|
64
|
+
|
|
65
|
+
Attributes:
|
|
66
|
+
matrix: Sparse COO matrix of shape (n_reads, n_cpg_sites) with
|
|
67
|
+
methylation states: 1 (methylated), 0 (unmethylated), -1
|
|
68
|
+
(no data).
|
|
69
|
+
tlen: 1-D numpy array of shape (n_reads,) containing the signed
|
|
70
|
+
template length (TLEN from BAM) for each read. 0 for
|
|
71
|
+
single-end reads or reads with unmapped mates.
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
matrix: scipy.sparse.coo_matrix
|
|
75
|
+
tlen: np.ndarray
|
|
76
|
+
|
|
77
|
+
|
|
58
78
|
# BAM flag bits for reads to skip: duplicate (0x400), qcfail (0x200),
|
|
59
79
|
# secondary (0x100), supplementary (0x800).
|
|
60
80
|
_SKIP_FLAGS = 0x400 | 0x200 | 0x100 | 0x800
|
|
@@ -180,7 +200,7 @@ def extract_methylation_data_from_bam(
|
|
|
180
200
|
quality_limit: int = 20,
|
|
181
201
|
verbose: bool = False,
|
|
182
202
|
debug: bool = False,
|
|
183
|
-
) ->
|
|
203
|
+
) -> ExtractionResult:
|
|
184
204
|
"""Extract read-level CpG methylation data from a BAM file.
|
|
185
205
|
|
|
186
206
|
Parses a bisulfite-sequencing or EM-seq BAM file and extracts methylation
|
|
@@ -225,14 +245,19 @@ def extract_methylation_data_from_bam(
|
|
|
225
245
|
only processed once. Significantly slower.
|
|
226
246
|
|
|
227
247
|
Returns:
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
248
|
+
An ExtractionResult named tuple with two fields:
|
|
249
|
+
|
|
250
|
+
- **matrix**: A scipy.sparse.coo_matrix with shape
|
|
251
|
+
(n_reads, n_cpg_sites) where n_reads is the number of reads
|
|
252
|
+
that passed filters and covered at least one CpG site,
|
|
253
|
+
n_cpg_sites is genome_methylation_embedding.total_cpg_sites,
|
|
254
|
+
and values are: 1 (methylated), 0 (unmethylated), -1 (no data).
|
|
255
|
+
The matrix uses COO format for efficient construction; convert
|
|
256
|
+
to CSR (tocsr()) for row slicing or CSC (tocsc()) for column
|
|
257
|
+
slicing.
|
|
258
|
+
- **tlen**: A 1-D numpy int32 array of shape (n_reads,) containing
|
|
259
|
+
the signed template length (BAM TLEN field) for each read.
|
|
260
|
+
Values are 0 for single-end reads or reads with unmapped mates.
|
|
236
261
|
|
|
237
262
|
Raises:
|
|
238
263
|
FileNotFoundError: If the BAM file index (.bam.bai) is missing.
|
|
@@ -252,7 +277,7 @@ def extract_methylation_data_from_bam(
|
|
|
252
277
|
... )
|
|
253
278
|
>>>
|
|
254
279
|
>>> # Extract methylation data
|
|
255
|
-
>>>
|
|
280
|
+
>>> result = extract_methylation_data_from_bam(
|
|
256
281
|
... input_bam="sample.bam",
|
|
257
282
|
... genome_methylation_embedding=embedding,
|
|
258
283
|
... quality_limit=30, # Stricter quality filter
|
|
@@ -260,12 +285,12 @@ def extract_methylation_data_from_bam(
|
|
|
260
285
|
... )
|
|
261
286
|
>>>
|
|
262
287
|
>>> # Analyze results
|
|
263
|
-
>>> print(f"Reads with CpG data: {matrix.shape[0]:,}")
|
|
264
|
-
>>> print(f"Total CpG sites: {matrix.shape[1]:,}")
|
|
265
|
-
>>> print(f"Data points: {matrix.nnz:,}")
|
|
288
|
+
>>> print(f"Reads with CpG data: {result.matrix.shape[0]:,}")
|
|
289
|
+
>>> print(f"Total CpG sites: {result.matrix.shape[1]:,}")
|
|
290
|
+
>>> print(f"Data points: {result.matrix.nnz:,}")
|
|
266
291
|
>>>
|
|
267
292
|
>>> # Save to file
|
|
268
|
-
>>> scipy.sparse.save_npz("sample.methylation.npz", matrix)
|
|
293
|
+
>>> scipy.sparse.save_npz("sample.methylation.npz", result.matrix)
|
|
269
294
|
|
|
270
295
|
Note:
|
|
271
296
|
The function processes chromosomes in the order they appear in
|
|
@@ -304,6 +329,7 @@ def extract_methylation_data_from_bam(
|
|
|
304
329
|
coo_row = [] # Read number
|
|
305
330
|
coo_col = [] # CpG number (embedding)
|
|
306
331
|
coo_data = [] # Methylation state
|
|
332
|
+
tlen_list: list[int] = [] # Template length (TLEN) per read
|
|
307
333
|
|
|
308
334
|
# This is slow, but we only run it once and store the results for later
|
|
309
335
|
for chrom, cpg_sites in tqdm(
|
|
@@ -398,6 +424,7 @@ def extract_methylation_data_from_bam(
|
|
|
398
424
|
), "Read seen twice!"
|
|
399
425
|
debug_read_name_to_row_number[read_key] = read_number
|
|
400
426
|
print("************************************************\n")
|
|
427
|
+
tlen_list.append(aligned_segment.template_length)
|
|
401
428
|
read_number += 1
|
|
402
429
|
|
|
403
430
|
continue # Skip the Biscuit/bwameth/gem3 path below
|
|
@@ -526,6 +553,7 @@ def extract_methylation_data_from_bam(
|
|
|
526
553
|
f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
|
|
527
554
|
)
|
|
528
555
|
|
|
556
|
+
tlen_list.append(aligned_segment.template_length)
|
|
529
557
|
read_number += 1
|
|
530
558
|
|
|
531
559
|
if debug:
|
|
@@ -557,6 +585,6 @@ def extract_methylation_data_from_bam(
|
|
|
557
585
|
# Number of columns = number of CpG sites
|
|
558
586
|
assert sparse_matrix.shape[1] == genome_methylation_embedding.total_cpg_sites
|
|
559
587
|
|
|
560
|
-
|
|
588
|
+
tlen_array = np.array(tlen_list, dtype=np.int32)
|
|
561
589
|
|
|
562
|
-
|
|
590
|
+
return ExtractionResult(matrix=sparse_matrix, tlen=tlen_array)
|