bam2tensor 2.3__tar.gz → 2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/constraints.txt +1 -1
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/docs.yml +2 -2
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/labeler.yml +1 -1
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/release.yml +1 -1
- {bam2tensor-2.3 → bam2tensor-2.4}/CLAUDE.md +3 -1
- {bam2tensor-2.3 → bam2tensor-2.4}/PKG-INFO +40 -4
- {bam2tensor-2.3 → bam2tensor-2.4}/README.md +39 -3
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/reference.md +8 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/pyproject.toml +1 -1
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/__init__.py +3 -3
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/__main__.py +11 -6
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/functions.py +46 -18
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/inspect.py +14 -1
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/metadata.py +48 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_duplication.py +5 -3
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_functions.py +233 -79
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_inspect.py +44 -4
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_main.py +33 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_metadata.py +62 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/uv.lock +103 -107
- {bam2tensor-2.3 → bam2tensor-2.4}/.darglint +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.editorconfig +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.gitattributes +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/dependabot.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/labels.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.gitignore +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/LICENSE +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/SECURITY.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/Makefile +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/conf.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/contributing.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/index.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/license.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/make.bat +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/noxfile.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/embedding.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/__init__.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_embedding.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_reference.py +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
nox==2026.2.9
|
|
2
|
-
uv==0.
|
|
2
|
+
uv==0.11.2
|
|
@@ -76,7 +76,7 @@ jobs:
|
|
|
76
76
|
repository-url: https://test.pypi.org/legacy/
|
|
77
77
|
|
|
78
78
|
- name: Publish the release notes
|
|
79
|
-
uses: release-drafter/release-drafter@
|
|
79
|
+
uses: release-drafter/release-drafter@v7.1.1
|
|
80
80
|
with:
|
|
81
81
|
publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
|
|
82
82
|
tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
|
|
@@ -40,7 +40,7 @@ uv run mypy src
|
|
|
40
40
|
|
|
41
41
|
```
|
|
42
42
|
src/bam2tensor/
|
|
43
|
-
__init__.py # Package version (2.3)
|
|
43
|
+
__init__.py # Package version (2.4)
|
|
44
44
|
__main__.py # Click CLI entry point (bam2tensor command)
|
|
45
45
|
inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
|
|
46
46
|
embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
|
|
@@ -117,6 +117,8 @@ xdoctest validates code examples in docstrings. Important rules:
|
|
|
117
117
|
- Columns = CpG sites (ordered by genomic position, determined by reference genome)
|
|
118
118
|
- Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
|
|
119
119
|
- Each .npz file contains a `metadata.json` entry with provenance info (genome name, version, CpG index CRC32, expected chromosomes). Read via `bam2tensor.metadata.read_npz_metadata()`.
|
|
120
|
+
- Each .npz file contains a `tlen.npy` entry with per-read signed template length (BAM TLEN field) as int32. Read via `bam2tensor.metadata.read_npz_tlen()`. Returns `None` for files from older versions.
|
|
121
|
+
- `extract_methylation_data_from_bam()` returns an `ExtractionResult` NamedTuple with `.matrix` (sparse COO) and `.tlen` (numpy int32 array).
|
|
120
122
|
|
|
121
123
|
### Methylation Strand Detection
|
|
122
124
|
- Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bam2tensor
|
|
3
|
-
Version: 2.3
|
|
3
|
+
Version: 2.4
|
|
4
4
|
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
5
|
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
6
|
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
@@ -74,6 +74,7 @@ Description-Content-Type: text/markdown
|
|
|
74
74
|
- [Command-Line Options](#command-line-options)
|
|
75
75
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
76
76
|
- [Output Data Structure](#output-data-structure)
|
|
77
|
+
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
77
78
|
- [Embedded Metadata](#embedded-metadata)
|
|
78
79
|
- [Loading Output Files](#loading-output-files)
|
|
79
80
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
@@ -97,6 +98,7 @@ Description-Content-Type: text/markdown
|
|
|
97
98
|
- **Batch Processing**: Process multiple BAM files with directory recursion
|
|
98
99
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
99
100
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
101
|
+
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
100
102
|
|
|
101
103
|
## Requirements
|
|
102
104
|
|
|
@@ -299,8 +301,9 @@ sample.methylation.npz
|
|
|
299
301
|
Reads: 1,423,891
|
|
300
302
|
CpG sites: 28,217,448
|
|
301
303
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
304
|
+
Fragment len: median 167, mean 182, range [50, 600]
|
|
302
305
|
CpG index CRC32: a1b2c3d4
|
|
303
|
-
bam2tensor: v2.
|
|
306
|
+
bam2tensor: v2.4
|
|
304
307
|
File size: 14.2 MB
|
|
305
308
|
```
|
|
306
309
|
|
|
@@ -333,6 +336,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
|
|
|
333
336
|
|
|
334
337
|
Note: The matrix uses SciPy's COO sparse format, which explicitly stores all non-zero values. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
335
338
|
|
|
339
|
+
### Per-Read Fragment Length (TLEN)
|
|
340
|
+
|
|
341
|
+
Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
|
|
342
|
+
|
|
343
|
+
- One `int32` value per read (row), in the same order as the sparse matrix rows
|
|
344
|
+
- Signed: positive for the leftmost read in a pair, negative for the rightmost
|
|
345
|
+
- Zero for single-end reads or reads with unmapped mates
|
|
346
|
+
- Use `abs(tlen)` to get fragment lengths
|
|
347
|
+
|
|
348
|
+
```python
|
|
349
|
+
from bam2tensor.metadata import read_npz_tlen
|
|
350
|
+
import numpy as np
|
|
351
|
+
|
|
352
|
+
tlen = read_npz_tlen("sample.methylation.npz")
|
|
353
|
+
if tlen is not None:
|
|
354
|
+
frag_lengths = np.abs(tlen)
|
|
355
|
+
nonzero = frag_lengths[frag_lengths > 0]
|
|
356
|
+
print(f"Median fragment length: {np.median(nonzero):.0f}")
|
|
357
|
+
print(f"Mean fragment length: {np.mean(nonzero):.0f}")
|
|
358
|
+
```
|
|
359
|
+
|
|
336
360
|
### Embedded Metadata
|
|
337
361
|
|
|
338
362
|
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
@@ -548,10 +572,22 @@ extract_methylation_data_from_bam(
|
|
|
548
572
|
quality_limit: int = 20, # Minimum MAPQ
|
|
549
573
|
verbose: bool = False, # Enable verbose output
|
|
550
574
|
debug: bool = False # Enable debug output
|
|
551
|
-
) ->
|
|
575
|
+
) -> ExtractionResult
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
**Returns:** An `ExtractionResult` named tuple with two fields:
|
|
579
|
+
- `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
|
|
580
|
+
- `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
|
|
581
|
+
|
|
582
|
+
### `bam2tensor.metadata.read_npz_tlen`
|
|
583
|
+
|
|
584
|
+
Read per-read template lengths from a `.methylation.npz` file.
|
|
585
|
+
|
|
586
|
+
```python
|
|
587
|
+
read_npz_tlen(npz_path: str) -> np.ndarray | None
|
|
552
588
|
```
|
|
553
589
|
|
|
554
|
-
**Returns:**
|
|
590
|
+
**Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
|
|
555
591
|
|
|
556
592
|
## Contributing
|
|
557
593
|
|
|
@@ -41,6 +41,7 @@
|
|
|
41
41
|
- [Command-Line Options](#command-line-options)
|
|
42
42
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
43
43
|
- [Output Data Structure](#output-data-structure)
|
|
44
|
+
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
44
45
|
- [Embedded Metadata](#embedded-metadata)
|
|
45
46
|
- [Loading Output Files](#loading-output-files)
|
|
46
47
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
@@ -64,6 +65,7 @@
|
|
|
64
65
|
- **Batch Processing**: Process multiple BAM files with directory recursion
|
|
65
66
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
66
67
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
68
|
+
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
67
69
|
|
|
68
70
|
## Requirements
|
|
69
71
|
|
|
@@ -266,8 +268,9 @@ sample.methylation.npz
|
|
|
266
268
|
Reads: 1,423,891
|
|
267
269
|
CpG sites: 28,217,448
|
|
268
270
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
271
|
+
Fragment len: median 167, mean 182, range [50, 600]
|
|
269
272
|
CpG index CRC32: a1b2c3d4
|
|
270
|
-
bam2tensor: v2.
|
|
273
|
+
bam2tensor: v2.4
|
|
271
274
|
File size: 14.2 MB
|
|
272
275
|
```
|
|
273
276
|
|
|
@@ -300,6 +303,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
|
|
|
300
303
|
|
|
301
304
|
Note: The matrix uses SciPy's COO sparse format, which explicitly stores all non-zero values. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
302
305
|
|
|
306
|
+
### Per-Read Fragment Length (TLEN)
|
|
307
|
+
|
|
308
|
+
Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
|
|
309
|
+
|
|
310
|
+
- One `int32` value per read (row), in the same order as the sparse matrix rows
|
|
311
|
+
- Signed: positive for the leftmost read in a pair, negative for the rightmost
|
|
312
|
+
- Zero for single-end reads or reads with unmapped mates
|
|
313
|
+
- Use `abs(tlen)` to get fragment lengths
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
from bam2tensor.metadata import read_npz_tlen
|
|
317
|
+
import numpy as np
|
|
318
|
+
|
|
319
|
+
tlen = read_npz_tlen("sample.methylation.npz")
|
|
320
|
+
if tlen is not None:
|
|
321
|
+
frag_lengths = np.abs(tlen)
|
|
322
|
+
nonzero = frag_lengths[frag_lengths > 0]
|
|
323
|
+
print(f"Median fragment length: {np.median(nonzero):.0f}")
|
|
324
|
+
print(f"Mean fragment length: {np.mean(nonzero):.0f}")
|
|
325
|
+
```
|
|
326
|
+
|
|
303
327
|
### Embedded Metadata
|
|
304
328
|
|
|
305
329
|
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
@@ -515,10 +539,22 @@ extract_methylation_data_from_bam(
|
|
|
515
539
|
quality_limit: int = 20, # Minimum MAPQ
|
|
516
540
|
verbose: bool = False, # Enable verbose output
|
|
517
541
|
debug: bool = False # Enable debug output
|
|
518
|
-
) ->
|
|
542
|
+
) -> ExtractionResult
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
**Returns:** An `ExtractionResult` named tuple with two fields:
|
|
546
|
+
- `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
|
|
547
|
+
- `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
|
|
548
|
+
|
|
549
|
+
### `bam2tensor.metadata.read_npz_tlen`
|
|
550
|
+
|
|
551
|
+
Read per-read template lengths from a `.methylation.npz` file.
|
|
552
|
+
|
|
553
|
+
```python
|
|
554
|
+
read_npz_tlen(npz_path: str) -> np.ndarray | None
|
|
519
555
|
```
|
|
520
556
|
|
|
521
|
-
**Returns:**
|
|
557
|
+
**Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
|
|
522
558
|
|
|
523
559
|
## Contributing
|
|
524
560
|
|
|
@@ -32,6 +32,14 @@ bam2tensor.functions module
|
|
|
32
32
|
:show-inheritance:
|
|
33
33
|
:undoc-members:
|
|
34
34
|
|
|
35
|
+
bam2tensor.metadata module
|
|
36
|
+
--------------------------
|
|
37
|
+
|
|
38
|
+
.. automodule:: bam2tensor.metadata
|
|
39
|
+
:members:
|
|
40
|
+
:show-inheritance:
|
|
41
|
+
:undoc-members:
|
|
42
|
+
|
|
35
43
|
bam2tensor.reference module
|
|
36
44
|
---------------------------
|
|
37
45
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.3"
|
|
3
|
+
version = "2.4"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
@@ -30,14 +30,14 @@ Example:
|
|
|
30
30
|
)
|
|
31
31
|
|
|
32
32
|
# Extract methylation data
|
|
33
|
-
|
|
33
|
+
result = extract_methylation_data_from_bam(
|
|
34
34
|
input_bam="/path/to/sample.bam",
|
|
35
35
|
genome_methylation_embedding=embedding,
|
|
36
36
|
)
|
|
37
37
|
|
|
38
38
|
# Save to file
|
|
39
39
|
import scipy.sparse
|
|
40
|
-
scipy.sparse.save_npz("output.npz",
|
|
40
|
+
scipy.sparse.save_npz("output.npz", result.matrix)
|
|
41
41
|
|
|
42
42
|
Output Format:
|
|
43
43
|
The output is a SciPy sparse COO matrix where:
|
|
@@ -50,4 +50,4 @@ See Also:
|
|
|
50
50
|
- https://mcwdsi.github.io/bam2tensor for full documentation
|
|
51
51
|
"""
|
|
52
52
|
|
|
53
|
-
__version__ = "2.3"
|
|
53
|
+
__version__ = "2.4"
|
|
@@ -38,7 +38,11 @@ from bam2tensor.functions import (
|
|
|
38
38
|
detect_aligner,
|
|
39
39
|
extract_methylation_data_from_bam,
|
|
40
40
|
)
|
|
41
|
-
from bam2tensor.metadata import
|
|
41
|
+
from bam2tensor.metadata import (
|
|
42
|
+
compute_cpg_index_crc32,
|
|
43
|
+
write_npz_metadata,
|
|
44
|
+
write_npz_tlen,
|
|
45
|
+
)
|
|
42
46
|
from bam2tensor.reference import (
|
|
43
47
|
KNOWN_GENOMES,
|
|
44
48
|
download_reference as download_reference_fn,
|
|
@@ -440,7 +444,7 @@ def main(
|
|
|
440
444
|
# Extract
|
|
441
445
|
print(" Extracting methylation data...")
|
|
442
446
|
try:
|
|
443
|
-
|
|
447
|
+
extraction_result = extract_methylation_data_from_bam(
|
|
444
448
|
input_bam=input_bam,
|
|
445
449
|
genome_methylation_embedding=genome_methylation_embedding,
|
|
446
450
|
quality_limit=quality_limit,
|
|
@@ -453,16 +457,17 @@ def main(
|
|
|
453
457
|
continue
|
|
454
458
|
|
|
455
459
|
# Matrix stats
|
|
456
|
-
n_reads =
|
|
457
|
-
n_cpgs =
|
|
458
|
-
n_data =
|
|
460
|
+
n_reads = extraction_result.matrix.shape[0]
|
|
461
|
+
n_cpgs = extraction_result.matrix.shape[1]
|
|
462
|
+
n_data = extraction_result.matrix.nnz
|
|
459
463
|
print(
|
|
460
464
|
f" Result: {n_reads:,} reads x {n_cpgs:,} CpG sites"
|
|
461
465
|
f" ({n_data:,} data points)"
|
|
462
466
|
)
|
|
463
467
|
|
|
464
468
|
# Save
|
|
465
|
-
scipy.sparse.save_npz(output_file,
|
|
469
|
+
scipy.sparse.save_npz(output_file, extraction_result.matrix, compressed=True)
|
|
470
|
+
write_npz_tlen(output_file, extraction_result.tlen)
|
|
466
471
|
write_npz_metadata(
|
|
467
472
|
output_file,
|
|
468
473
|
{
|
|
@@ -39,15 +39,18 @@ Example:
|
|
|
39
39
|
... )
|
|
40
40
|
>>>
|
|
41
41
|
>>> # Extract methylation data
|
|
42
|
-
>>>
|
|
42
|
+
>>> result = extract_methylation_data_from_bam(
|
|
43
43
|
... input_bam="sample.bam",
|
|
44
44
|
... genome_methylation_embedding=embedding,
|
|
45
45
|
... quality_limit=20,
|
|
46
46
|
... )
|
|
47
47
|
>>>
|
|
48
|
-
>>> print(f"Extracted {matrix.shape[0]} reads, {matrix.nnz} data points")
|
|
48
|
+
>>> print(f"Extracted {result.matrix.shape[0]} reads, {result.matrix.nnz} data points")
|
|
49
49
|
"""
|
|
50
50
|
|
|
51
|
+
from typing import NamedTuple
|
|
52
|
+
|
|
53
|
+
import numpy as np
|
|
51
54
|
import scipy.sparse
|
|
52
55
|
import pysam
|
|
53
56
|
import bisect
|
|
@@ -55,6 +58,23 @@ import bisect
|
|
|
55
58
|
from tqdm import tqdm
|
|
56
59
|
from bam2tensor.embedding import GenomeMethylationEmbedding
|
|
57
60
|
|
|
61
|
+
|
|
62
|
+
class ExtractionResult(NamedTuple):
|
|
63
|
+
"""Result of methylation extraction from a BAM file.
|
|
64
|
+
|
|
65
|
+
Attributes:
|
|
66
|
+
matrix: Sparse COO matrix of shape (n_reads, n_cpg_sites) with
|
|
67
|
+
methylation states: 1 (methylated), 0 (unmethylated), -1
|
|
68
|
+
(no data).
|
|
69
|
+
tlen: 1-D numpy array of shape (n_reads,) containing the signed
|
|
70
|
+
template length (TLEN from BAM) for each read. 0 for
|
|
71
|
+
single-end reads or reads with unmapped mates.
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
matrix: scipy.sparse.coo_matrix
|
|
75
|
+
tlen: np.ndarray
|
|
76
|
+
|
|
77
|
+
|
|
58
78
|
# BAM flag bits for reads to skip: duplicate (0x400), qcfail (0x200),
|
|
59
79
|
# secondary (0x100), supplementary (0x800).
|
|
60
80
|
_SKIP_FLAGS = 0x400 | 0x200 | 0x100 | 0x800
|
|
@@ -180,7 +200,7 @@ def extract_methylation_data_from_bam(
|
|
|
180
200
|
quality_limit: int = 20,
|
|
181
201
|
verbose: bool = False,
|
|
182
202
|
debug: bool = False,
|
|
183
|
-
) ->
|
|
203
|
+
) -> ExtractionResult:
|
|
184
204
|
"""Extract read-level CpG methylation data from a BAM file.
|
|
185
205
|
|
|
186
206
|
Parses a bisulfite-sequencing or EM-seq BAM file and extracts methylation
|
|
@@ -225,14 +245,19 @@ def extract_methylation_data_from_bam(
|
|
|
225
245
|
only processed once. Significantly slower.
|
|
226
246
|
|
|
227
247
|
Returns:
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
248
|
+
An ExtractionResult named tuple with two fields:
|
|
249
|
+
|
|
250
|
+
- **matrix**: A scipy.sparse.coo_matrix with shape
|
|
251
|
+
(n_reads, n_cpg_sites) where n_reads is the number of reads
|
|
252
|
+
that passed filters and covered at least one CpG site,
|
|
253
|
+
n_cpg_sites is genome_methylation_embedding.total_cpg_sites,
|
|
254
|
+
and values are: 1 (methylated), 0 (unmethylated), -1 (no data).
|
|
255
|
+
The matrix uses COO format for efficient construction; convert
|
|
256
|
+
to CSR (tocsr()) for row slicing or CSC (tocsc()) for column
|
|
257
|
+
slicing.
|
|
258
|
+
- **tlen**: A 1-D numpy int32 array of shape (n_reads,) containing
|
|
259
|
+
the signed template length (BAM TLEN field) for each read.
|
|
260
|
+
Values are 0 for single-end reads or reads with unmapped mates.
|
|
236
261
|
|
|
237
262
|
Raises:
|
|
238
263
|
FileNotFoundError: If the BAM file index (.bam.bai) is missing.
|
|
@@ -252,7 +277,7 @@ def extract_methylation_data_from_bam(
|
|
|
252
277
|
... )
|
|
253
278
|
>>>
|
|
254
279
|
>>> # Extract methylation data
|
|
255
|
-
>>>
|
|
280
|
+
>>> result = extract_methylation_data_from_bam(
|
|
256
281
|
... input_bam="sample.bam",
|
|
257
282
|
... genome_methylation_embedding=embedding,
|
|
258
283
|
... quality_limit=30, # Stricter quality filter
|
|
@@ -260,12 +285,12 @@ def extract_methylation_data_from_bam(
|
|
|
260
285
|
... )
|
|
261
286
|
>>>
|
|
262
287
|
>>> # Analyze results
|
|
263
|
-
>>> print(f"Reads with CpG data: {matrix.shape[0]:,}")
|
|
264
|
-
>>> print(f"Total CpG sites: {matrix.shape[1]:,}")
|
|
265
|
-
>>> print(f"Data points: {matrix.nnz:,}")
|
|
288
|
+
>>> print(f"Reads with CpG data: {result.matrix.shape[0]:,}")
|
|
289
|
+
>>> print(f"Total CpG sites: {result.matrix.shape[1]:,}")
|
|
290
|
+
>>> print(f"Data points: {result.matrix.nnz:,}")
|
|
266
291
|
>>>
|
|
267
292
|
>>> # Save to file
|
|
268
|
-
>>> scipy.sparse.save_npz("sample.methylation.npz", matrix)
|
|
293
|
+
>>> scipy.sparse.save_npz("sample.methylation.npz", result.matrix)
|
|
269
294
|
|
|
270
295
|
Note:
|
|
271
296
|
The function processes chromosomes in the order they appear in
|
|
@@ -304,6 +329,7 @@ def extract_methylation_data_from_bam(
|
|
|
304
329
|
coo_row = [] # Read number
|
|
305
330
|
coo_col = [] # CpG number (embedding)
|
|
306
331
|
coo_data = [] # Methylation state
|
|
332
|
+
tlen_list: list[int] = [] # Template length (TLEN) per read
|
|
307
333
|
|
|
308
334
|
# This is slow, but we only run it once and store the results for later
|
|
309
335
|
for chrom, cpg_sites in tqdm(
|
|
@@ -398,6 +424,7 @@ def extract_methylation_data_from_bam(
|
|
|
398
424
|
), "Read seen twice!"
|
|
399
425
|
debug_read_name_to_row_number[read_key] = read_number
|
|
400
426
|
print("************************************************\n")
|
|
427
|
+
tlen_list.append(aligned_segment.template_length)
|
|
401
428
|
read_number += 1
|
|
402
429
|
|
|
403
430
|
continue # Skip the Biscuit/bwameth/gem3 path below
|
|
@@ -526,6 +553,7 @@ def extract_methylation_data_from_bam(
|
|
|
526
553
|
f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
|
|
527
554
|
)
|
|
528
555
|
|
|
556
|
+
tlen_list.append(aligned_segment.template_length)
|
|
529
557
|
read_number += 1
|
|
530
558
|
|
|
531
559
|
if debug:
|
|
@@ -557,6 +585,6 @@ def extract_methylation_data_from_bam(
|
|
|
557
585
|
# Number of columns = number of CpG sites
|
|
558
586
|
assert sparse_matrix.shape[1] == genome_methylation_embedding.total_cpg_sites
|
|
559
587
|
|
|
560
|
-
|
|
588
|
+
tlen_array = np.array(tlen_list, dtype=np.int32)
|
|
561
589
|
|
|
562
|
-
|
|
590
|
+
return ExtractionResult(matrix=sparse_matrix, tlen=tlen_array)
|
|
@@ -21,7 +21,7 @@ import click
|
|
|
21
21
|
import numpy as np
|
|
22
22
|
import scipy.sparse
|
|
23
23
|
|
|
24
|
-
from bam2tensor.metadata import read_npz_metadata
|
|
24
|
+
from bam2tensor.metadata import read_npz_metadata, read_npz_tlen
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def _format_size(nbytes: int) -> str:
|
|
@@ -103,6 +103,19 @@ def inspect_npz(npz_path: str) -> None:
|
|
|
103
103
|
print(f" CpG sites: {n_cpgs:,}")
|
|
104
104
|
print(f" Data points: {n_data:,} (sparsity: {sparsity:.2%})")
|
|
105
105
|
|
|
106
|
+
# TLEN / fragment length statistics
|
|
107
|
+
tlen = read_npz_tlen(npz_path)
|
|
108
|
+
if tlen is not None:
|
|
109
|
+
nonzero = np.abs(tlen)[np.abs(tlen) > 0]
|
|
110
|
+
if len(nonzero) > 0:
|
|
111
|
+
print(
|
|
112
|
+
f" Fragment len: median {np.median(nonzero):.0f}, "
|
|
113
|
+
f"mean {np.mean(nonzero):.0f}, "
|
|
114
|
+
f"range [{nonzero.min()}, {nonzero.max()}]"
|
|
115
|
+
)
|
|
116
|
+
else:
|
|
117
|
+
print(" Fragment len: all zero (single-end data)")
|
|
118
|
+
|
|
106
119
|
if meta and "cpg_index_crc32" in meta:
|
|
107
120
|
print(f" CpG index CRC32: {meta['cpg_index_crc32']}")
|
|
108
121
|
if meta and "bam2tensor_version" in meta:
|
|
@@ -27,10 +27,13 @@ Example:
|
|
|
27
27
|
hg38
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
|
+
import io
|
|
30
31
|
import json
|
|
31
32
|
import zipfile
|
|
32
33
|
import zlib
|
|
33
34
|
|
|
35
|
+
import numpy as np
|
|
36
|
+
|
|
34
37
|
from bam2tensor.embedding import GenomeMethylationEmbedding
|
|
35
38
|
|
|
36
39
|
|
|
@@ -112,3 +115,48 @@ def read_npz_metadata(npz_path: str) -> dict | None:
|
|
|
112
115
|
if "metadata.json" in zf.namelist():
|
|
113
116
|
return json.loads(zf.read("metadata.json"))
|
|
114
117
|
return None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def write_npz_tlen(npz_path: str, tlen: np.ndarray) -> None:
|
|
121
|
+
"""Append a ``tlen.npy`` entry to an existing ``.npz`` file.
|
|
122
|
+
|
|
123
|
+
The array is serialised with :func:`numpy.save` and appended to the
|
|
124
|
+
ZIP archive. ``scipy.sparse.load_npz`` ignores this extra entry, so
|
|
125
|
+
the file remains compatible with existing sparse-matrix code.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
npz_path: Path to the ``.npz`` file (must already exist).
|
|
129
|
+
tlen: A 1-D numpy array of per-read template lengths.
|
|
130
|
+
|
|
131
|
+
Example:
|
|
132
|
+
>>> # xdoctest: +SKIP
|
|
133
|
+
>>> write_npz_tlen("out.npz", np.array([300, -300, 0], dtype=np.int32))
|
|
134
|
+
"""
|
|
135
|
+
buf = io.BytesIO()
|
|
136
|
+
np.save(buf, tlen)
|
|
137
|
+
with zipfile.ZipFile(npz_path, "a") as zf:
|
|
138
|
+
zf.writestr("tlen.npy", buf.getvalue())
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def read_npz_tlen(npz_path: str) -> np.ndarray | None:
|
|
142
|
+
"""Read the ``tlen.npy`` entry from a ``.npz`` file.
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
npz_path: Path to the ``.npz`` file.
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
The per-read template-length array, or ``None`` if the file does
|
|
149
|
+
not contain a ``tlen.npy`` entry (e.g. files produced by older
|
|
150
|
+
versions).
|
|
151
|
+
|
|
152
|
+
Example:
|
|
153
|
+
>>> # xdoctest: +SKIP
|
|
154
|
+
>>> tlen = read_npz_tlen("sample.methylation.npz")
|
|
155
|
+
>>> if tlen is not None:
|
|
156
|
+
... print(f"Median fragment length: {np.median(np.abs(tlen)):.0f}")
|
|
157
|
+
"""
|
|
158
|
+
with zipfile.ZipFile(npz_path, "r") as zf:
|
|
159
|
+
if "tlen.npy" in zf.namelist():
|
|
160
|
+
buf = io.BytesIO(zf.read("tlen.npy"))
|
|
161
|
+
return np.load(buf, allow_pickle=False)
|
|
162
|
+
return None
|
|
@@ -54,10 +54,12 @@ def test_duplication_bug(tmp_path):
|
|
|
54
54
|
pysam.index(str(bam_path))
|
|
55
55
|
|
|
56
56
|
# 4. Run extraction
|
|
57
|
-
|
|
57
|
+
result = functions.extract_methylation_data_from_bam(
|
|
58
58
|
input_bam=str(bam_path), genome_methylation_embedding=emb, debug=True
|
|
59
59
|
)
|
|
60
60
|
|
|
61
|
-
print(f"Matrix shape: {matrix.shape}")
|
|
61
|
+
print(f"Matrix shape: {result.matrix.shape}")
|
|
62
62
|
# We expect exactly 1 row because there is only 1 read
|
|
63
|
-
assert
|
|
63
|
+
assert (
|
|
64
|
+
result.matrix.shape[0] == 1
|
|
65
|
+
), f"Expected 1 row (read), got {result.matrix.shape[0]}"
|