bam2tensor 2.2__tar.gz → 2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/constraints.txt +1 -1
  2. {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/docs.yml +2 -2
  3. {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/labeler.yml +1 -1
  4. {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/release.yml +1 -1
  5. {bam2tensor-2.2 → bam2tensor-2.4}/CLAUDE.md +12 -3
  6. {bam2tensor-2.2 → bam2tensor-2.4}/PKG-INFO +105 -10
  7. {bam2tensor-2.2 → bam2tensor-2.4}/README.md +104 -9
  8. {bam2tensor-2.2 → bam2tensor-2.4}/docs/reference.md +8 -0
  9. {bam2tensor-2.2 → bam2tensor-2.4}/pyproject.toml +2 -1
  10. {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/__init__.py +3 -3
  11. {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/__main__.py +23 -5
  12. {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/functions.py +46 -18
  13. bam2tensor-2.4/src/bam2tensor/inspect.py +156 -0
  14. bam2tensor-2.4/src/bam2tensor/metadata.py +162 -0
  15. {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_duplication.py +5 -3
  16. {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_functions.py +233 -79
  17. bam2tensor-2.4/tests/test_inspect.py +186 -0
  18. {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_main.py +33 -0
  19. bam2tensor-2.4/tests/test_metadata.py +224 -0
  20. {bam2tensor-2.2 → bam2tensor-2.4}/uv.lock +104 -108
  21. {bam2tensor-2.2 → bam2tensor-2.4}/.darglint +0 -0
  22. {bam2tensor-2.2 → bam2tensor-2.4}/.editorconfig +0 -0
  23. {bam2tensor-2.2 → bam2tensor-2.4}/.gitattributes +0 -0
  24. {bam2tensor-2.2 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  25. {bam2tensor-2.2 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  26. {bam2tensor-2.2 → bam2tensor-2.4}/.github/actions/setup-env/action.yml +0 -0
  27. {bam2tensor-2.2 → bam2tensor-2.4}/.github/dependabot.yml +0 -0
  28. {bam2tensor-2.2 → bam2tensor-2.4}/.github/labels.yml +0 -0
  29. {bam2tensor-2.2 → bam2tensor-2.4}/.github/release-drafter.yml +0 -0
  30. {bam2tensor-2.2 → bam2tensor-2.4}/.github/workflows/tests.yml +0 -0
  31. {bam2tensor-2.2 → bam2tensor-2.4}/.gitignore +0 -0
  32. {bam2tensor-2.2 → bam2tensor-2.4}/.pre-commit-config.yaml +0 -0
  33. {bam2tensor-2.2 → bam2tensor-2.4}/CONTRIBUTING.md +0 -0
  34. {bam2tensor-2.2 → bam2tensor-2.4}/LICENSE +0 -0
  35. {bam2tensor-2.2 → bam2tensor-2.4}/SECURITY.md +0 -0
  36. {bam2tensor-2.2 → bam2tensor-2.4}/docs/Makefile +0 -0
  37. {bam2tensor-2.2 → bam2tensor-2.4}/docs/conf.py +0 -0
  38. {bam2tensor-2.2 → bam2tensor-2.4}/docs/contributing.md +0 -0
  39. {bam2tensor-2.2 → bam2tensor-2.4}/docs/index.md +0 -0
  40. {bam2tensor-2.2 → bam2tensor-2.4}/docs/license.md +0 -0
  41. {bam2tensor-2.2 → bam2tensor-2.4}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  42. {bam2tensor-2.2 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.afdesign +0 -0
  43. {bam2tensor-2.2 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.png +0 -0
  44. {bam2tensor-2.2 → bam2tensor-2.4}/docs/make.bat +0 -0
  45. {bam2tensor-2.2 → bam2tensor-2.4}/docs/nano-banana-overview-shrunk.png +0 -0
  46. {bam2tensor-2.2 → bam2tensor-2.4}/docs/templates/package.rst_t +0 -0
  47. {bam2tensor-2.2 → bam2tensor-2.4}/noxfile.py +0 -0
  48. {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/embedding.py +0 -0
  49. {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/py.typed +0 -0
  50. {bam2tensor-2.2 → bam2tensor-2.4}/src/bam2tensor/reference.py +0 -0
  51. {bam2tensor-2.2 → bam2tensor-2.4}/tests/__init__.py +0 -0
  52. {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_embedding.py +0 -0
  53. {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_fasta.fa +0 -0
  54. {bam2tensor-2.2 → bam2tensor-2.4}/tests/test_reference.py +0 -0
@@ -1,2 +1,2 @@
1
1
  nox==2026.2.9
2
- uv==0.10.7
2
+ uv==0.11.2
@@ -59,8 +59,8 @@ jobs:
59
59
  needs: build
60
60
  steps:
61
61
  - name: Setup Pages
62
- uses: actions/configure-pages@v5
62
+ uses: actions/configure-pages@v6
63
63
 
64
64
  - name: Deploy to GitHub Pages
65
65
  id: deployment
66
- uses: actions/deploy-pages@v4
66
+ uses: actions/deploy-pages@v5
@@ -20,6 +20,6 @@ jobs:
20
20
  uses: actions/checkout@v6
21
21
 
22
22
  - name: Run Labeler
23
- uses: crazy-max/ghaction-github-labeler@v5.3.0
23
+ uses: crazy-max/ghaction-github-labeler@v6.0.0
24
24
  with:
25
25
  skip-delete: true
@@ -76,7 +76,7 @@ jobs:
76
76
  repository-url: https://test.pypi.org/legacy/
77
77
 
78
78
  - name: Publish the release notes
79
- uses: release-drafter/release-drafter@v6.2.0
79
+ uses: release-drafter/release-drafter@v7.1.1
80
80
  with:
81
81
  publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
82
82
  tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
@@ -40,10 +40,12 @@ uv run mypy src
40
40
 
41
41
  ```
42
42
  src/bam2tensor/
43
- __init__.py # Package version (2.1)
44
- __main__.py # Click CLI entry point
43
+ __init__.py # Package version (2.4)
44
+ __main__.py # Click CLI entry point (bam2tensor command)
45
+ inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
45
46
  embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
46
47
  functions.py # Core extraction: extract_methylation_data_from_bam()
48
+ metadata.py # .npz metadata read/write (provenance info in output files)
47
49
  reference.py # Reference genome download and caching utilities
48
50
 
49
51
  tests/
@@ -51,6 +53,8 @@ tests/
51
53
  test_functions.py # Core function tests
52
54
  test_embedding.py # Embedding class tests
53
55
  test_duplication.py # Read duplication bug tests
56
+ test_inspect.py # Inspect CLI tests
57
+ test_metadata.py # Metadata read/write/round-trip tests
54
58
  test_reference.py # Reference download/caching tests
55
59
  test.bam, test.bam.bai, test_fasta.fa # Test fixtures
56
60
  ```
@@ -110,8 +114,11 @@ xdoctest validates code examples in docstrings. Important rules:
110
114
  ### Data Structure
111
115
  - Output: scipy sparse COO matrix saved as .npz
112
116
  - Rows = unique reads (primary alignments)
113
- - Columns = CpG sites
117
+ - Columns = CpG sites (ordered by genomic position, determined by reference genome)
114
118
  - Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
119
+ - Each .npz file contains a `metadata.json` entry with provenance info (bam2tensor version, genome name, expected chromosomes, total CpG sites, CpG index CRC32). Read via `bam2tensor.metadata.read_npz_metadata()`.
120
+ - Each .npz file contains a `tlen.npy` entry with per-read signed template length (BAM TLEN field) as int32. Read via `bam2tensor.metadata.read_npz_tlen()`. Returns `None` for files from older versions.
121
+ - `extract_methylation_data_from_bam()` returns an `ExtractionResult` NamedTuple with `.matrix` (sparse COO) and `.tlen` (numpy int32 array).
115
122
 
116
123
  ### Methylation Strand Detection
117
124
  - Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
@@ -144,6 +151,8 @@ xdoctest validates code examples in docstrings. Important rules:
144
151
  uv run bam2tensor --input-path input.bam --reference-fasta ref.fa
145
152
  # Or with auto-download:
146
153
  uv run bam2tensor --input-path input.bam --download-reference hg38
154
+ # Inspect an output file:
155
+ uv run bam2tensor-inspect output.methylation.npz
147
156
  ```
148
157
 
149
158
  ### Reference Genome Downloads
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.2
3
+ Version: 2.4
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -72,7 +72,10 @@ Description-Content-Type: text/markdown
72
72
  - [Custom Output Directory](#custom-output-directory)
73
73
  - [Using a Custom Genome](#using-a-custom-genome)
74
74
  - [Command-Line Options](#command-line-options)
75
+ - [Inspecting Output Files](#inspecting-output-files)
75
76
  - [Output Data Structure](#output-data-structure)
77
+ - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
78
+ - [Embedded Metadata](#embedded-metadata)
76
79
  - [Loading Output Files](#loading-output-files)
77
80
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
78
81
  - [Working with Genomic Coordinates](#working-with-genomic-coordinates)
@@ -95,6 +98,7 @@ Description-Content-Type: text/markdown
95
98
  - **Batch Processing**: Process multiple BAM files with directory recursion
96
99
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
97
100
  - **Quality Filtering**: Configurable mapping quality thresholds
101
+ - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
98
102
 
99
103
  ## Requirements
100
104
 
@@ -285,24 +289,103 @@ Options:
285
289
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
286
290
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
287
291
 
292
+ ## Inspecting Output Files
293
+
294
+ Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
295
+
296
+ ```bash
297
+ $ bam2tensor-inspect sample.methylation.npz
298
+ sample.methylation.npz
299
+ Genome: hg38
300
+ Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
301
+ Reads: 1,423,891
302
+ CpG sites: 28,217,448
303
+ Data points: 12,847,322 (sparsity: 99.97%)
304
+ Fragment len: median 167, mean 182, range [50, 600]
305
+ CpG index CRC32: a1b2c3d4
306
+ bam2tensor: v2.4
307
+ File size: 14.2 MB
308
+ ```
309
+
310
+ You can pass multiple files at once:
311
+
312
+ ```bash
313
+ $ bam2tensor-inspect *.methylation.npz
314
+ ```
315
+
316
+ This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
317
+
288
318
  ## Output Data Structure
289
319
 
290
- bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
320
+ bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
291
321
 
292
322
  | Dimension | Represents |
293
323
  |-----------|------------|
294
- | Rows | Unique reads (primary alignments that pass quality filters) |
295
- | Columns | CpG sites (ordered by genomic position across all chromosomes) |
324
+ | **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
325
+ | **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
326
+
327
+ The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
296
328
 
297
329
  ### Methylation State Values
298
330
 
299
331
  | Value | Meaning |
300
332
  |-------|---------|
301
- | `1` | Methylated (cytosine preserved as C) |
302
- | `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
303
- | `-1` | No data (indel, SNV, or site not covered by read) |
333
+ | `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
334
+ | `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
335
+ | `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
336
+
337
+ Note: The matrix uses SciPy's COO sparse format, which stores exactly the entries that were written to it, including explicit zeros. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
338
+
339
+ ### Per-Read Fragment Length (TLEN)
340
+
341
+ Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
342
+
343
+ - One `int32` value per read (row), in the same order as the sparse matrix rows
344
+ - Signed: positive for the leftmost read in a pair, negative for the rightmost
345
+ - Zero for single-end reads or reads with unmapped mates
346
+ - Use `abs(tlen)` to get fragment lengths
304
347
 
305
- Note: Sparse matrices only store non-zero values. Positions with value `0` (unmethylated) are stored, but positions not covered by a read are simply absent from the matrix.
348
+ ```python
349
+ from bam2tensor.metadata import read_npz_tlen
350
+ import numpy as np
351
+
352
+ tlen = read_npz_tlen("sample.methylation.npz")
353
+ if tlen is not None:
354
+ frag_lengths = np.abs(tlen)
355
+ nonzero = frag_lengths[frag_lengths > 0]
356
+ print(f"Median fragment length: {np.median(nonzero):.0f}")
357
+ print(f"Mean fragment length: {np.mean(nonzero):.0f}")
358
+ ```
359
+
360
+ ### Embedded Metadata
361
+
362
+ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
363
+
364
+ | Field | Description |
365
+ |-------|-------------|
366
+ | `bam2tensor_version` | Version of bam2tensor that produced the file |
367
+ | `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
368
+ | `expected_chromosomes` | List of chromosomes included in the column mapping |
369
+ | `total_cpg_sites` | Total number of CpG columns in the matrix |
370
+ | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
371
+
372
+ This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
373
+
374
+ ```python
375
+ from bam2tensor.metadata import read_npz_metadata
376
+
377
+ meta = read_npz_metadata("sample.methylation.npz")
378
+ if meta is not None:
379
+ print(f"Genome: {meta['genome_name']}")
380
+ print(f"CpG sites: {meta['total_cpg_sites']:,}")
381
+ print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
382
+ ```
383
+
384
+ The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files with the same CRC32 can be treated as having identical column semantics (same chromosomes, same CpG positions, same order), so their matrices can be directly stacked or compared; because CRC32 is only a 32-bit checksum, a match is strong evidence rather than an absolute guarantee. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives:
385
+
386
+ ```bash
387
+ unzip -p sample.methylation.npz metadata.json | python -m json.tool
388
+ ```
306
389
 
307
390
  ### Loading Output Files
308
391
 
@@ -489,10 +572,22 @@ extract_methylation_data_from_bam(
489
572
  quality_limit: int = 20, # Minimum MAPQ
490
573
  verbose: bool = False, # Enable verbose output
491
574
  debug: bool = False # Enable debug output
492
- ) -> scipy.sparse.coo_matrix
575
+ ) -> ExtractionResult
576
+ ```
577
+
578
+ **Returns:** An `ExtractionResult` named tuple with two fields:
579
+ - `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
580
+ - `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
581
+
582
+ ### `bam2tensor.metadata.read_npz_tlen`
583
+
584
+ Read per-read template lengths from a `.methylation.npz` file.
585
+
586
+ ```python
587
+ read_npz_tlen(npz_path: str) -> np.ndarray | None
493
588
  ```
494
589
 
495
- **Returns:** A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites).
590
+ **Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
496
591
 
497
592
  ## Contributing
498
593
 
@@ -39,7 +39,10 @@
39
39
  - [Custom Output Directory](#custom-output-directory)
40
40
  - [Using a Custom Genome](#using-a-custom-genome)
41
41
  - [Command-Line Options](#command-line-options)
42
+ - [Inspecting Output Files](#inspecting-output-files)
42
43
  - [Output Data Structure](#output-data-structure)
44
+ - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
45
+ - [Embedded Metadata](#embedded-metadata)
43
46
  - [Loading Output Files](#loading-output-files)
44
47
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
45
48
  - [Working with Genomic Coordinates](#working-with-genomic-coordinates)
@@ -62,6 +65,7 @@
62
65
  - **Batch Processing**: Process multiple BAM files with directory recursion
63
66
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
64
67
  - **Quality Filtering**: Configurable mapping quality thresholds
68
+ - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
65
69
 
66
70
  ## Requirements
67
71
 
@@ -252,24 +256,103 @@ Options:
252
256
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
253
257
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
254
258
 
259
+ ## Inspecting Output Files
260
+
261
+ Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
262
+
263
+ ```bash
264
+ $ bam2tensor-inspect sample.methylation.npz
265
+ sample.methylation.npz
266
+ Genome: hg38
267
+ Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
268
+ Reads: 1,423,891
269
+ CpG sites: 28,217,448
270
+ Data points: 12,847,322 (sparsity: 99.97%)
271
+ Fragment len: median 167, mean 182, range [50, 600]
272
+ CpG index CRC32: a1b2c3d4
273
+ bam2tensor: v2.4
274
+ File size: 14.2 MB
275
+ ```
276
+
277
+ You can pass multiple files at once:
278
+
279
+ ```bash
280
+ $ bam2tensor-inspect *.methylation.npz
281
+ ```
282
+
283
+ This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
284
+
255
285
  ## Output Data Structure
256
286
 
257
- bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
287
+ bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
258
288
 
259
289
  | Dimension | Represents |
260
290
  |-----------|------------|
261
- | Rows | Unique reads (primary alignments that pass quality filters) |
262
- | Columns | CpG sites (ordered by genomic position across all chromosomes) |
291
+ | **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
292
+ | **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
293
+
294
+ The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
263
295
 
264
296
  ### Methylation State Values
265
297
 
266
298
  | Value | Meaning |
267
299
  |-------|---------|
268
- | `1` | Methylated (cytosine preserved as C) |
269
- | `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
270
- | `-1` | No data (indel, SNV, or site not covered by read) |
300
+ | `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
301
+ | `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
302
+ | `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
303
+
304
+ Note: The matrix uses SciPy's COO sparse format, which stores exactly the entries that were written to it, including explicit zeros. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
305
+
306
+ ### Per-Read Fragment Length (TLEN)
307
+
308
+ Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
309
+
310
+ - One `int32` value per read (row), in the same order as the sparse matrix rows
311
+ - Signed: positive for the leftmost read in a pair, negative for the rightmost
312
+ - Zero for single-end reads or reads with unmapped mates
313
+ - Use `abs(tlen)` to get fragment lengths
271
314
 
272
- Note: Sparse matrices only store non-zero values. Positions with value `0` (unmethylated) are stored, but positions not covered by a read are simply absent from the matrix.
315
+ ```python
316
+ from bam2tensor.metadata import read_npz_tlen
317
+ import numpy as np
318
+
319
+ tlen = read_npz_tlen("sample.methylation.npz")
320
+ if tlen is not None:
321
+ frag_lengths = np.abs(tlen)
322
+ nonzero = frag_lengths[frag_lengths > 0]
323
+ print(f"Median fragment length: {np.median(nonzero):.0f}")
324
+ print(f"Mean fragment length: {np.mean(nonzero):.0f}")
325
+ ```
326
+
327
+ ### Embedded Metadata
328
+
329
+ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
330
+
331
+ | Field | Description |
332
+ |-------|-------------|
333
+ | `bam2tensor_version` | Version of bam2tensor that produced the file |
334
+ | `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
335
+ | `expected_chromosomes` | List of chromosomes included in the column mapping |
336
+ | `total_cpg_sites` | Total number of CpG columns in the matrix |
337
+ | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
338
+
339
+ This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
340
+
341
+ ```python
342
+ from bam2tensor.metadata import read_npz_metadata
343
+
344
+ meta = read_npz_metadata("sample.methylation.npz")
345
+ if meta is not None:
346
+ print(f"Genome: {meta['genome_name']}")
347
+ print(f"CpG sites: {meta['total_cpg_sites']:,}")
348
+ print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
349
+ ```
350
+
351
+ The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files with the same CRC32 can be treated as having identical column semantics (same chromosomes, same CpG positions, same order), so their matrices can be directly stacked or compared; because CRC32 is only a 32-bit checksum, a match is strong evidence rather than an absolute guarantee. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives:
352
+
353
+ ```bash
354
+ unzip -p sample.methylation.npz metadata.json | python -m json.tool
355
+ ```
273
356
 
274
357
  ### Loading Output Files
275
358
 
@@ -456,10 +539,22 @@ extract_methylation_data_from_bam(
456
539
  quality_limit: int = 20, # Minimum MAPQ
457
540
  verbose: bool = False, # Enable verbose output
458
541
  debug: bool = False # Enable debug output
459
- ) -> scipy.sparse.coo_matrix
542
+ ) -> ExtractionResult
543
+ ```
544
+
545
+ **Returns:** An `ExtractionResult` named tuple with two fields:
546
+ - `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
547
+ - `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
548
+
549
+ ### `bam2tensor.metadata.read_npz_tlen`
550
+
551
+ Read per-read template lengths from a `.methylation.npz` file.
552
+
553
+ ```python
554
+ read_npz_tlen(npz_path: str) -> np.ndarray | None
460
555
  ```
461
556
 
462
- **Returns:** A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites).
557
+ **Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
463
558
 
464
559
  ## Contributing
465
560
 
@@ -32,6 +32,14 @@ bam2tensor.functions module
32
32
  :show-inheritance:
33
33
  :undoc-members:
34
34
 
35
+ bam2tensor.metadata module
36
+ --------------------------
37
+
38
+ .. automodule:: bam2tensor.metadata
39
+ :members:
40
+ :show-inheritance:
41
+ :undoc-members:
42
+
35
43
  bam2tensor.reference module
36
44
  ---------------------------
37
45
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.2"
3
+ version = "2.4"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -38,6 +38,7 @@ Changelog = "https://github.com/mcwdsi/bam2tensor/releases"
38
38
 
39
39
  [project.scripts]
40
40
  bam2tensor = "bam2tensor.__main__:main"
41
+ bam2tensor-inspect = "bam2tensor.inspect:main"
41
42
 
42
43
  [dependency-groups]
43
44
  dev = [
@@ -30,14 +30,14 @@ Example:
30
30
  )
31
31
 
32
32
  # Extract methylation data
33
- sparse_matrix = extract_methylation_data_from_bam(
33
+ result = extract_methylation_data_from_bam(
34
34
  input_bam="/path/to/sample.bam",
35
35
  genome_methylation_embedding=embedding,
36
36
  )
37
37
 
38
38
  # Save to file
39
39
  import scipy.sparse
40
- scipy.sparse.save_npz("output.npz", sparse_matrix)
40
+ scipy.sparse.save_npz("output.npz", result.matrix)
41
41
 
42
42
  Output Format:
43
43
  The output is a SciPy sparse COO matrix where:
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.2"
53
+ __version__ = "2.4"
@@ -38,6 +38,11 @@ from bam2tensor.functions import (
38
38
  detect_aligner,
39
39
  extract_methylation_data_from_bam,
40
40
  )
41
+ from bam2tensor.metadata import (
42
+ compute_cpg_index_crc32,
43
+ write_npz_metadata,
44
+ write_npz_tlen,
45
+ )
41
46
  from bam2tensor.reference import (
42
47
  KNOWN_GENOMES,
43
48
  download_reference as download_reference_fn,
@@ -393,10 +398,12 @@ def main(
393
398
  verbose=verbose,
394
399
  )
395
400
  n_chroms = len(genome_methylation_embedding.cpg_sites_dict)
401
+ cpg_crc32 = compute_cpg_index_crc32(genome_methylation_embedding)
396
402
  print(
397
403
  f" Total CpG sites: {genome_methylation_embedding.total_cpg_sites:,}"
398
404
  f" across {n_chroms} chromosome(s)"
399
405
  )
406
+ print(f" CpG index CRC32: {cpg_crc32}")
400
407
  print(f" Index loaded in {_format_elapsed(time.time() - time_start)}")
401
408
 
402
409
  # ── Discover BAM files ──────────────────────────────────────────────
@@ -437,7 +444,7 @@ def main(
437
444
  # Extract
438
445
  print(" Extracting methylation data...")
439
446
  try:
440
- methylation_data_coo = extract_methylation_data_from_bam(
447
+ extraction_result = extract_methylation_data_from_bam(
441
448
  input_bam=input_bam,
442
449
  genome_methylation_embedding=genome_methylation_embedding,
443
450
  quality_limit=quality_limit,
@@ -450,16 +457,27 @@ def main(
450
457
  continue
451
458
 
452
459
  # Matrix stats
453
- n_reads = methylation_data_coo.shape[0]
454
- n_cpgs = methylation_data_coo.shape[1]
455
- n_data = methylation_data_coo.nnz
460
+ n_reads = extraction_result.matrix.shape[0]
461
+ n_cpgs = extraction_result.matrix.shape[1]
462
+ n_data = extraction_result.matrix.nnz
456
463
  print(
457
464
  f" Result: {n_reads:,} reads x {n_cpgs:,} CpG sites"
458
465
  f" ({n_data:,} data points)"
459
466
  )
460
467
 
461
468
  # Save
462
- scipy.sparse.save_npz(output_file, methylation_data_coo, compressed=True)
469
+ scipy.sparse.save_npz(output_file, extraction_result.matrix, compressed=True)
470
+ write_npz_tlen(output_file, extraction_result.tlen)
471
+ write_npz_metadata(
472
+ output_file,
473
+ {
474
+ "bam2tensor_version": __version__,
475
+ "genome_name": genome_name,
476
+ "expected_chromosomes": chrom_list,
477
+ "total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
478
+ "cpg_index_crc32": cpg_crc32,
479
+ },
480
+ )
463
481
  print(f" Output: {output_file}")
464
482
  print(f" Time: {_format_elapsed(time.time() - time_bam)}")
465
483
 
@@ -39,15 +39,18 @@ Example:
39
39
  ... )
40
40
  >>>
41
41
  >>> # Extract methylation data
42
- >>> matrix = extract_methylation_data_from_bam(
42
+ >>> result = extract_methylation_data_from_bam(
43
43
  ... input_bam="sample.bam",
44
44
  ... genome_methylation_embedding=embedding,
45
45
  ... quality_limit=20,
46
46
  ... )
47
47
  >>>
48
- >>> print(f"Extracted {matrix.shape[0]} reads, {matrix.nnz} data points")
48
+ >>> print(f"Extracted {result.matrix.shape[0]} reads, {result.matrix.nnz} data points")
49
49
  """
50
50
 
51
+ from typing import NamedTuple
52
+
53
+ import numpy as np
51
54
  import scipy.sparse
52
55
  import pysam
53
56
  import bisect
@@ -55,6 +58,23 @@ import bisect
55
58
  from tqdm import tqdm
56
59
  from bam2tensor.embedding import GenomeMethylationEmbedding
57
60
 
61
+
62
+ class ExtractionResult(NamedTuple):
63
+ """Result of methylation extraction from a BAM file.
64
+
65
+ Attributes:
66
+ matrix: Sparse COO matrix of shape (n_reads, n_cpg_sites) with
67
+ methylation states: 1 (methylated), 0 (unmethylated), -1
68
+ (no data).
69
+ tlen: 1-D numpy array of shape (n_reads,) containing the signed
70
+ template length (TLEN from BAM) for each read. 0 for
71
+ single-end reads or reads with unmapped mates.
72
+ """
73
+
74
+ matrix: scipy.sparse.coo_matrix
75
+ tlen: np.ndarray
76
+
77
+
58
78
  # BAM flag bits for reads to skip: duplicate (0x400), qcfail (0x200),
59
79
  # secondary (0x100), supplementary (0x800).
60
80
  _SKIP_FLAGS = 0x400 | 0x200 | 0x100 | 0x800
@@ -180,7 +200,7 @@ def extract_methylation_data_from_bam(
180
200
  quality_limit: int = 20,
181
201
  verbose: bool = False,
182
202
  debug: bool = False,
183
- ) -> scipy.sparse.coo_matrix:
203
+ ) -> ExtractionResult:
184
204
  """Extract read-level CpG methylation data from a BAM file.
185
205
 
186
206
  Parses a bisulfite-sequencing or EM-seq BAM file and extracts methylation
@@ -225,14 +245,19 @@ def extract_methylation_data_from_bam(
225
245
  only processed once. Significantly slower.
226
246
 
227
247
  Returns:
228
- A scipy.sparse.coo_matrix with shape (n_reads, n_cpg_sites) where:
229
- - n_reads is the number of reads that passed filters and covered
230
- at least one CpG site
231
- - n_cpg_sites is genome_methylation_embedding.total_cpg_sites
232
- - Values are: 1 (methylated), 0 (unmethylated), -1 (no data)
233
-
234
- The matrix uses COO format for efficient construction. Convert to
235
- CSR (tocsr()) for row slicing or CSC (tocsc()) for column slicing.
248
+ An ExtractionResult named tuple with two fields:
249
+
250
+ - **matrix**: A scipy.sparse.coo_matrix with shape
251
+ (n_reads, n_cpg_sites) where n_reads is the number of reads
252
+ that passed filters and covered at least one CpG site,
253
+ n_cpg_sites is genome_methylation_embedding.total_cpg_sites,
254
+ and values are: 1 (methylated), 0 (unmethylated), -1 (no data).
255
+ The matrix uses COO format for efficient construction; convert
256
+ to CSR (tocsr()) for row slicing or CSC (tocsc()) for column
257
+ slicing.
258
+ - **tlen**: A 1-D numpy int32 array of shape (n_reads,) containing
259
+ the signed template length (BAM TLEN field) for each read.
260
+ Values are 0 for single-end reads or reads with unmapped mates.
236
261
 
237
262
  Raises:
238
263
  FileNotFoundError: If the BAM file index (.bam.bai) is missing.
@@ -252,7 +277,7 @@ def extract_methylation_data_from_bam(
252
277
  ... )
253
278
  >>>
254
279
  >>> # Extract methylation data
255
- >>> matrix = extract_methylation_data_from_bam(
280
+ >>> result = extract_methylation_data_from_bam(
256
281
  ... input_bam="sample.bam",
257
282
  ... genome_methylation_embedding=embedding,
258
283
  ... quality_limit=30, # Stricter quality filter
@@ -260,12 +285,12 @@ def extract_methylation_data_from_bam(
260
285
  ... )
261
286
  >>>
262
287
  >>> # Analyze results
263
- >>> print(f"Reads with CpG data: {matrix.shape[0]:,}")
264
- >>> print(f"Total CpG sites: {matrix.shape[1]:,}")
265
- >>> print(f"Data points: {matrix.nnz:,}")
288
+ >>> print(f"Reads with CpG data: {result.matrix.shape[0]:,}")
289
+ >>> print(f"Total CpG sites: {result.matrix.shape[1]:,}")
290
+ >>> print(f"Data points: {result.matrix.nnz:,}")
266
291
  >>>
267
292
  >>> # Save to file
268
- >>> scipy.sparse.save_npz("sample.methylation.npz", matrix)
293
+ >>> scipy.sparse.save_npz("sample.methylation.npz", result.matrix)
269
294
 
270
295
  Note:
271
296
  The function processes chromosomes in the order they appear in
@@ -304,6 +329,7 @@ def extract_methylation_data_from_bam(
304
329
  coo_row = [] # Read number
305
330
  coo_col = [] # CpG number (embedding)
306
331
  coo_data = [] # Methylation state
332
+ tlen_list: list[int] = [] # Template length (TLEN) per read
307
333
 
308
334
  # This is slow, but we only run it once and store the results for later
309
335
  for chrom, cpg_sites in tqdm(
@@ -398,6 +424,7 @@ def extract_methylation_data_from_bam(
398
424
  ), "Read seen twice!"
399
425
  debug_read_name_to_row_number[read_key] = read_number
400
426
  print("************************************************\n")
427
+ tlen_list.append(aligned_segment.template_length)
401
428
  read_number += 1
402
429
 
403
430
  continue # Skip the Biscuit/bwameth/gem3 path below
@@ -526,6 +553,7 @@ def extract_methylation_data_from_bam(
526
553
  f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
527
554
  )
528
555
 
556
+ tlen_list.append(aligned_segment.template_length)
529
557
  read_number += 1
530
558
 
531
559
  if debug:
@@ -557,6 +585,6 @@ def extract_methylation_data_from_bam(
557
585
  # Number of columns = number of CpG sites
558
586
  assert sparse_matrix.shape[1] == genome_methylation_embedding.total_cpg_sites
559
587
 
560
- return sparse_matrix
588
+ tlen_array = np.array(tlen_list, dtype=np.int32)
561
589
 
562
- # return scipy.sparse.coo_matrix((coo_data, (coo_row, coo_col)), shape=(len(read_name_to_row_number) + 1, total_cpg_sites))
590
+ return ExtractionResult(matrix=sparse_matrix, tlen=tlen_array)