bam2tensor 2.2__tar.gz → 2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {bam2tensor-2.2 → bam2tensor-2.3}/CLAUDE.md +10 -3
  2. {bam2tensor-2.2 → bam2tensor-2.3}/PKG-INFO +67 -8
  3. {bam2tensor-2.2 → bam2tensor-2.3}/README.md +66 -7
  4. {bam2tensor-2.2 → bam2tensor-2.3}/pyproject.toml +2 -1
  5. {bam2tensor-2.2 → bam2tensor-2.3}/src/bam2tensor/__init__.py +1 -1
  6. {bam2tensor-2.2 → bam2tensor-2.3}/src/bam2tensor/__main__.py +13 -0
  7. bam2tensor-2.3/src/bam2tensor/inspect.py +143 -0
  8. bam2tensor-2.3/src/bam2tensor/metadata.py +114 -0
  9. bam2tensor-2.3/tests/test_inspect.py +146 -0
  10. bam2tensor-2.3/tests/test_metadata.py +162 -0
  11. {bam2tensor-2.2 → bam2tensor-2.3}/uv.lock +1 -1
  12. {bam2tensor-2.2 → bam2tensor-2.3}/.darglint +0 -0
  13. {bam2tensor-2.2 → bam2tensor-2.3}/.editorconfig +0 -0
  14. {bam2tensor-2.2 → bam2tensor-2.3}/.gitattributes +0 -0
  15. {bam2tensor-2.2 → bam2tensor-2.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  16. {bam2tensor-2.2 → bam2tensor-2.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  17. {bam2tensor-2.2 → bam2tensor-2.3}/.github/actions/setup-env/action.yml +0 -0
  18. {bam2tensor-2.2 → bam2tensor-2.3}/.github/dependabot.yml +0 -0
  19. {bam2tensor-2.2 → bam2tensor-2.3}/.github/labels.yml +0 -0
  20. {bam2tensor-2.2 → bam2tensor-2.3}/.github/release-drafter.yml +0 -0
  21. {bam2tensor-2.2 → bam2tensor-2.3}/.github/workflows/constraints.txt +0 -0
  22. {bam2tensor-2.2 → bam2tensor-2.3}/.github/workflows/docs.yml +0 -0
  23. {bam2tensor-2.2 → bam2tensor-2.3}/.github/workflows/labeler.yml +0 -0
  24. {bam2tensor-2.2 → bam2tensor-2.3}/.github/workflows/release.yml +0 -0
  25. {bam2tensor-2.2 → bam2tensor-2.3}/.github/workflows/tests.yml +0 -0
  26. {bam2tensor-2.2 → bam2tensor-2.3}/.gitignore +0 -0
  27. {bam2tensor-2.2 → bam2tensor-2.3}/.pre-commit-config.yaml +0 -0
  28. {bam2tensor-2.2 → bam2tensor-2.3}/CONTRIBUTING.md +0 -0
  29. {bam2tensor-2.2 → bam2tensor-2.3}/LICENSE +0 -0
  30. {bam2tensor-2.2 → bam2tensor-2.3}/SECURITY.md +0 -0
  31. {bam2tensor-2.2 → bam2tensor-2.3}/docs/Makefile +0 -0
  32. {bam2tensor-2.2 → bam2tensor-2.3}/docs/conf.py +0 -0
  33. {bam2tensor-2.2 → bam2tensor-2.3}/docs/contributing.md +0 -0
  34. {bam2tensor-2.2 → bam2tensor-2.3}/docs/index.md +0 -0
  35. {bam2tensor-2.2 → bam2tensor-2.3}/docs/license.md +0 -0
  36. {bam2tensor-2.2 → bam2tensor-2.3}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  37. {bam2tensor-2.2 → bam2tensor-2.3}/docs/logo/bam2tensor-logo.afdesign +0 -0
  38. {bam2tensor-2.2 → bam2tensor-2.3}/docs/logo/bam2tensor-logo.png +0 -0
  39. {bam2tensor-2.2 → bam2tensor-2.3}/docs/make.bat +0 -0
  40. {bam2tensor-2.2 → bam2tensor-2.3}/docs/nano-banana-overview-shrunk.png +0 -0
  41. {bam2tensor-2.2 → bam2tensor-2.3}/docs/reference.md +0 -0
  42. {bam2tensor-2.2 → bam2tensor-2.3}/docs/templates/package.rst_t +0 -0
  43. {bam2tensor-2.2 → bam2tensor-2.3}/noxfile.py +0 -0
  44. {bam2tensor-2.2 → bam2tensor-2.3}/src/bam2tensor/embedding.py +0 -0
  45. {bam2tensor-2.2 → bam2tensor-2.3}/src/bam2tensor/functions.py +0 -0
  46. {bam2tensor-2.2 → bam2tensor-2.3}/src/bam2tensor/py.typed +0 -0
  47. {bam2tensor-2.2 → bam2tensor-2.3}/src/bam2tensor/reference.py +0 -0
  48. {bam2tensor-2.2 → bam2tensor-2.3}/tests/__init__.py +0 -0
  49. {bam2tensor-2.2 → bam2tensor-2.3}/tests/test_duplication.py +0 -0
  50. {bam2tensor-2.2 → bam2tensor-2.3}/tests/test_embedding.py +0 -0
  51. {bam2tensor-2.2 → bam2tensor-2.3}/tests/test_fasta.fa +0 -0
  52. {bam2tensor-2.2 → bam2tensor-2.3}/tests/test_functions.py +0 -0
  53. {bam2tensor-2.2 → bam2tensor-2.3}/tests/test_main.py +0 -0
  54. {bam2tensor-2.2 → bam2tensor-2.3}/tests/test_reference.py +0 -0
@@ -40,10 +40,12 @@ uv run mypy src
40
40
 
41
41
  ```
42
42
  src/bam2tensor/
43
- __init__.py # Package version (2.1)
44
- __main__.py # Click CLI entry point
43
+ __init__.py # Package version (2.3)
44
+ __main__.py # Click CLI entry point (bam2tensor command)
45
+ inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
45
46
  embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
46
47
  functions.py # Core extraction: extract_methylation_data_from_bam()
48
+ metadata.py # .npz metadata read/write (provenance info in output files)
47
49
  reference.py # Reference genome download and caching utilities
48
50
 
49
51
  tests/
@@ -51,6 +53,8 @@ tests/
51
53
  test_functions.py # Core function tests
52
54
  test_embedding.py # Embedding class tests
53
55
  test_duplication.py # Read duplication bug tests
56
+ test_inspect.py # Inspect CLI tests
57
+ test_metadata.py # Metadata read/write/round-trip tests
54
58
  test_reference.py # Reference download/caching tests
55
59
  test.bam, test.bam.bai, test_fasta.fa # Test fixtures
56
60
  ```
@@ -110,8 +114,9 @@ xdoctest validates code examples in docstrings. Important rules:
110
114
  ### Data Structure
111
115
  - Output: scipy sparse COO matrix saved as .npz
112
116
  - Rows = unique reads (primary alignments)
113
- - Columns = CpG sites
117
+ - Columns = CpG sites (ordered by genomic position, determined by reference genome)
114
118
  - Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
119
+ - Each .npz file contains a `metadata.json` entry with provenance info (genome name, version, CpG index CRC32, expected chromosomes). Read via `bam2tensor.metadata.read_npz_metadata()`.
115
120
 
116
121
  ### Methylation Strand Detection
117
122
  - Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
@@ -144,6 +149,8 @@ xdoctest validates code examples in docstrings. Important rules:
144
149
  uv run bam2tensor --input-path input.bam --reference-fasta ref.fa
145
150
  # Or with auto-download:
146
151
  uv run bam2tensor --input-path input.bam --download-reference hg38
152
+ # Inspect an output file:
153
+ uv run bam2tensor-inspect output.methylation.npz
147
154
  ```
148
155
 
149
156
  ### Reference Genome Downloads
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.2
3
+ Version: 2.3
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -72,7 +72,9 @@ Description-Content-Type: text/markdown
72
72
  - [Custom Output Directory](#custom-output-directory)
73
73
  - [Using a Custom Genome](#using-a-custom-genome)
74
74
  - [Command-Line Options](#command-line-options)
75
+ - [Inspecting Output Files](#inspecting-output-files)
75
76
  - [Output Data Structure](#output-data-structure)
77
+ - [Embedded Metadata](#embedded-metadata)
76
78
  - [Loading Output Files](#loading-output-files)
77
79
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
78
80
  - [Working with Genomic Coordinates](#working-with-genomic-coordinates)
@@ -285,24 +287,81 @@ Options:
285
287
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
286
288
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
287
289
 
290
+ ## Inspecting Output Files
291
+
292
+ Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
293
+
294
+ ```bash
295
+ $ bam2tensor-inspect sample.methylation.npz
296
+ sample.methylation.npz
297
+ Genome: hg38
298
+ Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
299
+ Reads: 1,423,891
300
+ CpG sites: 28,217,448
301
+ Data points: 12,847,322 (sparsity: 99.97%)
302
+ CpG index CRC32: a1b2c3d4
303
+ bam2tensor: v2.3
304
+ File size: 14.2 MB
305
+ ```
306
+
307
+ You can pass multiple files at once:
308
+
309
+ ```bash
310
+ $ bam2tensor-inspect *.methylation.npz
311
+ ```
312
+
313
+ This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
314
+
288
315
  ## Output Data Structure
289
316
 
290
- bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
317
+ bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
291
318
 
292
319
  | Dimension | Represents |
293
320
  |-----------|------------|
294
- | Rows | Unique reads (primary alignments that pass quality filters) |
295
- | Columns | CpG sites (ordered by genomic position across all chromosomes) |
321
+ | **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
322
+ | **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
323
+
324
+ The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
296
325
 
297
326
  ### Methylation State Values
298
327
 
299
328
  | Value | Meaning |
300
329
  |-------|---------|
301
- | `1` | Methylated (cytosine preserved as C) |
302
- | `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
303
- | `-1` | No data (indel, SNV, or site not covered by read) |
330
+ | `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
331
+ | `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
332
+ | `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
333
+
334
+ Note: The matrix uses SciPy's COO sparse format, which stores every explicitly recorded entry, including explicit zeros. Unmethylated sites (value `0`) **are** therefore stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
304
335
 
305
- Note: Sparse matrices only store non-zero values. Positions with value `0` (unmethylated) are stored, but positions not covered by a read are simply absent from the matrix.
336
+ ### Embedded Metadata
337
+
338
+ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
339
+
340
+ | Field | Description |
341
+ |-------|-------------|
342
+ | `bam2tensor_version` | Version of bam2tensor that produced the file |
343
+ | `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
344
+ | `expected_chromosomes` | List of chromosomes included in the column mapping |
345
+ | `total_cpg_sites` | Total number of CpG columns in the matrix |
346
+ | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
347
+
348
+ This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
349
+
350
+ ```python
351
+ from bam2tensor.metadata import read_npz_metadata
352
+
353
+ meta = read_npz_metadata("sample.methylation.npz")
354
+ if meta is not None:
355
+ print(f"Genome: {meta['genome_name']}")
356
+ print(f"CpG sites: {meta['total_cpg_sites']:,}")
357
+ print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
358
+ ```
359
+
360
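+ The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files with different CRC32 values have different column semantics; two files with the same CRC32 almost certainly share them (same chromosomes, same CpG positions, same order), so their matrices can be directly stacked or compared. Because CRC32 is only a 32-bit checksum, a collision cannot be strictly ruled out; compare `genome_name`, `expected_chromosomes` and `total_cpg_sites` as well when it matters. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives: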
361
+
362
+ ```bash
363
+ unzip -p sample.methylation.npz metadata.json | python -m json.tool
364
+ ```
306
365
 
307
366
  ### Loading Output Files
308
367
 
@@ -39,7 +39,9 @@
39
39
  - [Custom Output Directory](#custom-output-directory)
40
40
  - [Using a Custom Genome](#using-a-custom-genome)
41
41
  - [Command-Line Options](#command-line-options)
42
+ - [Inspecting Output Files](#inspecting-output-files)
42
43
  - [Output Data Structure](#output-data-structure)
44
+ - [Embedded Metadata](#embedded-metadata)
43
45
  - [Loading Output Files](#loading-output-files)
44
46
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
45
47
  - [Working with Genomic Coordinates](#working-with-genomic-coordinates)
@@ -252,24 +254,81 @@ Options:
252
254
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
253
255
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
254
256
 
257
+ ## Inspecting Output Files
258
+
259
+ Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
260
+
261
+ ```bash
262
+ $ bam2tensor-inspect sample.methylation.npz
263
+ sample.methylation.npz
264
+ Genome: hg38
265
+ Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
266
+ Reads: 1,423,891
267
+ CpG sites: 28,217,448
268
+ Data points: 12,847,322 (sparsity: 99.97%)
269
+ CpG index CRC32: a1b2c3d4
270
+ bam2tensor: v2.3
271
+ File size: 14.2 MB
272
+ ```
273
+
274
+ You can pass multiple files at once:
275
+
276
+ ```bash
277
+ $ bam2tensor-inspect *.methylation.npz
278
+ ```
279
+
280
+ This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
281
+
255
282
  ## Output Data Structure
256
283
 
257
- bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
284
+ bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
258
285
 
259
286
  | Dimension | Represents |
260
287
  |-----------|------------|
261
- | Rows | Unique reads (primary alignments that pass quality filters) |
262
- | Columns | CpG sites (ordered by genomic position across all chromosomes) |
288
+ | **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
289
+ | **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
290
+
291
+ The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
263
292
 
264
293
  ### Methylation State Values
265
294
 
266
295
  | Value | Meaning |
267
296
  |-------|---------|
268
- | `1` | Methylated (cytosine preserved as C) |
269
- | `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
270
- | `-1` | No data (indel, SNV, or site not covered by read) |
297
+ | `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
298
+ | `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
299
+ | `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
300
+
301
+ Note: The matrix uses SciPy's COO sparse format, which stores every explicitly recorded entry, including explicit zeros. Unmethylated sites (value `0`) **are** therefore stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
271
302
 
272
- Note: Sparse matrices only store non-zero values. Positions with value `0` (unmethylated) are stored, but positions not covered by a read are simply absent from the matrix.
303
+ ### Embedded Metadata
304
+
305
+ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
306
+
307
+ | Field | Description |
308
+ |-------|-------------|
309
+ | `bam2tensor_version` | Version of bam2tensor that produced the file |
310
+ | `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
311
+ | `expected_chromosomes` | List of chromosomes included in the column mapping |
312
+ | `total_cpg_sites` | Total number of CpG columns in the matrix |
313
+ | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
314
+
315
+ This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
316
+
317
+ ```python
318
+ from bam2tensor.metadata import read_npz_metadata
319
+
320
+ meta = read_npz_metadata("sample.methylation.npz")
321
+ if meta is not None:
322
+ print(f"Genome: {meta['genome_name']}")
323
+ print(f"CpG sites: {meta['total_cpg_sites']:,}")
324
+ print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
325
+ ```
326
+
327
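+ The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files with different CRC32 values have different column semantics; two files with the same CRC32 almost certainly share them (same chromosomes, same CpG positions, same order), so their matrices can be directly stacked or compared. Because CRC32 is only a 32-bit checksum, a collision cannot be strictly ruled out; compare `genome_name`, `expected_chromosomes` and `total_cpg_sites` as well when it matters. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives: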
328
+
329
+ ```bash
330
+ unzip -p sample.methylation.npz metadata.json | python -m json.tool
331
+ ```
273
332
 
274
333
  ### Loading Output Files
275
334
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.2"
3
+ version = "2.3"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -38,6 +38,7 @@ Changelog = "https://github.com/mcwdsi/bam2tensor/releases"
38
38
 
39
39
  [project.scripts]
40
40
  bam2tensor = "bam2tensor.__main__:main"
41
+ bam2tensor-inspect = "bam2tensor.inspect:main"
41
42
 
42
43
  [dependency-groups]
43
44
  dev = [
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.2"
53
+ __version__ = "2.3"
@@ -38,6 +38,7 @@ from bam2tensor.functions import (
38
38
  detect_aligner,
39
39
  extract_methylation_data_from_bam,
40
40
  )
41
+ from bam2tensor.metadata import compute_cpg_index_crc32, write_npz_metadata
41
42
  from bam2tensor.reference import (
42
43
  KNOWN_GENOMES,
43
44
  download_reference as download_reference_fn,
@@ -393,10 +394,12 @@ def main(
393
394
  verbose=verbose,
394
395
  )
395
396
  n_chroms = len(genome_methylation_embedding.cpg_sites_dict)
397
+ cpg_crc32 = compute_cpg_index_crc32(genome_methylation_embedding)
396
398
  print(
397
399
  f" Total CpG sites: {genome_methylation_embedding.total_cpg_sites:,}"
398
400
  f" across {n_chroms} chromosome(s)"
399
401
  )
402
+ print(f" CpG index CRC32: {cpg_crc32}")
400
403
  print(f" Index loaded in {_format_elapsed(time.time() - time_start)}")
401
404
 
402
405
  # ── Discover BAM files ──────────────────────────────────────────────
@@ -460,6 +463,16 @@ def main(
460
463
 
461
464
  # Save
462
465
  scipy.sparse.save_npz(output_file, methylation_data_coo, compressed=True)
466
+ write_npz_metadata(
467
+ output_file,
468
+ {
469
+ "bam2tensor_version": __version__,
470
+ "genome_name": genome_name,
471
+ "expected_chromosomes": chrom_list,
472
+ "total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
473
+ "cpg_index_crc32": cpg_crc32,
474
+ },
475
+ )
463
476
  print(f" Output: {output_file}")
464
477
  print(f" Time: {_format_elapsed(time.time() - time_bam)}")
465
478
 
@@ -0,0 +1,143 @@
1
+ """Inspect command for bam2tensor .npz output files.
2
+
3
+ Provides a CLI entry point (``bam2tensor-inspect``) that prints a summary
4
+ of one or more ``.methylation.npz`` files, including matrix dimensions,
5
+ sparsity, file size, and embedded provenance metadata.
6
+
7
+ Example:
8
+ Inspect a single file::
9
+
10
+ $ bam2tensor-inspect sample.methylation.npz
11
+
12
+ Inspect multiple files::
13
+
14
+ $ bam2tensor-inspect *.methylation.npz
15
+ """
16
+
17
+ import os
18
+ import sys
19
+
20
+ import click
21
+ import numpy as np
22
+ import scipy.sparse
23
+
24
+ from bam2tensor.metadata import read_npz_metadata
25
+
26
+
27
+ def _format_size(nbytes: int) -> str:
28
+ """Format a byte count as a human-readable string.
29
+
30
+ Args:
31
+ nbytes: Number of bytes.
32
+
33
+ Returns:
34
+ A string such as ``"14.2 MB"`` or ``"832 bytes"``.
35
+
36
+ Example:
37
+ >>> _format_size(14_200_000)
38
+ '13.5 MB'
39
+
40
+ >>> _format_size(500)
41
+ '500 bytes'
42
+
43
+ >>> _format_size(2048)
44
+ '2.0 KB'
45
+ """
46
+ if nbytes < 1024:
47
+ return f"{nbytes} bytes"
48
+ elif nbytes < 1024 * 1024:
49
+ return f"{nbytes / 1024:.1f} KB"
50
+ elif nbytes < 1024 * 1024 * 1024:
51
+ return f"{nbytes / (1024 * 1024):.1f} MB"
52
+ else:
53
+ return f"{nbytes / (1024 * 1024 * 1024):.1f} GB"
54
+
55
+
56
def inspect_npz(npz_path: str) -> None:
    """Print a human-readable summary of a .methylation.npz file.

    Loads the sparse matrix with ``scipy.sparse.load_npz`` and any embedded
    metadata with ``read_npz_metadata``, then prints the file name, the
    genome and chromosome list (when recorded), matrix dimensions, the
    number of stored entries, sparsity, the CpG index checksum, the
    producing bam2tensor version, and the file size.

    Args:
        npz_path: Path to the ``.npz`` file to inspect.

    Example:
        >>> # xdoctest: +SKIP
        >>> inspect_npz("sample.methylation.npz")
        sample.methylation.npz
         Reads: 1,423
         CpG sites: 28,217,448
        ...
    """
    # Load matrix
    matrix = scipy.sparse.load_npz(npz_path)
    n_reads, n_cpgs = matrix.shape
    n_data = matrix.nnz
    # Force a 64-bit accumulator: with the platform default integer (32-bit
    # on some NumPy builds) reads x CpG sites silently wraps around for
    # realistic genome-scale shapes.
    total_cells = int(np.prod(matrix.shape, dtype=np.int64))
    sparsity = 1 - (n_data / total_cells) if total_cells > 0 else 0.0
    file_size = os.path.getsize(npz_path)

    # Load metadata (None when the archive has no metadata.json entry)
    meta = read_npz_metadata(npz_path)

    # Print summary
    print(os.path.basename(npz_path))

    if meta and "genome_name" in meta:
        print(f" Genome: {meta['genome_name']}")
    if meta and "expected_chromosomes" in meta:
        chroms = meta["expected_chromosomes"]
        n_chr = len(chroms)
        if n_chr <= 4:
            chrom_display = ", ".join(chroms)
        else:
            # Long lists are abbreviated to the first two and last two names.
            chrom_display = (
                f"{n_chr} ({chroms[0]}, {chroms[1]}, "
                f"... {chroms[-2]}, {chroms[-1]})"
            )
        print(f" Chromosomes: {chrom_display}")

    print(f" Reads: {n_reads:,}")
    print(f" CpG sites: {n_cpgs:,}")
    print(f" Data points: {n_data:,} (sparsity: {sparsity:.2%})")

    if meta and "cpg_index_crc32" in meta:
        print(f" CpG index CRC32: {meta['cpg_index_crc32']}")
    if meta and "bam2tensor_version" in meta:
        print(f" bam2tensor: v{meta['bam2tensor_version']}")
    elif meta is None:
        print(" Metadata: none (produced by older bam2tensor)")

    print(f" File size: {_format_size(file_size)}")
114
+
115
+
116
@click.command(help="Inspect bam2tensor .methylation.npz output files.")
@click.argument(
    "files",
    nargs=-1,
    required=True,
    type=click.Path(exists=True, dir_okay=False, readable=True),
)
def main(files: tuple[str, ...]) -> None:
    """Inspect one or more .methylation.npz files.

    Prints a summary of each file including matrix dimensions, sparsity,
    embedded metadata, and file size.  A file that cannot be inspected is
    reported on stderr and the remaining files are still processed; the
    command then exits with status 1 so callers and shell pipelines can
    detect the failure.

    Args:
        files: One or more paths to ``.methylation.npz`` files.
    """
    n_failed = 0
    for i, path in enumerate(files):
        if i > 0:
            print()
        try:
            inspect_npz(path)
        except Exception as e:
            # Best effort: keep going so one bad file does not hide the rest,
            # but remember the failure for the exit status.
            n_failed += 1
            print(f"{os.path.basename(path)}", file=sys.stderr)
            print(f" Error: {e}", file=sys.stderr)

    if n_failed:
        sys.exit(1)
140
+
141
+
142
+ if __name__ == "__main__":
143
+ main() # pylint: disable=no-value-for-parameter
@@ -0,0 +1,114 @@
1
+ """Metadata utilities for bam2tensor .npz output files.
2
+
3
+ This module provides functions to embed and retrieve provenance metadata
4
+ inside the ``.methylation.npz`` files produced by bam2tensor. The metadata
5
+ is stored as a ``metadata.json`` entry appended to the ZIP archive that
6
+ underlies every ``.npz`` file. ``scipy.sparse.load_npz`` silently ignores
7
+ this extra entry, so existing downstream code is unaffected.
8
+
9
+ Example:
10
+ Writing metadata (done automatically by the CLI)::
11
+
12
+ >>> # xdoctest: +SKIP
13
+ >>> from bam2tensor.metadata import write_npz_metadata, read_npz_metadata
14
+ >>> write_npz_metadata("sample.methylation.npz", {
15
+ ... "bam2tensor_version": "2.2",
16
+ ... "genome_name": "hg38",
17
+ ... })
18
+ >>> read_npz_metadata("sample.methylation.npz")
19
+ {'bam2tensor_version': '2.2', 'genome_name': 'hg38'}
20
+
21
+ Reading metadata from an existing file::
22
+
23
+ >>> # xdoctest: +SKIP
24
+ >>> meta = read_npz_metadata("sample.methylation.npz")
25
+ >>> if meta is not None:
26
+ ... print(meta["genome_name"])
27
+ hg38
28
+ """
29
+
30
+ import json
31
+ import zipfile
32
+ import zlib
33
+
34
+ from bam2tensor.embedding import GenomeMethylationEmbedding
35
+
36
+
37
def compute_cpg_index_crc32(embedding: GenomeMethylationEmbedding) -> str:
    """Compute a CRC32 checksum over the CpG site positions in an embedding.

    The checksum covers the column mapping of the sparse matrix: which
    chromosomes are included, in what order, and which positions are listed
    as CpG sites for each chromosome.  Embeddings with different checksums
    have different column mappings.  Equal checksums indicate the same
    mapping, subject to the usual collision caveat of a 32-bit checksum.

    The hashed byte stream is ``chrom\\tpos1,pos2,...`` per chromosome,
    joined by ``\\n``.  It is fed to ``zlib.crc32`` one chromosome at a time
    so the full-genome payload is never held in memory at once; the result
    is identical to hashing the whole payload in one call.

    Args:
        embedding: A fully initialised GenomeMethylationEmbedding whose
            ``cpg_sites_dict`` is populated.  Chromosomes missing from
            ``cpg_sites_dict`` contribute an empty position list.

    Returns:
        The CRC32 checksum as an 8-character lowercase hexadecimal string.

    Example:
        >>> # xdoctest: +SKIP
        >>> from bam2tensor.embedding import GenomeMethylationEmbedding
        >>> emb = GenomeMethylationEmbedding(
        ...     genome_name="hg38",
        ...     expected_chromosomes=["chr1"],
        ...     fasta_source="ref.fa",
        ... )
        >>> compute_cpg_index_crc32(emb)
        'a1b2c3d4'
    """
    crc = 0
    for i, chrom in enumerate(embedding.expected_chromosomes):
        positions = embedding.cpg_sites_dict.get(chrom, [])
        line = chrom + "\t" + ",".join(str(p) for p in positions)
        if i > 0:
            # Separator between chromosome lines (none after the last one).
            crc = zlib.crc32(b"\n", crc)
        crc = zlib.crc32(line.encode("utf-8"), crc)
    return format(crc & 0xFFFFFFFF, "08x")
71
+
72
+
73
def write_npz_metadata(
    npz_path: str,
    metadata: dict,
) -> None:
    """Embed a ``metadata.json`` entry in a ``.npz`` file.

    The metadata is serialised as indented JSON.  When the archive has no
    ``metadata.json`` member yet, the entry is simply appended.  When one
    already exists, the archive is rewritten to a temporary file in the
    same directory with the old entry replaced, then moved over the
    original, so the archive never ends up with duplicate
    ``metadata.json`` members.

    Args:
        npz_path: Path to the ``.npz`` file (expected to already exist).
        metadata: A JSON-serialisable dictionary of metadata to embed.

    Example:
        >>> # xdoctest: +SKIP
        >>> write_npz_metadata("out.npz", {"genome_name": "hg38"})
    """
    import os
    import shutil
    import tempfile

    payload = json.dumps(metadata, indent=2)

    # Common case: no metadata yet, so append in place.
    with zipfile.ZipFile(npz_path, "a") as zf:
        if "metadata.json" not in zf.namelist():
            zf.writestr("metadata.json", payload)
            return

    # ZIP members cannot be replaced in place; appending again would leave a
    # duplicate entry.  Rebuild the archive next to the original instead.
    target = os.fspath(npz_path)
    fd, tmp_path = tempfile.mkstemp(
        dir=os.path.dirname(os.path.abspath(target)), suffix=".npz.tmp"
    )
    os.close(fd)
    try:
        with zipfile.ZipFile(target, "r") as src:
            with zipfile.ZipFile(tmp_path, "w") as dst:
                for info in src.infolist():
                    if info.filename != "metadata.json":
                        # Passing the ZipInfo keeps each member's compression.
                        dst.writestr(info, src.read(info))
                dst.writestr("metadata.json", payload)
        # mkstemp creates the file owner-only; keep the original's mode.
        shutil.copymode(target, tmp_path)
        os.replace(tmp_path, target)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
93
+
94
+
95
+ def read_npz_metadata(npz_path: str) -> dict | None:
96
+ """Read the ``metadata.json`` entry from a ``.npz`` file.
97
+
98
+ Args:
99
+ npz_path: Path to the ``.npz`` file.
100
+
101
+ Returns:
102
+ The metadata dictionary, or ``None`` if the file does not contain
103
+ a ``metadata.json`` entry (e.g. files produced by older versions).
104
+
105
+ Example:
106
+ >>> # xdoctest: +SKIP
107
+ >>> meta = read_npz_metadata("sample.methylation.npz")
108
+ >>> meta["genome_name"]
109
+ 'hg38'
110
+ """
111
+ with zipfile.ZipFile(npz_path, "r") as zf:
112
+ if "metadata.json" in zf.namelist():
113
+ return json.loads(zf.read("metadata.json"))
114
+ return None
@@ -0,0 +1,146 @@
1
+ """Test cases for the inspect module."""
2
+
3
+ import shutil
4
+
5
+ import scipy.sparse
6
+ from click.testing import CliRunner
7
+
8
+ from bam2tensor import __main__
9
+ from bam2tensor.inspect import _format_size
10
+ from bam2tensor.inspect import main as inspect_main
11
+ from bam2tensor.metadata import write_npz_metadata
12
+
13
+
14
def test_inspect_with_metadata(tmp_path) -> None:
    """Inspect prints metadata fields and matrix dimensions when present."""
    import re

    npz_path = str(tmp_path / "sample.methylation.npz")
    matrix = scipy.sparse.coo_matrix(([1, 0, -1], ([0, 0, 1], [0, 2, 1])), shape=(2, 5))
    scipy.sparse.save_npz(npz_path, matrix)
    write_npz_metadata(
        npz_path,
        {
            "bam2tensor_version": "2.3",
            "genome_name": "hg38",
            "expected_chromosomes": ["chr1", "chr2", "chrX", "chrY"],
            "total_cpg_sites": 5,
            "cpg_index_crc32": "deadbeef",
        },
    )

    runner = CliRunner()
    result = runner.invoke(inspect_main, [npz_path])
    assert result.exit_code == 0
    assert "hg38" in result.output
    # Anchor the counts to their labels: a bare "2" would also match
    # "v2.3" and "chr2" and so prove nothing about the read count.
    assert re.search(r"Reads:[ \t]+2[ \t]*$", result.output, re.MULTILINE)
    assert re.search(r"CpG sites:[ \t]+5[ \t]*$", result.output, re.MULTILINE)
    assert "deadbeef" in result.output
    assert "v2.3" in result.output
    assert "chr1, chr2, chrX, chrY" in result.output
40
+
41
+
42
def test_inspect_without_metadata(tmp_path) -> None:
    """Inspect works on files without metadata (older bam2tensor)."""
    npz_path = str(tmp_path / "old.methylation.npz")
    matrix = scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 100))
    scipy.sparse.save_npz(npz_path, matrix)

    runner = CliRunner()
    result = runner.invoke(inspect_main, [npz_path])
    assert result.exit_code == 0
    assert "Reads:" in result.output
    assert "older bam2tensor" in result.output
    # Should NOT have genome or CRC lines
    assert "Genome:" not in result.output
    assert "CpG index CRC32:" not in result.output
55
+
56
+
57
def test_inspect_multiple_files(tmp_path) -> None:
    """Inspect handles multiple files, in order, with a blank line between."""
    paths = []
    for name in ["a.npz", "b.npz"]:
        p = str(tmp_path / name)
        matrix = scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 10))
        scipy.sparse.save_npz(p, matrix)
        paths.append(p)

    runner = CliRunner()
    result = runner.invoke(inspect_main, paths)
    assert result.exit_code == 0
    assert "a.npz" in result.output
    assert "b.npz" in result.output
    # Files are reported in argument order ...
    assert result.output.index("a.npz") < result.output.index("b.npz")
    # ... and separated by an empty line.
    assert "\n\n" in result.output
71
+
72
+
73
def test_inspect_many_chromosomes(tmp_path) -> None:
    """Chromosome list is summarised when > 4 entries."""
    npz_path = str(tmp_path / "matrix.npz")
    matrix = scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 10))
    scipy.sparse.save_npz(npz_path, matrix)
    chroms = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"]
    write_npz_metadata(
        npz_path,
        {
            "expected_chromosomes": chroms,
            "genome_name": "hg38",
        },
    )

    runner = CliRunner()
    result = runner.invoke(inspect_main, [npz_path])
    assert result.exit_code == 0
    assert "24 (" in result.output
    assert "chr1, chr2" in result.output
    assert "chrY" in result.output
    # Only the first two and last two names are shown; middle ones are elided.
    assert "chr10" not in result.output
91
+
92
+
93
def test_inspect_end_to_end(tmp_path) -> None:
    """Full pipeline: bam2tensor produces file, bam2tensor-inspect reads it."""
    import re

    from bam2tensor import __version__

    shutil.copy("tests/test.bam", tmp_path / "test.bam")
    shutil.copy("tests/test.bam.bai", tmp_path / "test.bam.bai")

    runner = CliRunner()
    # Run extraction
    result = runner.invoke(
        __main__.main,
        [
            "--input-path",
            str(tmp_path / "test.bam"),
            "--reference-fasta",
            "tests/test_fasta.fa",
            "--genome-name",
            "test",
            "--expected-chromosomes",
            "chr1,chr2,chr3",
            "--output-dir",
            str(tmp_path / "out"),
            "--overwrite",
        ],
    )
    assert result.exit_code == 0

    # Inspect the output
    npz_path = str(tmp_path / "out" / "test.methylation.npz")
    result = runner.invoke(inspect_main, [npz_path])
    assert result.exit_code == 0
    # Match the Genome line itself: a bare "test" would also match the
    # file name "test.methylation.npz".
    assert re.search(r"Genome:[ \t]+test[ \t]*$", result.output, re.MULTILINE)
    assert "CpG index CRC32:" in result.output
    # Track the package version so the test survives the next release bump.
    assert f"v{__version__}" in result.output
125
+
126
+
127
def test_format_size_bytes() -> None:
    """Counts below 1024 are reported verbatim, in bytes."""
    rendered = _format_size(500)
    assert rendered == "500 bytes"
130
+
131
+
132
def test_format_size_kb() -> None:
    """Counts in the kilobyte range are scaled by 1024 to one decimal."""
    rendered = _format_size(2048)
    assert rendered == "2.0 KB"
135
+
136
+
137
def test_format_size_mb() -> None:
    """_format_size handles megabyte range."""
    # Pin the exact value shown in the _format_size docstring; checking only
    # for the "MB" suffix would let a wrong magnitude pass.
    assert _format_size(14_200_000) == "13.5 MB"
141
+
142
+
143
def test_format_size_gb() -> None:
    """_format_size handles gigabyte range."""
    # 2_500_000_000 / 1024**3 is about 2.328, i.e. "2.3" at one decimal.
    assert _format_size(2_500_000_000) == "2.3 GB"
@@ -0,0 +1,162 @@
1
+ """Test cases for the metadata module."""
2
+
3
+ import json
4
+ import zipfile
5
+
6
+ import scipy.sparse
7
+
8
+ from bam2tensor import embedding
9
+ from bam2tensor.metadata import (
10
+ compute_cpg_index_crc32,
11
+ read_npz_metadata,
12
+ write_npz_metadata,
13
+ )
14
+
15
# Shared embedding over chr1-chr3 of the test FASTA. It is constructed once,
# at import time, and reused by the tests in this module. The FASTA path is
# relative, so the tests presumably need to be run from the repository root.
TEST_EMBEDDING = embedding.GenomeMethylationEmbedding(
    "test_genome",
    fasta_source="tests/test_fasta.fa",
    expected_chromosomes=["chr1", "chr2", "chr3"],
    window_size=150,
    verbose=False,
    skip_cache=False,
)
23
+
24
+
25
+ # -- compute_cpg_index_crc32 -------------------------------------------------
26
+
27
+
28
def test_cpg_index_crc32_deterministic() -> None:
    """Hashing the same embedding twice yields the same CRC32."""
    first = compute_cpg_index_crc32(TEST_EMBEDDING)
    second = compute_cpg_index_crc32(TEST_EMBEDDING)
    assert first == second
33
+
34
+
35
def test_cpg_index_crc32_format() -> None:
    """The CRC32 is an eight-character string that parses as hexadecimal."""
    checksum = compute_cpg_index_crc32(TEST_EMBEDDING)
    assert len(checksum) == 8
    # int() raises ValueError if the string cannot be parsed as base 16,
    # which fails the test; the parsed value itself is not needed.
    int(checksum, 16)
40
+
41
+
42
def test_cpg_index_crc32_differs_for_different_embeddings(tmp_path) -> None:
    """An embedding limited to chr1 hashes differently from the chr1-chr3 one."""
    # Same FASTA and window size as TEST_EMBEDDING; only the genome name,
    # the chromosome list and the cache flag differ.
    chr1_only = embedding.GenomeMethylationEmbedding(
        "test_subset",
        expected_chromosomes=["chr1"],
        fasta_source="tests/test_fasta.fa",
        window_size=150,
        skip_cache=True,
        verbose=False,
    )

    full_crc = compute_cpg_index_crc32(TEST_EMBEDDING)
    subset_crc = compute_cpg_index_crc32(chr1_only)
    assert full_crc != subset_crc
55
+
56
+
57
+ # -- write / read round-trip -------------------------------------------------
58
+
59
+
60
def test_write_then_read_metadata(tmp_path) -> None:
    """A metadata dict written into an .npz file is read back unchanged."""
    target = str(tmp_path / "matrix.npz")

    # Start from an ordinary scipy sparse archive with a 2x4 matrix.
    values = [1, 0, -1]
    rows, cols = [0, 0, 1], [0, 2, 1]
    sparse = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(2, 4))
    scipy.sparse.save_npz(target, sparse)

    expected = {
        "bam2tensor_version": "2.2",
        "genome_name": "hg38",
        "cpg_index_crc32": "deadbeef",
        "total_cpg_sites": 4,
        "expected_chromosomes": ["chr1", "chr2"],
    }
    write_npz_metadata(target, expected)

    assert read_npz_metadata(target) == expected
77
+
78
+
79
def test_scipy_load_unaffected_by_metadata(tmp_path) -> None:
    """scipy.sparse.load_npz returns the same matrix after metadata is written."""
    target = str(tmp_path / "matrix.npz")
    original = scipy.sparse.coo_matrix(
        ([1, 0, -1, 1, 0], ([0, 0, 1, 1, 2], [0, 2, 1, 3, 2])),
        shape=(3, 5),
    )
    scipy.sparse.save_npz(target, original)

    write_npz_metadata(target, {"genome_name": "hg38"})

    reloaded = scipy.sparse.load_npz(target)
    # Compare dense contents, then shape, then the stored-entry count.
    assert (reloaded.toarray() == original.toarray()).all()
    assert reloaded.shape == original.shape
    assert reloaded.nnz == original.nnz
94
+
95
+
96
def test_read_metadata_returns_none_without_metadata(tmp_path) -> None:
    """Reading metadata from a plain scipy .npz file yields None."""
    target = str(tmp_path / "plain.npz")
    # Saved with scipy only; write_npz_metadata is never called on this file.
    scipy.sparse.save_npz(
        target,
        scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 1)),
    )

    found = read_npz_metadata(target)
    assert found is None
103
+
104
+
105
def test_metadata_accessible_via_zipfile(tmp_path) -> None:
    """The metadata is a JSON member readable with the zipfile module alone."""
    target = str(tmp_path / "matrix.npz")
    scipy.sparse.save_npz(
        target,
        scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 1)),
    )

    write_npz_metadata(target, {"genome_name": "mm10", "total_cpg_sites": 42})

    # Bypass read_npz_metadata and open the archive directly.
    with zipfile.ZipFile(target, "r") as archive:
        assert "metadata.json" in archive.namelist()
        payload = json.loads(archive.read("metadata.json"))
        assert payload["genome_name"] == "mm10"
        assert payload["total_cpg_sites"] == 42
118
+
119
+
120
+ # -- CLI integration (end-to-end) -------------------------------------------
121
+
122
+
123
def test_main_writes_metadata(tmp_path) -> None:
    """Running the CLI produces an .npz file that carries the metadata record."""
    import shutil
    from click.testing import CliRunner
    from bam2tensor import __main__

    # Stage the BAM and its index side by side in the temporary directory.
    bam_copy = tmp_path / "test.bam"
    out_dir = tmp_path / "out"
    shutil.copy("tests/test.bam", bam_copy)
    shutil.copy("tests/test.bam.bai", tmp_path / "test.bam.bai")

    cli_args = [
        "--input-path",
        str(bam_copy),
        "--reference-fasta",
        "tests/test_fasta.fa",
        "--genome-name",
        "test",
        "--expected-chromosomes",
        "chr1,chr2,chr3",
        "--output-dir",
        str(out_dir),
        "--overwrite",
    ]
    outcome = CliRunner().invoke(__main__.main, cli_args)
    assert outcome.exit_code == 0, f"CLI failed: {outcome.output}"

    npz_path = str(out_dir / "test.methylation.npz")
    meta = read_npz_metadata(npz_path)
    assert meta is not None

    # The metadata fields must mirror the CLI arguments and the shared embedding.
    assert meta["genome_name"] == "test"
    assert meta["expected_chromosomes"] == ["chr1", "chr2", "chr3"]
    assert meta["total_cpg_sites"] == TEST_EMBEDDING.total_cpg_sites
    assert len(meta["cpg_index_crc32"]) == 8
    assert "bam2tensor_version" in meta

    # The archive must still load as a sparse matrix with one column per CpG site.
    column_count = scipy.sparse.load_npz(npz_path).shape[1]
    assert column_count == TEST_EMBEDDING.total_cpg_sites
@@ -62,7 +62,7 @@ wheels = [
62
62
 
63
63
  [[package]]
64
64
  name = "bam2tensor"
65
- version = "2.0"
65
+ version = "2.3"
66
66
  source = { editable = "." }
67
67
  dependencies = [
68
68
  { name = "biopython" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes