bam2tensor 2.1__tar.gz → 2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.1 → bam2tensor-2.3}/CLAUDE.md +10 -3
- bam2tensor-2.1/README.md → bam2tensor-2.3/PKG-INFO +99 -7
- bam2tensor-2.1/PKG-INFO → bam2tensor-2.3/README.md +66 -28
- {bam2tensor-2.1 → bam2tensor-2.3}/pyproject.toml +17 -2
- {bam2tensor-2.1 → bam2tensor-2.3}/src/bam2tensor/__init__.py +1 -1
- {bam2tensor-2.1 → bam2tensor-2.3}/src/bam2tensor/__main__.py +13 -0
- bam2tensor-2.3/src/bam2tensor/inspect.py +143 -0
- bam2tensor-2.3/src/bam2tensor/metadata.py +114 -0
- bam2tensor-2.3/tests/test_inspect.py +146 -0
- bam2tensor-2.3/tests/test_metadata.py +162 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/uv.lock +1 -1
- {bam2tensor-2.1 → bam2tensor-2.3}/.darglint +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.editorconfig +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.gitattributes +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/dependabot.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/labels.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/workflows/constraints.txt +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/workflows/docs.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/workflows/labeler.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/workflows/release.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.gitignore +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/LICENSE +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/SECURITY.md +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/Makefile +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/conf.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/contributing.md +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/index.md +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/license.md +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/make.bat +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/reference.md +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/noxfile.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/src/bam2tensor/embedding.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/src/bam2tensor/functions.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/tests/__init__.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/tests/test_duplication.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/tests/test_embedding.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/tests/test_functions.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/tests/test_main.py +0 -0
- {bam2tensor-2.1 → bam2tensor-2.3}/tests/test_reference.py +0 -0
|
@@ -40,10 +40,12 @@ uv run mypy src
|
|
|
40
40
|
|
|
41
41
|
```
|
|
42
42
|
src/bam2tensor/
|
|
43
|
-
__init__.py # Package version (2.
|
|
44
|
-
__main__.py # Click CLI entry point
|
|
43
|
+
__init__.py # Package version (2.3)
|
|
44
|
+
__main__.py # Click CLI entry point (bam2tensor command)
|
|
45
|
+
inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
|
|
45
46
|
embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
|
|
46
47
|
functions.py # Core extraction: extract_methylation_data_from_bam()
|
|
48
|
+
metadata.py # .npz metadata read/write (provenance info in output files)
|
|
47
49
|
reference.py # Reference genome download and caching utilities
|
|
48
50
|
|
|
49
51
|
tests/
|
|
@@ -51,6 +53,8 @@ tests/
|
|
|
51
53
|
test_functions.py # Core function tests
|
|
52
54
|
test_embedding.py # Embedding class tests
|
|
53
55
|
test_duplication.py # Read duplication bug tests
|
|
56
|
+
test_inspect.py # Inspect CLI tests
|
|
57
|
+
test_metadata.py # Metadata read/write/round-trip tests
|
|
54
58
|
test_reference.py # Reference download/caching tests
|
|
55
59
|
test.bam, test.bam.bai, test_fasta.fa # Test fixtures
|
|
56
60
|
```
|
|
@@ -110,8 +114,9 @@ xdoctest validates code examples in docstrings. Important rules:
|
|
|
110
114
|
### Data Structure
|
|
111
115
|
- Output: scipy sparse COO matrix saved as .npz
|
|
112
116
|
- Rows = unique reads (primary alignments)
|
|
113
|
-
- Columns = CpG sites
|
|
117
|
+
- Columns = CpG sites (ordered by genomic position, determined by reference genome)
|
|
114
118
|
- Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
|
|
119
|
+
- Each .npz file contains a `metadata.json` entry with provenance info (genome name, version, CpG index CRC32, expected chromosomes). Read via `bam2tensor.metadata.read_npz_metadata()`.
|
|
115
120
|
|
|
116
121
|
### Methylation Strand Detection
|
|
117
122
|
- Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
|
|
@@ -144,6 +149,8 @@ xdoctest validates code examples in docstrings. Important rules:
|
|
|
144
149
|
uv run bam2tensor --input-path input.bam --reference-fasta ref.fa
|
|
145
150
|
# Or with auto-download:
|
|
146
151
|
uv run bam2tensor --input-path input.bam --download-reference hg38
|
|
152
|
+
# Inspect an output file:
|
|
153
|
+
uv run bam2tensor-inspect output.methylation.npz
|
|
147
154
|
```
|
|
148
155
|
|
|
149
156
|
### Reference Genome Downloads
|
|
@@ -1,3 +1,36 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bam2tensor
|
|
3
|
+
Version: 2.3
|
|
4
|
+
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
|
+
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
|
+
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
7
|
+
Project-URL: Documentation, https://mcwdsi.github.io/bam2tensor
|
|
8
|
+
Project-URL: Changelog, https://github.com/mcwdsi/bam2tensor/releases
|
|
9
|
+
Author-email: Nick Semenkovich <semenko@alum.mit.edu>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.10
|
|
26
|
+
Requires-Dist: biopython>=1.81
|
|
27
|
+
Requires-Dist: click>=8.0.1
|
|
28
|
+
Requires-Dist: numpy>=1.26.0
|
|
29
|
+
Requires-Dist: pysam>=0.22.0
|
|
30
|
+
Requires-Dist: scipy>=1.11.4
|
|
31
|
+
Requires-Dist: tqdm>=4.66.1
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
1
34
|
# bam2tensor
|
|
2
35
|
|
|
3
36
|
**Author:** [Nick Semenkovich](https://nick.semenkovich.com/) (semenko@alum.mit.edu)
|
|
@@ -39,7 +72,9 @@
|
|
|
39
72
|
- [Custom Output Directory](#custom-output-directory)
|
|
40
73
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
41
74
|
- [Command-Line Options](#command-line-options)
|
|
75
|
+
- [Inspecting Output Files](#inspecting-output-files)
|
|
42
76
|
- [Output Data Structure](#output-data-structure)
|
|
77
|
+
- [Embedded Metadata](#embedded-metadata)
|
|
43
78
|
- [Loading Output Files](#loading-output-files)
|
|
44
79
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
45
80
|
- [Working with Genomic Coordinates](#working-with-genomic-coordinates)
|
|
@@ -252,24 +287,81 @@ Options:
|
|
|
252
287
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
253
288
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
254
289
|
|
|
290
|
+
## Inspecting Output Files
|
|
291
|
+
|
|
292
|
+
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
$ bam2tensor-inspect sample.methylation.npz
|
|
296
|
+
sample.methylation.npz
|
|
297
|
+
Genome: hg38
|
|
298
|
+
Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
|
|
299
|
+
Reads: 1,423,891
|
|
300
|
+
CpG sites: 28,217,448
|
|
301
|
+
Data points: 12,847,322 (sparsity: 100.00%)
|
|
302
|
+
CpG index CRC32: a1b2c3d4
|
|
303
|
+
bam2tensor: v2.3
|
|
304
|
+
File size: 14.2 MB
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
You can pass multiple files at once:
|
|
308
|
+
|
|
309
|
+
```bash
|
|
310
|
+
$ bam2tensor-inspect *.methylation.npz
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
|
|
314
|
+
|
|
255
315
|
## Output Data Structure
|
|
256
316
|
|
|
257
|
-
bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
|
|
317
|
+
bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
|
|
258
318
|
|
|
259
319
|
| Dimension | Represents |
|
|
260
320
|
|-----------|------------|
|
|
261
|
-
| Rows | Unique reads (primary alignments that pass quality filters) |
|
|
262
|
-
| Columns | CpG sites
|
|
321
|
+
| **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
|
|
322
|
+
| **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
|
|
323
|
+
|
|
324
|
+
The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
|
|
263
325
|
|
|
264
326
|
### Methylation State Values
|
|
265
327
|
|
|
266
328
|
| Value | Meaning |
|
|
267
329
|
|-------|---------|
|
|
268
|
-
| `1` | Methylated (cytosine preserved as C) |
|
|
269
|
-
| `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
|
|
270
|
-
| `-1` | No data (indel, SNV, or
|
|
330
|
+
| `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
|
|
331
|
+
| `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
|
|
332
|
+
| `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
|
|
271
333
|
|
|
272
|
-
Note:
|
|
334
|
+
Note: The matrix uses SciPy's COO sparse format, which stores only the entries that were explicitly written to it. Unmethylated sites (value `0`) **are** stored as explicit entries, so they remain distinct from positions not covered by a read, which are simply absent from the matrix (an implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
335
|
+
|
|
336
|
+
### Embedded Metadata
|
|
337
|
+
|
|
338
|
+
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
339
|
+
|
|
340
|
+
| Field | Description |
|
|
341
|
+
|-------|-------------|
|
|
342
|
+
| `bam2tensor_version` | Version of bam2tensor that produced the file |
|
|
343
|
+
| `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
|
|
344
|
+
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
345
|
+
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
346
|
+
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
347
|
+
|
|
348
|
+
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
349
|
+
|
|
350
|
+
```python
|
|
351
|
+
from bam2tensor.metadata import read_npz_metadata
|
|
352
|
+
|
|
353
|
+
meta = read_npz_metadata("sample.methylation.npz")
|
|
354
|
+
if meta is not None:
|
|
355
|
+
print(f"Genome: {meta['genome_name']}")
|
|
356
|
+
print(f"CpG sites: {meta['total_cpg_sites']:,}")
|
|
357
|
+
print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
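The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files built from the same chromosomes, the same CpG positions, and the same order always have the same CRC32, so matching values indicate identical column semantics (with high probability: CRC32 is a 32-bit checksum, not a collision-free identifier) and their matrices can be directly stacked or compared. Differing values mean the column mappings differ. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives: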
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
unzip -p sample.methylation.npz metadata.json | python -m json.tool
|
|
364
|
+
```
|
|
273
365
|
|
|
274
366
|
### Loading Output Files
|
|
275
367
|
|
|
@@ -1,24 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: bam2tensor
|
|
3
|
-
Version: 2.1
|
|
4
|
-
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
|
-
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
|
-
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
7
|
-
Project-URL: Documentation, https://mcwdsi.github.io/bam2tensor
|
|
8
|
-
Project-URL: Changelog, https://github.com/mcwdsi/bam2tensor/releases
|
|
9
|
-
Author-email: Nick Semenkovich <semenko@alum.mit.edu>
|
|
10
|
-
License-Expression: MIT
|
|
11
|
-
License-File: LICENSE
|
|
12
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
-
Requires-Python: >=3.10
|
|
14
|
-
Requires-Dist: biopython>=1.81
|
|
15
|
-
Requires-Dist: click>=8.0.1
|
|
16
|
-
Requires-Dist: numpy>=1.26.0
|
|
17
|
-
Requires-Dist: pysam>=0.22.0
|
|
18
|
-
Requires-Dist: scipy>=1.11.4
|
|
19
|
-
Requires-Dist: tqdm>=4.66.1
|
|
20
|
-
Description-Content-Type: text/markdown
|
|
21
|
-
|
|
22
1
|
# bam2tensor
|
|
23
2
|
|
|
24
3
|
**Author:** [Nick Semenkovich](https://nick.semenkovich.com/) (semenko@alum.mit.edu)
|
|
@@ -60,7 +39,9 @@ Description-Content-Type: text/markdown
|
|
|
60
39
|
- [Custom Output Directory](#custom-output-directory)
|
|
61
40
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
62
41
|
- [Command-Line Options](#command-line-options)
|
|
42
|
+
- [Inspecting Output Files](#inspecting-output-files)
|
|
63
43
|
- [Output Data Structure](#output-data-structure)
|
|
44
|
+
- [Embedded Metadata](#embedded-metadata)
|
|
64
45
|
- [Loading Output Files](#loading-output-files)
|
|
65
46
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
66
47
|
- [Working with Genomic Coordinates](#working-with-genomic-coordinates)
|
|
@@ -273,24 +254,81 @@ Options:
|
|
|
273
254
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
274
255
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
275
256
|
|
|
257
|
+
## Inspecting Output Files
|
|
258
|
+
|
|
259
|
+
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
$ bam2tensor-inspect sample.methylation.npz
|
|
263
|
+
sample.methylation.npz
|
|
264
|
+
Genome: hg38
|
|
265
|
+
Chromosomes: 24 (chr1, chr2, ... chrX, chrY)
|
|
266
|
+
Reads: 1,423,891
|
|
267
|
+
CpG sites: 28,217,448
|
|
268
|
+
Data points: 12,847,322 (sparsity: 100.00%)
|
|
269
|
+
CpG index CRC32: a1b2c3d4
|
|
270
|
+
bam2tensor: v2.3
|
|
271
|
+
File size: 14.2 MB
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
You can pass multiple files at once:
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
$ bam2tensor-inspect *.methylation.npz
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
This works on files produced by older versions of bam2tensor too (metadata fields will be omitted).
|
|
281
|
+
|
|
276
282
|
## Output Data Structure
|
|
277
283
|
|
|
278
|
-
bam2tensor generates one `.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] with the following structure:
|
|
284
|
+
bam2tensor generates one `.methylation.npz` file per input BAM file. Each file contains a SciPy sparse [COO matrix] (`scipy.sparse.coo_matrix`) with the following structure:
|
|
279
285
|
|
|
280
286
|
| Dimension | Represents |
|
|
281
287
|
|-----------|------------|
|
|
282
|
-
| Rows | Unique reads (primary alignments that pass quality filters) |
|
|
283
|
-
| Columns | CpG sites
|
|
288
|
+
| **Rows** | Unique sequencing reads (primary alignments that pass quality and flag filters, numbered sequentially as encountered across chromosomes) |
|
|
289
|
+
| **Columns** | CpG sites from the reference genome, ordered by genomic position across all chromosomes (chr1, chr2, ..., chrX, chrY). Column `i` maps to the `i`-th CpG dinucleotide in the reference FASTA. |
|
|
290
|
+
|
|
291
|
+
The **column dimension is determined entirely by the reference genome**: it equals the total number of CpG sites across all `--expected-chromosomes`. For example, hg38 with default chromosomes has ~28 million CpG columns. To map column indices back to genomic coordinates (e.g., column 12345 → chr1:29503), use the `GenomeMethylationEmbedding` class with the same reference FASTA and chromosome list (see [Working with Genomic Coordinates](#working-with-genomic-coordinates) below).
|
|
284
292
|
|
|
285
293
|
### Methylation State Values
|
|
286
294
|
|
|
287
295
|
| Value | Meaning |
|
|
288
296
|
|-------|---------|
|
|
289
|
-
| `1` | Methylated (cytosine preserved as C) |
|
|
290
|
-
| `0` | Unmethylated (cytosine converted to T by bisulfite treatment) |
|
|
291
|
-
| `-1` | No data (indel, SNV, or
|
|
297
|
+
| `1` | Methylated (cytosine preserved as C after bisulfite/enzymatic conversion) |
|
|
298
|
+
| `0` | Unmethylated (cytosine converted to T by bisulfite/enzymatic treatment) |
|
|
299
|
+
| `-1` | No data (indel, SNV, or other non-C/T base at a CpG position) |
|
|
292
300
|
|
|
293
|
-
Note:
|
|
301
|
+
Note: The matrix uses SciPy's COO sparse format, which stores only the entries that were explicitly written to it. Unmethylated sites (value `0`) **are** stored as explicit entries, so they remain distinct from positions not covered by a read, which are simply absent from the matrix (an implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
302
|
+
|
|
303
|
+
### Embedded Metadata
|
|
304
|
+
|
|
305
|
+
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
306
|
+
|
|
307
|
+
| Field | Description |
|
|
308
|
+
|-------|-------------|
|
|
309
|
+
| `bam2tensor_version` | Version of bam2tensor that produced the file |
|
|
310
|
+
| `genome_name` | Genome identifier (e.g., `hg38`, `mm10`) |
|
|
311
|
+
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
312
|
+
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
313
|
+
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
314
|
+
|
|
315
|
+
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
from bam2tensor.metadata import read_npz_metadata
|
|
319
|
+
|
|
320
|
+
meta = read_npz_metadata("sample.methylation.npz")
|
|
321
|
+
if meta is not None:
|
|
322
|
+
print(f"Genome: {meta['genome_name']}")
|
|
323
|
+
print(f"CpG sites: {meta['total_cpg_sites']:,}")
|
|
324
|
+
print(f"CpG index CRC32: {meta['cpg_index_crc32']}")
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
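The `cpg_index_crc32` field is a fingerprint of the column mapping. Two files built from the same chromosomes, the same CpG positions, and the same order always have the same CRC32, so matching values indicate identical column semantics (with high probability: CRC32 is a 32-bit checksum, not a collision-free identifier) and their matrices can be directly stacked or compared. Differing values mean the column mappings differ. The metadata is also accessible without bam2tensor installed, since `.npz` files are ZIP archives: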
|
|
328
|
+
|
|
329
|
+
```bash
|
|
330
|
+
unzip -p sample.methylation.npz metadata.json | python -m json.tool
|
|
331
|
+
```
|
|
294
332
|
|
|
295
333
|
### Loading Output Files
|
|
296
334
|
|
|
@@ -1,12 +1,26 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.
|
|
3
|
+
version = "2.3"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
7
7
|
readme = "README.md"
|
|
8
8
|
requires-python = ">=3.10"
|
|
9
|
-
classifiers = [
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 5 - Production/Stable",
|
|
11
|
+
"Intended Audience :: Science/Research",
|
|
12
|
+
"License :: OSI Approved :: MIT License",
|
|
13
|
+
"Operating System :: MacOS",
|
|
14
|
+
"Operating System :: POSIX :: Linux",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.10",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Medical Science Apps.",
|
|
22
|
+
"Typing :: Typed",
|
|
23
|
+
]
|
|
10
24
|
dependencies = [
|
|
11
25
|
"click>=8.0.1",
|
|
12
26
|
"numpy>=1.26.0",
|
|
@@ -24,6 +38,7 @@ Changelog = "https://github.com/mcwdsi/bam2tensor/releases"
|
|
|
24
38
|
|
|
25
39
|
[project.scripts]
|
|
26
40
|
bam2tensor = "bam2tensor.__main__:main"
|
|
41
|
+
bam2tensor-inspect = "bam2tensor.inspect:main"
|
|
27
42
|
|
|
28
43
|
[dependency-groups]
|
|
29
44
|
dev = [
|
|
@@ -38,6 +38,7 @@ from bam2tensor.functions import (
|
|
|
38
38
|
detect_aligner,
|
|
39
39
|
extract_methylation_data_from_bam,
|
|
40
40
|
)
|
|
41
|
+
from bam2tensor.metadata import compute_cpg_index_crc32, write_npz_metadata
|
|
41
42
|
from bam2tensor.reference import (
|
|
42
43
|
KNOWN_GENOMES,
|
|
43
44
|
download_reference as download_reference_fn,
|
|
@@ -393,10 +394,12 @@ def main(
|
|
|
393
394
|
verbose=verbose,
|
|
394
395
|
)
|
|
395
396
|
n_chroms = len(genome_methylation_embedding.cpg_sites_dict)
|
|
397
|
+
cpg_crc32 = compute_cpg_index_crc32(genome_methylation_embedding)
|
|
396
398
|
print(
|
|
397
399
|
f" Total CpG sites: {genome_methylation_embedding.total_cpg_sites:,}"
|
|
398
400
|
f" across {n_chroms} chromosome(s)"
|
|
399
401
|
)
|
|
402
|
+
print(f" CpG index CRC32: {cpg_crc32}")
|
|
400
403
|
print(f" Index loaded in {_format_elapsed(time.time() - time_start)}")
|
|
401
404
|
|
|
402
405
|
# ── Discover BAM files ──────────────────────────────────────────────
|
|
@@ -460,6 +463,16 @@ def main(
|
|
|
460
463
|
|
|
461
464
|
# Save
|
|
462
465
|
scipy.sparse.save_npz(output_file, methylation_data_coo, compressed=True)
|
|
466
|
+
write_npz_metadata(
|
|
467
|
+
output_file,
|
|
468
|
+
{
|
|
469
|
+
"bam2tensor_version": __version__,
|
|
470
|
+
"genome_name": genome_name,
|
|
471
|
+
"expected_chromosomes": chrom_list,
|
|
472
|
+
"total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
|
|
473
|
+
"cpg_index_crc32": cpg_crc32,
|
|
474
|
+
},
|
|
475
|
+
)
|
|
463
476
|
print(f" Output: {output_file}")
|
|
464
477
|
print(f" Time: {_format_elapsed(time.time() - time_bam)}")
|
|
465
478
|
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Inspect command for bam2tensor .npz output files.
|
|
2
|
+
|
|
3
|
+
Provides a CLI entry point (``bam2tensor-inspect``) that prints a summary
|
|
4
|
+
of one or more ``.methylation.npz`` files, including matrix dimensions,
|
|
5
|
+
sparsity, file size, and embedded provenance metadata.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
Inspect a single file::
|
|
9
|
+
|
|
10
|
+
$ bam2tensor-inspect sample.methylation.npz
|
|
11
|
+
|
|
12
|
+
Inspect multiple files::
|
|
13
|
+
|
|
14
|
+
$ bam2tensor-inspect *.methylation.npz
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
|
+
import click
|
|
21
|
+
import numpy as np
|
|
22
|
+
import scipy.sparse
|
|
23
|
+
|
|
24
|
+
from bam2tensor.metadata import read_npz_metadata
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _format_size(nbytes: int) -> str:
|
|
28
|
+
"""Format a byte count as a human-readable string.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
nbytes: Number of bytes.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
A string such as ``"14.2 MB"`` or ``"832 bytes"``.
|
|
35
|
+
|
|
36
|
+
Example:
|
|
37
|
+
>>> _format_size(14_200_000)
|
|
38
|
+
'13.5 MB'
|
|
39
|
+
|
|
40
|
+
>>> _format_size(500)
|
|
41
|
+
'500 bytes'
|
|
42
|
+
|
|
43
|
+
>>> _format_size(2048)
|
|
44
|
+
'2.0 KB'
|
|
45
|
+
"""
|
|
46
|
+
if nbytes < 1024:
|
|
47
|
+
return f"{nbytes} bytes"
|
|
48
|
+
elif nbytes < 1024 * 1024:
|
|
49
|
+
return f"{nbytes / 1024:.1f} KB"
|
|
50
|
+
elif nbytes < 1024 * 1024 * 1024:
|
|
51
|
+
return f"{nbytes / (1024 * 1024):.1f} MB"
|
|
52
|
+
else:
|
|
53
|
+
return f"{nbytes / (1024 * 1024 * 1024):.1f} GB"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def inspect_npz(npz_path: str) -> None:
    """Print a human-readable summary of a .methylation.npz file.

    Loads the sparse matrix with ``scipy.sparse.load_npz`` and any
    embedded metadata with ``read_npz_metadata``, then prints the file
    name, matrix dimensions, stored data-point count, sparsity,
    whichever provenance fields are present, and the file size.

    Args:
        npz_path: Path to the ``.npz`` file to inspect.

    Example:
        >>> # xdoctest: +SKIP
        >>> inspect_npz("sample.methylation.npz")
        sample.methylation.npz
          Reads:           1,423
          CpG sites:       28,217,448
          ...
    """
    # Load matrix
    matrix = scipy.sparse.load_npz(npz_path)
    n_reads, n_cpgs = matrix.shape
    n_data = matrix.nnz
    # Force a 64-bit accumulator: with the platform-default integer
    # (32-bit on some NumPy 1.x builds) reads x CpG sites overflows
    # silently for realistic genome-scale matrices.
    total_cells = int(np.prod(matrix.shape, dtype=np.int64))
    # An empty matrix (zero rows or zero columns) is reported as 0% sparse
    # rather than dividing by zero.
    sparsity = 1 - (n_data / total_cells) if total_cells > 0 else 0.0
    file_size = os.path.getsize(npz_path)

    # Load metadata (may be None for old files)
    meta = read_npz_metadata(npz_path)

    # Print summary
    print(os.path.basename(npz_path))

    if meta and "genome_name" in meta:
        print(f" Genome: {meta['genome_name']}")
    if meta and "expected_chromosomes" in meta:
        chroms = meta["expected_chromosomes"]
        n_chr = len(chroms)
        if n_chr <= 4:
            # Short lists are shown in full.
            chrom_display = ", ".join(chroms)
        else:
            # Long lists are abbreviated to the count plus the first two
            # and last two names.
            chrom_display = (
                f"{n_chr} ({chroms[0]}, {chroms[1]}, "
                f"... {chroms[-2]}, {chroms[-1]})"
            )
        print(f" Chromosomes: {chrom_display}")

    print(f" Reads: {n_reads:,}")
    print(f" CpG sites: {n_cpgs:,}")
    print(f" Data points: {n_data:,} (sparsity: {sparsity:.2%})")

    if meta and "cpg_index_crc32" in meta:
        print(f" CpG index CRC32: {meta['cpg_index_crc32']}")
    if meta and "bam2tensor_version" in meta:
        print(f" bam2tensor: v{meta['bam2tensor_version']}")
    elif meta is None:
        # No metadata entry at all in the archive.
        print(" Metadata: none (produced by older bam2tensor)")

    print(f" File size: {_format_size(file_size)}")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@click.command(help="Inspect bam2tensor .methylation.npz output files.")
@click.argument(
    "files",
    nargs=-1,
    required=True,
    type=click.Path(exists=True, dir_okay=False, readable=True),
)
def main(files: tuple[str, ...]) -> None:
    """Inspect one or more .methylation.npz files.

    Prints a summary of each file including matrix dimensions, sparsity,
    embedded metadata, and file size. A file that cannot be inspected is
    reported on stderr and the remaining files are still processed.

    Args:
        files: One or more paths to ``.methylation.npz`` files.

    Raises:
        SystemExit: With exit status 1 if at least one file could not be
            inspected, so that shell scripts can detect the failure.
    """
    n_failed = 0
    for i, path in enumerate(files):
        if i > 0:
            # Blank line between consecutive file summaries.
            print()
        try:
            inspect_npz(path)
        except Exception as e:
            # Best effort: report the problem and continue with the
            # remaining files instead of aborting the whole run.
            print(f"{os.path.basename(path)}", file=sys.stderr)
            print(f" Error: {e}", file=sys.stderr)
            n_failed += 1

    if n_failed:
        # Signal failure only after every file has been attempted.
        sys.exit(1)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
if __name__ == "__main__":
|
|
143
|
+
main() # pylint: disable=no-value-for-parameter
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Metadata utilities for bam2tensor .npz output files.
|
|
2
|
+
|
|
3
|
+
This module provides functions to embed and retrieve provenance metadata
|
|
4
|
+
inside the ``.methylation.npz`` files produced by bam2tensor. The metadata
|
|
5
|
+
is stored as a ``metadata.json`` entry appended to the ZIP archive that
|
|
6
|
+
underlies every ``.npz`` file. ``scipy.sparse.load_npz`` silently ignores
|
|
7
|
+
this extra entry, so existing downstream code is unaffected.
|
|
8
|
+
|
|
9
|
+
Example:
|
|
10
|
+
Writing metadata (done automatically by the CLI)::
|
|
11
|
+
|
|
12
|
+
>>> # xdoctest: +SKIP
|
|
13
|
+
>>> from bam2tensor.metadata import write_npz_metadata, read_npz_metadata
|
|
14
|
+
>>> write_npz_metadata("sample.methylation.npz", {
|
|
15
|
+
... "bam2tensor_version": "2.2",
|
|
16
|
+
... "genome_name": "hg38",
|
|
17
|
+
... })
|
|
18
|
+
>>> read_npz_metadata("sample.methylation.npz")
|
|
19
|
+
{'bam2tensor_version': '2.2', 'genome_name': 'hg38'}
|
|
20
|
+
|
|
21
|
+
Reading metadata from an existing file::
|
|
22
|
+
|
|
23
|
+
>>> # xdoctest: +SKIP
|
|
24
|
+
>>> meta = read_npz_metadata("sample.methylation.npz")
|
|
25
|
+
>>> if meta is not None:
|
|
26
|
+
... print(meta["genome_name"])
|
|
27
|
+
hg38
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import json
|
|
31
|
+
import zipfile
|
|
32
|
+
import zlib
|
|
33
|
+
|
|
34
|
+
from bam2tensor.embedding import GenomeMethylationEmbedding
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def compute_cpg_index_crc32(embedding: GenomeMethylationEmbedding) -> str:
    """Compute a CRC32 checksum over the CpG site positions in an embedding.

    The checksum is taken over a deterministic text serialisation with one
    line per chromosome, in ``expected_chromosomes`` order. Each line is the
    chromosome name, a tab, and the comma-separated positions; lines are
    joined with a newline (no trailing newline). A chromosome missing from
    ``cpg_sites_dict`` contributes an empty position list.

    The serialisation is fed to ``zlib.crc32`` one chromosome at a time
    (running checksum), so the full genome never has to be materialised as
    a single string. The result is identical to hashing the joined payload
    in one call.

    Embeddings with different checksums are guaranteed to have different
    column mappings. Equal checksums indicate the same mapping, subject to
    the (small) collision probability of a 32-bit checksum.

    Args:
        embedding: A fully initialised GenomeMethylationEmbedding whose
            ``cpg_sites_dict`` is populated.

    Returns:
        The CRC32 checksum as an 8-character lowercase hexadecimal string.

    Example:
        >>> # xdoctest: +SKIP
        >>> from bam2tensor.embedding import GenomeMethylationEmbedding
        >>> emb = GenomeMethylationEmbedding(
        ...     genome_name="hg38",
        ...     expected_chromosomes=["chr1"],
        ...     fasta_source="ref.fa",
        ... )
        >>> compute_cpg_index_crc32(emb)
        'a1b2c3d4'
    """
    checksum = 0
    for index, chrom in enumerate(embedding.expected_chromosomes):
        positions = embedding.cpg_sites_dict.get(chrom, [])
        line = chrom + "\t" + ",".join(map(str, positions))
        if index:
            # Separator between chromosome lines; none before the first,
            # matching "\n".join(...) over the per-chromosome lines.
            line = "\n" + line
        # Passing the previous value continues the running checksum.
        checksum = zlib.crc32(line.encode("utf-8"), checksum)
    return format(checksum & 0xFFFFFFFF, "08x")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def write_npz_metadata(
    npz_path: str,
    metadata: dict,
) -> None:
    """Write a ``metadata.json`` entry into an existing ``.npz`` file.

    The metadata is serialised as JSON (2-space indent) and stored as a
    ``metadata.json`` member of the ZIP archive. ``scipy.sparse.load_npz``
    ignores unrecognised entries, so the file remains fully compatible
    with existing code.

    If the archive has no ``metadata.json`` yet, the entry is appended in
    place. If one is already present, the archive is rewritten to a
    temporary file in the same directory (all other members copied
    unchanged, with their original compression) and then moved over the
    original. This keeps exactly one ``metadata.json`` in the archive; a
    plain append would leave a duplicate member behind.

    Args:
        npz_path: Path to the ``.npz`` file (must already exist).
        metadata: A JSON-serialisable dictionary of metadata to embed.

    Raises:
        TypeError: If ``metadata`` is not JSON-serialisable. Serialisation
            happens before the archive is opened, so the file is left
            untouched in that case.

    Example:
        >>> # xdoctest: +SKIP
        >>> write_npz_metadata("out.npz", {"genome_name": "hg38"})
    """
    # Function-scope imports: only the replace path below needs them.
    import os
    import shutil
    import tempfile

    entry_name = "metadata.json"
    # Serialise first so a bad payload cannot leave a half-modified archive.
    payload = json.dumps(metadata, indent=2)

    # is_zipfile() returns False for a missing path, so that case falls
    # through to append mode exactly as before.
    already_present = False
    if zipfile.is_zipfile(npz_path):
        with zipfile.ZipFile(npz_path, "r") as existing:
            already_present = entry_name in existing.namelist()

    if not already_present:
        with zipfile.ZipFile(npz_path, "a") as zf:
            zf.writestr(entry_name, payload)
        return

    # ZIP members cannot be replaced in place: rebuild the archive without
    # the stale entry, then swap it in with os.replace.
    target_dir = os.path.dirname(os.path.abspath(npz_path))
    fd, tmp_path = tempfile.mkstemp(suffix=".npz.tmp", dir=target_dir)
    os.close(fd)
    try:
        with zipfile.ZipFile(npz_path, "r") as src, zipfile.ZipFile(
            tmp_path, "w"
        ) as dst:
            for info in src.infolist():
                if info.filename != entry_name:
                    # Passing the ZipInfo keeps the member's compression.
                    dst.writestr(info, src.read(info))
            dst.writestr(entry_name, payload)
        # mkstemp creates the file owner-only; keep the original mode bits.
        shutil.copymode(npz_path, tmp_path)
        os.replace(tmp_path, npz_path)
    finally:
        # Only still present if something failed before the replace.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def read_npz_metadata(npz_path: str) -> dict | None:
|
|
96
|
+
"""Read the ``metadata.json`` entry from a ``.npz`` file.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
npz_path: Path to the ``.npz`` file.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
The metadata dictionary, or ``None`` if the file does not contain
|
|
103
|
+
a ``metadata.json`` entry (e.g. files produced by older versions).
|
|
104
|
+
|
|
105
|
+
Example:
|
|
106
|
+
>>> # xdoctest: +SKIP
|
|
107
|
+
>>> meta = read_npz_metadata("sample.methylation.npz")
|
|
108
|
+
>>> meta["genome_name"]
|
|
109
|
+
'hg38'
|
|
110
|
+
"""
|
|
111
|
+
with zipfile.ZipFile(npz_path, "r") as zf:
|
|
112
|
+
if "metadata.json" in zf.namelist():
|
|
113
|
+
return json.loads(zf.read("metadata.json"))
|
|
114
|
+
return None
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Test cases for the inspect module."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
|
|
5
|
+
import scipy.sparse
|
|
6
|
+
from click.testing import CliRunner
|
|
7
|
+
|
|
8
|
+
from bam2tensor import __main__
|
|
9
|
+
from bam2tensor.inspect import _format_size
|
|
10
|
+
from bam2tensor.inspect import main as inspect_main
|
|
11
|
+
from bam2tensor.metadata import write_npz_metadata
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_inspect_with_metadata(tmp_path) -> None:
    """Embedded metadata fields show up in the inspect report."""
    target = str(tmp_path / "sample.methylation.npz")
    # 2 rows (reads) x 5 columns, three stored entries.
    sample = scipy.sparse.coo_matrix(([1, 0, -1], ([0, 0, 1], [0, 2, 1])), shape=(2, 5))
    scipy.sparse.save_npz(target, sample)

    embedded = {
        "bam2tensor_version": "2.3",
        "genome_name": "hg38",
        "expected_chromosomes": ["chr1", "chr2", "chrX", "chrY"],
        "total_cpg_sites": 5,
        "cpg_index_crc32": "deadbeef",
    }
    write_npz_metadata(target, embedded)

    outcome = CliRunner().invoke(inspect_main, [target])
    assert outcome.exit_code == 0

    # NOTE(review): the bare "2" (meant as the read count) is also matched
    # by "v2.3" and "chr2", so that entry does not actually pin the count.
    expected_fragments = (
        "hg38",
        "Reads:",
        "2",
        "CpG sites:",
        "deadbeef",
        "v2.3",
        "chr1, chr2, chrX, chrY",
    )
    for fragment in expected_fragments:
        assert fragment in outcome.output
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_inspect_without_metadata(tmp_path) -> None:
    """A file with no metadata entry (older bam2tensor) is still inspected."""
    legacy_file = str(tmp_path / "old.methylation.npz")
    scipy.sparse.save_npz(
        legacy_file,
        scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 100)),
    )

    outcome = CliRunner().invoke(inspect_main, [legacy_file])
    report = outcome.output

    assert outcome.exit_code == 0
    assert "Reads:" in report
    assert "older bam2tensor" in report
    # No metadata was written, so no genome line may be printed.
    assert "Genome:" not in report
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_inspect_multiple_files(tmp_path) -> None:
    """Inspect handles multiple files with blank line separator."""
    paths = []
    for name in ["a.npz", "b.npz"]:
        p = str(tmp_path / name)
        matrix = scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 10))
        scipy.sparse.save_npz(p, matrix)
        paths.append(p)

    runner = CliRunner()
    result = runner.invoke(inspect_main, paths)
    assert result.exit_code == 0
    assert "a.npz" in result.output
    assert "b.npz" in result.output
    # The CLI emits a bare print() before every report after the first, so
    # two reports must be separated by an empty line. The docstring promised
    # this check, but it was never asserted.
    assert "\n\n" in result.output
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_inspect_many_chromosomes(tmp_path) -> None:
    """Chromosome list is summarised when > 4 entries."""
    npz_path = str(tmp_path / "matrix.npz")
    matrix = scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 10))
    scipy.sparse.save_npz(npz_path, matrix)
    # 22 autosomes plus chrX and chrY: 24 names in total.
    chroms = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"]
    write_npz_metadata(
        npz_path,
        {
            "expected_chromosomes": chroms,
            "genome_name": "hg38",
        },
    )

    runner = CliRunner()
    result = runner.invoke(inspect_main, [npz_path])
    # Check the exit code like the sibling tests do, so a CLI failure is
    # reported as such rather than as a confusing missing-substring error.
    assert result.exit_code == 0
    assert "24 (" in result.output
    assert "chrY" in result.output
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def test_inspect_end_to_end(tmp_path) -> None:
    """Full pipeline: bam2tensor produces file, bam2tensor-inspect reads it."""
    # Function-scope import: the module header only imports
    # write_npz_metadata from bam2tensor.metadata.
    from bam2tensor.metadata import read_npz_metadata

    shutil.copy("tests/test.bam", tmp_path / "test.bam")
    shutil.copy("tests/test.bam.bai", tmp_path / "test.bam.bai")

    runner = CliRunner()
    # Run extraction
    result = runner.invoke(
        __main__.main,
        [
            "--input-path",
            str(tmp_path / "test.bam"),
            "--reference-fasta",
            "tests/test_fasta.fa",
            "--genome-name",
            "test",
            "--expected-chromosomes",
            "chr1,chr2,chr3",
            "--output-dir",
            str(tmp_path / "out"),
            "--overwrite",
        ],
    )
    assert result.exit_code == 0

    # Inspect the output
    npz_path = str(tmp_path / "out" / "test.methylation.npz")
    result = runner.invoke(inspect_main, [npz_path])
    assert result.exit_code == 0
    assert "test" in result.output  # genome_name
    assert "CpG index CRC32:" in result.output

    # Compare against the version recorded in the file itself rather than a
    # hard-coded release string, so the test survives version bumps.
    meta = read_npz_metadata(npz_path)
    assert meta is not None
    assert f"v{meta['bam2tensor_version']}" in result.output
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_format_size_bytes() -> None:
    """A small byte count is rendered as a plain number of bytes."""
    rendered = _format_size(500)
    assert rendered == "500 bytes"
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def test_format_size_kb() -> None:
    """2048 bytes is rendered in kilobytes with one decimal place."""
    rendered = _format_size(2048)
    assert rendered == "2.0 KB"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def test_format_size_mb() -> None:
    """A size of roughly 14 million bytes is labelled with the MB unit."""
    assert "MB" in _format_size(14_200_000)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def test_format_size_gb() -> None:
    """A size of 2.5 billion bytes is labelled with the GB unit."""
    assert "GB" in _format_size(2_500_000_000)
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Test cases for the metadata module."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import zipfile
|
|
5
|
+
|
|
6
|
+
import scipy.sparse
|
|
7
|
+
|
|
8
|
+
from bam2tensor import embedding
|
|
9
|
+
from bam2tensor.metadata import (
|
|
10
|
+
compute_cpg_index_crc32,
|
|
11
|
+
read_npz_metadata,
|
|
12
|
+
write_npz_metadata,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Module-level embedding shared by the tests below. The FASTA path is
# relative, so it resolves against the current working directory.
TEST_EMBEDDING = embedding.GenomeMethylationEmbedding(
    "test_genome",
    fasta_source="tests/test_fasta.fa",
    expected_chromosomes=["chr1", "chr2", "chr3"],
    window_size=150,
    verbose=False,
    skip_cache=False,
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# -- compute_cpg_index_crc32 -------------------------------------------------
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_cpg_index_crc32_deterministic() -> None:
    """Hashing the same embedding twice gives the same CRC32."""
    first = compute_cpg_index_crc32(TEST_EMBEDDING)
    second = compute_cpg_index_crc32(TEST_EMBEDDING)
    assert first == second
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_cpg_index_crc32_format() -> None:
    """CRC32 is an 8-character lowercase hex string."""
    crc = compute_cpg_index_crc32(TEST_EMBEDDING)
    assert len(crc) == 8
    # int(crc, 16) would also accept uppercase digits, a "0x" prefix and
    # underscores; check the characters directly so only the documented
    # lowercase-hex format passes.
    assert set(crc) <= set("0123456789abcdef")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_cpg_index_crc32_differs_for_different_embeddings(tmp_path) -> None:
    """An embedding restricted to chr1 hashes differently from the full one."""
    chr1_only = embedding.GenomeMethylationEmbedding(
        "test_subset",
        fasta_source="tests/test_fasta.fa",
        expected_chromosomes=["chr1"],
        window_size=150,
        verbose=False,
        skip_cache=True,
    )
    full_crc = compute_cpg_index_crc32(TEST_EMBEDDING)
    subset_crc = compute_cpg_index_crc32(chr1_only)
    assert full_crc != subset_crc
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# -- write / read round-trip -------------------------------------------------
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_write_then_read_metadata(tmp_path) -> None:
    """A metadata dict written to a file is read back unchanged."""
    target = str(tmp_path / "matrix.npz")
    scipy.sparse.save_npz(
        target,
        scipy.sparse.coo_matrix(([1, 0, -1], ([0, 0, 1], [0, 2, 1])), shape=(2, 4)),
    )

    original = {
        "bam2tensor_version": "2.2",
        "genome_name": "hg38",
        "cpg_index_crc32": "deadbeef",
        "total_cpg_sites": 4,
        "expected_chromosomes": ["chr1", "chr2"],
    }
    write_npz_metadata(target, original)

    assert read_npz_metadata(target) == original
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_scipy_load_unaffected_by_metadata(tmp_path) -> None:
    """The sparse matrix loads unchanged once metadata has been added."""
    target = str(tmp_path / "matrix.npz")
    values = [1, 0, -1, 1, 0]
    rows = [0, 0, 1, 1, 2]
    cols = [0, 2, 1, 3, 2]
    saved = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(3, 5))
    scipy.sparse.save_npz(target, saved)

    write_npz_metadata(target, {"genome_name": "hg38"})

    reloaded = scipy.sparse.load_npz(target)
    # Same dense content, same dimensions, same number of stored entries.
    assert (reloaded.toarray() == saved.toarray()).all()
    assert reloaded.shape == saved.shape
    assert reloaded.nnz == saved.nnz
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def test_read_metadata_returns_none_without_metadata(tmp_path) -> None:
    """Reading metadata from a plain scipy-written file yields None."""
    target = str(tmp_path / "plain.npz")
    scipy.sparse.save_npz(
        target,
        scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 1)),
    )

    result = read_npz_metadata(target)
    assert result is None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_metadata_accessible_via_zipfile(tmp_path) -> None:
    """The metadata entry can be read with the standard zipfile and json modules."""
    target = str(tmp_path / "matrix.npz")
    scipy.sparse.save_npz(
        target,
        scipy.sparse.coo_matrix(([1], ([0], [0])), shape=(1, 1)),
    )

    write_npz_metadata(target, {"genome_name": "mm10", "total_cpg_sites": 42})

    with zipfile.ZipFile(target, "r") as archive:
        assert "metadata.json" in archive.namelist()
        decoded = json.loads(archive.read("metadata.json"))

    assert decoded["genome_name"] == "mm10"
    assert decoded["total_cpg_sites"] == 42
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# -- CLI integration (end-to-end) -------------------------------------------
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_main_writes_metadata(tmp_path) -> None:
    """Running the CLI yields an .npz file that carries embedded metadata."""
    import shutil
    from click.testing import CliRunner
    from bam2tensor import __main__

    # Work on copies of the BAM and its index inside the temp directory.
    for fixture in ("test.bam", "test.bam.bai"):
        shutil.copy("tests/" + fixture, tmp_path / fixture)

    output_dir = tmp_path / "out"
    cli_args = [
        "--input-path",
        str(tmp_path / "test.bam"),
        "--reference-fasta",
        "tests/test_fasta.fa",
        "--genome-name",
        "test",
        "--expected-chromosomes",
        "chr1,chr2,chr3",
        "--output-dir",
        str(output_dir),
        "--overwrite",
    ]
    outcome = CliRunner().invoke(__main__.main, cli_args)
    assert outcome.exit_code == 0, f"CLI failed: {outcome.output}"

    produced = str(output_dir / "test.methylation.npz")
    embedded = read_npz_metadata(produced)
    assert embedded is not None
    assert embedded["genome_name"] == "test"
    assert embedded["expected_chromosomes"] == ["chr1", "chr2", "chr3"]
    assert embedded["total_cpg_sites"] == TEST_EMBEDDING.total_cpg_sites
    assert len(embedded["cpg_index_crc32"]) == 8
    assert "bam2tensor_version" in embedded

    # The matrix must still load, with one column per CpG site.
    loaded = scipy.sparse.load_npz(produced)
    assert loaded.shape[1] == TEST_EMBEDDING.total_cpg_sites
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{bam2tensor-2.1 → bam2tensor-2.3}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|