bam2tensor 2.3__tar.gz → 2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/constraints.txt +1 -1
  2. {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/docs.yml +2 -2
  3. {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/labeler.yml +1 -1
  4. {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/release.yml +1 -1
  5. {bam2tensor-2.3 → bam2tensor-2.4}/CLAUDE.md +3 -1
  6. {bam2tensor-2.3 → bam2tensor-2.4}/PKG-INFO +40 -4
  7. {bam2tensor-2.3 → bam2tensor-2.4}/README.md +39 -3
  8. {bam2tensor-2.3 → bam2tensor-2.4}/docs/reference.md +8 -0
  9. {bam2tensor-2.3 → bam2tensor-2.4}/pyproject.toml +1 -1
  10. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/__init__.py +3 -3
  11. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/__main__.py +11 -6
  12. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/functions.py +46 -18
  13. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/inspect.py +14 -1
  14. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/metadata.py +48 -0
  15. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_duplication.py +5 -3
  16. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_functions.py +233 -79
  17. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_inspect.py +44 -4
  18. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_main.py +33 -0
  19. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_metadata.py +62 -0
  20. {bam2tensor-2.3 → bam2tensor-2.4}/uv.lock +103 -107
  21. {bam2tensor-2.3 → bam2tensor-2.4}/.darglint +0 -0
  22. {bam2tensor-2.3 → bam2tensor-2.4}/.editorconfig +0 -0
  23. {bam2tensor-2.3 → bam2tensor-2.4}/.gitattributes +0 -0
  24. {bam2tensor-2.3 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  25. {bam2tensor-2.3 → bam2tensor-2.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  26. {bam2tensor-2.3 → bam2tensor-2.4}/.github/actions/setup-env/action.yml +0 -0
  27. {bam2tensor-2.3 → bam2tensor-2.4}/.github/dependabot.yml +0 -0
  28. {bam2tensor-2.3 → bam2tensor-2.4}/.github/labels.yml +0 -0
  29. {bam2tensor-2.3 → bam2tensor-2.4}/.github/release-drafter.yml +0 -0
  30. {bam2tensor-2.3 → bam2tensor-2.4}/.github/workflows/tests.yml +0 -0
  31. {bam2tensor-2.3 → bam2tensor-2.4}/.gitignore +0 -0
  32. {bam2tensor-2.3 → bam2tensor-2.4}/.pre-commit-config.yaml +0 -0
  33. {bam2tensor-2.3 → bam2tensor-2.4}/CONTRIBUTING.md +0 -0
  34. {bam2tensor-2.3 → bam2tensor-2.4}/LICENSE +0 -0
  35. {bam2tensor-2.3 → bam2tensor-2.4}/SECURITY.md +0 -0
  36. {bam2tensor-2.3 → bam2tensor-2.4}/docs/Makefile +0 -0
  37. {bam2tensor-2.3 → bam2tensor-2.4}/docs/conf.py +0 -0
  38. {bam2tensor-2.3 → bam2tensor-2.4}/docs/contributing.md +0 -0
  39. {bam2tensor-2.3 → bam2tensor-2.4}/docs/index.md +0 -0
  40. {bam2tensor-2.3 → bam2tensor-2.4}/docs/license.md +0 -0
  41. {bam2tensor-2.3 → bam2tensor-2.4}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  42. {bam2tensor-2.3 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.afdesign +0 -0
  43. {bam2tensor-2.3 → bam2tensor-2.4}/docs/logo/bam2tensor-logo.png +0 -0
  44. {bam2tensor-2.3 → bam2tensor-2.4}/docs/make.bat +0 -0
  45. {bam2tensor-2.3 → bam2tensor-2.4}/docs/nano-banana-overview-shrunk.png +0 -0
  46. {bam2tensor-2.3 → bam2tensor-2.4}/docs/templates/package.rst_t +0 -0
  47. {bam2tensor-2.3 → bam2tensor-2.4}/noxfile.py +0 -0
  48. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/embedding.py +0 -0
  49. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/py.typed +0 -0
  50. {bam2tensor-2.3 → bam2tensor-2.4}/src/bam2tensor/reference.py +0 -0
  51. {bam2tensor-2.3 → bam2tensor-2.4}/tests/__init__.py +0 -0
  52. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_embedding.py +0 -0
  53. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_fasta.fa +0 -0
  54. {bam2tensor-2.3 → bam2tensor-2.4}/tests/test_reference.py +0 -0
@@ -1,2 +1,2 @@
1
1
  nox==2026.2.9
2
- uv==0.10.7
2
+ uv==0.11.2
@@ -59,8 +59,8 @@ jobs:
59
59
  needs: build
60
60
  steps:
61
61
  - name: Setup Pages
62
- uses: actions/configure-pages@v5
62
+ uses: actions/configure-pages@v6
63
63
 
64
64
  - name: Deploy to GitHub Pages
65
65
  id: deployment
66
- uses: actions/deploy-pages@v4
66
+ uses: actions/deploy-pages@v5
@@ -20,6 +20,6 @@ jobs:
20
20
  uses: actions/checkout@v6
21
21
 
22
22
  - name: Run Labeler
23
- uses: crazy-max/ghaction-github-labeler@v5.3.0
23
+ uses: crazy-max/ghaction-github-labeler@v6.0.0
24
24
  with:
25
25
  skip-delete: true
@@ -76,7 +76,7 @@ jobs:
76
76
  repository-url: https://test.pypi.org/legacy/
77
77
 
78
78
  - name: Publish the release notes
79
- uses: release-drafter/release-drafter@v6.2.0
79
+ uses: release-drafter/release-drafter@v7.1.1
80
80
  with:
81
81
  publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
82
82
  tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
@@ -40,7 +40,7 @@ uv run mypy src
40
40
 
41
41
  ```
42
42
  src/bam2tensor/
43
- __init__.py # Package version (2.3)
43
+ __init__.py # Package version (2.4)
44
44
  __main__.py # Click CLI entry point (bam2tensor command)
45
45
  inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
46
46
  embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
@@ -117,6 +117,8 @@ xdoctest validates code examples in docstrings. Important rules:
117
117
  - Columns = CpG sites (ordered by genomic position, determined by reference genome)
118
118
  - Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
119
119
  - Each .npz file contains a `metadata.json` entry with provenance info (genome name, version, CpG index CRC32, expected chromosomes). Read via `bam2tensor.metadata.read_npz_metadata()`.
120
+ - Each .npz file contains a `tlen.npy` entry with the per-read signed template length (BAM TLEN field) as int32. Read it via `bam2tensor.metadata.read_npz_tlen()`, which returns `None` for files written by older versions that lack the entry.
121
+ - `extract_methylation_data_from_bam()` returns an `ExtractionResult` NamedTuple with `.matrix` (sparse COO) and `.tlen` (numpy int32 array).
120
122
 
121
123
  ### Methylation Strand Detection
122
124
  - Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.3
3
+ Version: 2.4
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -74,6 +74,7 @@ Description-Content-Type: text/markdown
74
74
  - [Command-Line Options](#command-line-options)
75
75
  - [Inspecting Output Files](#inspecting-output-files)
76
76
  - [Output Data Structure](#output-data-structure)
77
+ - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
77
78
  - [Embedded Metadata](#embedded-metadata)
78
79
  - [Loading Output Files](#loading-output-files)
79
80
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
@@ -97,6 +98,7 @@ Description-Content-Type: text/markdown
97
98
  - **Batch Processing**: Process multiple BAM files with directory recursion
98
99
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
99
100
  - **Quality Filtering**: Configurable mapping quality thresholds
101
+ - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
100
102
 
101
103
  ## Requirements
102
104
 
@@ -299,8 +301,9 @@ sample.methylation.npz
299
301
  Reads: 1,423,891
300
302
  CpG sites: 28,217,448
301
303
  Data points: 12,847,322 (sparsity: 99.97%)
304
+ Fragment len: median 167, mean 182, range [50, 600]
302
305
  CpG index CRC32: a1b2c3d4
303
- bam2tensor: v2.3
306
+ bam2tensor: v2.4
304
307
  File size: 14.2 MB
305
308
  ```
306
309
 
@@ -333,6 +336,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
333
336
 
334
337
  Note: The matrix uses SciPy's COO sparse format, which stores every explicitly written entry rather than only non-zero values. Unmethylated sites (value `0`) **are** therefore stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
335
338
 
339
+ ### Per-Read Fragment Length (TLEN)
340
+
341
+ Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
342
+
343
+ - One `int32` value per read (row), in the same order as the sparse matrix rows
344
+ - Signed: positive for the leftmost read in a pair, negative for the rightmost
345
+ - Zero when TLEN is unavailable: single-end reads, reads with unmapped mates, or mates aligned to different reference sequences
346
+ - Use `abs(tlen)` to get fragment lengths
347
+
348
+ ```python
349
+ from bam2tensor.metadata import read_npz_tlen
350
+ import numpy as np
351
+
352
+ tlen = read_npz_tlen("sample.methylation.npz")
353
+ if tlen is not None:
354
+ frag_lengths = np.abs(tlen)
355
+ nonzero = frag_lengths[frag_lengths > 0]
356
+ print(f"Median fragment length: {np.median(nonzero):.0f}")
357
+ print(f"Mean fragment length: {np.mean(nonzero):.0f}")
358
+ ```
359
+
336
360
  ### Embedded Metadata
337
361
 
338
362
  Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
@@ -548,10 +572,22 @@ extract_methylation_data_from_bam(
548
572
  quality_limit: int = 20, # Minimum MAPQ
549
573
  verbose: bool = False, # Enable verbose output
550
574
  debug: bool = False # Enable debug output
551
- ) -> scipy.sparse.coo_matrix
575
+ ) -> ExtractionResult
576
+ ```
577
+
578
+ **Returns:** An `ExtractionResult` named tuple with two fields:
579
+ - `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
580
+ - `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
581
+
582
+ ### `bam2tensor.metadata.read_npz_tlen`
583
+
584
+ Read per-read template lengths from a `.methylation.npz` file.
585
+
586
+ ```python
587
+ read_npz_tlen(npz_path: str) -> np.ndarray | None
552
588
  ```
553
589
 
554
- **Returns:** A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites).
590
+ **Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
555
591
 
556
592
  ## Contributing
557
593
 
@@ -41,6 +41,7 @@
41
41
  - [Command-Line Options](#command-line-options)
42
42
  - [Inspecting Output Files](#inspecting-output-files)
43
43
  - [Output Data Structure](#output-data-structure)
44
+ - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
44
45
  - [Embedded Metadata](#embedded-metadata)
45
46
  - [Loading Output Files](#loading-output-files)
46
47
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
@@ -64,6 +65,7 @@
64
65
  - **Batch Processing**: Process multiple BAM files with directory recursion
65
66
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
66
67
  - **Quality Filtering**: Configurable mapping quality thresholds
68
+ - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
67
69
 
68
70
  ## Requirements
69
71
 
@@ -266,8 +268,9 @@ sample.methylation.npz
266
268
  Reads: 1,423,891
267
269
  CpG sites: 28,217,448
268
270
  Data points: 12,847,322 (sparsity: 99.97%)
271
+ Fragment len: median 167, mean 182, range [50, 600]
269
272
  CpG index CRC32: a1b2c3d4
270
- bam2tensor: v2.3
273
+ bam2tensor: v2.4
271
274
  File size: 14.2 MB
272
275
  ```
273
276
 
@@ -300,6 +303,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
300
303
 
301
304
  Note: The matrix uses SciPy's COO sparse format, which stores every explicitly written entry rather than only non-zero values. Unmethylated sites (value `0`) **are** therefore stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
302
305
 
306
+ ### Per-Read Fragment Length (TLEN)
307
+
308
+ Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
309
+
310
+ - One `int32` value per read (row), in the same order as the sparse matrix rows
311
+ - Signed: positive for the leftmost read in a pair, negative for the rightmost
312
+ - Zero when TLEN is unavailable: single-end reads, reads with unmapped mates, or mates aligned to different reference sequences
313
+ - Use `abs(tlen)` to get fragment lengths
314
+
315
+ ```python
316
+ from bam2tensor.metadata import read_npz_tlen
317
+ import numpy as np
318
+
319
+ tlen = read_npz_tlen("sample.methylation.npz")
320
+ if tlen is not None:
321
+ frag_lengths = np.abs(tlen)
322
+ nonzero = frag_lengths[frag_lengths > 0]
323
+ print(f"Median fragment length: {np.median(nonzero):.0f}")
324
+ print(f"Mean fragment length: {np.mean(nonzero):.0f}")
325
+ ```
326
+
303
327
  ### Embedded Metadata
304
328
 
305
329
  Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
@@ -515,10 +539,22 @@ extract_methylation_data_from_bam(
515
539
  quality_limit: int = 20, # Minimum MAPQ
516
540
  verbose: bool = False, # Enable verbose output
517
541
  debug: bool = False # Enable debug output
518
- ) -> scipy.sparse.coo_matrix
542
+ ) -> ExtractionResult
543
+ ```
544
+
545
+ **Returns:** An `ExtractionResult` named tuple with two fields:
546
+ - `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
547
+ - `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
548
+
549
+ ### `bam2tensor.metadata.read_npz_tlen`
550
+
551
+ Read per-read template lengths from a `.methylation.npz` file.
552
+
553
+ ```python
554
+ read_npz_tlen(npz_path: str) -> np.ndarray | None
519
555
  ```
520
556
 
521
- **Returns:** A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites).
557
+ **Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
522
558
 
523
559
  ## Contributing
524
560
 
@@ -32,6 +32,14 @@ bam2tensor.functions module
32
32
  :show-inheritance:
33
33
  :undoc-members:
34
34
 
35
+ bam2tensor.metadata module
36
+ --------------------------
37
+
38
+ .. automodule:: bam2tensor.metadata
39
+ :members:
40
+ :show-inheritance:
41
+ :undoc-members:
42
+
35
43
  bam2tensor.reference module
36
44
  ---------------------------
37
45
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.3"
3
+ version = "2.4"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -30,14 +30,14 @@ Example:
30
30
  )
31
31
 
32
32
  # Extract methylation data
33
- sparse_matrix = extract_methylation_data_from_bam(
33
+ result = extract_methylation_data_from_bam(
34
34
  input_bam="/path/to/sample.bam",
35
35
  genome_methylation_embedding=embedding,
36
36
  )
37
37
 
38
38
  # Save to file
39
39
  import scipy.sparse
40
- scipy.sparse.save_npz("output.npz", sparse_matrix)
40
+ scipy.sparse.save_npz("output.npz", result.matrix)
41
41
 
42
42
  Output Format:
43
43
  The output is a SciPy sparse COO matrix where:
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.3"
53
+ __version__ = "2.4"
@@ -38,7 +38,11 @@ from bam2tensor.functions import (
38
38
  detect_aligner,
39
39
  extract_methylation_data_from_bam,
40
40
  )
41
- from bam2tensor.metadata import compute_cpg_index_crc32, write_npz_metadata
41
+ from bam2tensor.metadata import (
42
+ compute_cpg_index_crc32,
43
+ write_npz_metadata,
44
+ write_npz_tlen,
45
+ )
42
46
  from bam2tensor.reference import (
43
47
  KNOWN_GENOMES,
44
48
  download_reference as download_reference_fn,
@@ -440,7 +444,7 @@ def main(
440
444
  # Extract
441
445
  print(" Extracting methylation data...")
442
446
  try:
443
- methylation_data_coo = extract_methylation_data_from_bam(
447
+ extraction_result = extract_methylation_data_from_bam(
444
448
  input_bam=input_bam,
445
449
  genome_methylation_embedding=genome_methylation_embedding,
446
450
  quality_limit=quality_limit,
@@ -453,16 +457,17 @@ def main(
453
457
  continue
454
458
 
455
459
  # Matrix stats
456
- n_reads = methylation_data_coo.shape[0]
457
- n_cpgs = methylation_data_coo.shape[1]
458
- n_data = methylation_data_coo.nnz
460
+ n_reads = extraction_result.matrix.shape[0]
461
+ n_cpgs = extraction_result.matrix.shape[1]
462
+ n_data = extraction_result.matrix.nnz
459
463
  print(
460
464
  f" Result: {n_reads:,} reads x {n_cpgs:,} CpG sites"
461
465
  f" ({n_data:,} data points)"
462
466
  )
463
467
 
464
468
  # Save
465
- scipy.sparse.save_npz(output_file, methylation_data_coo, compressed=True)
469
+ scipy.sparse.save_npz(output_file, extraction_result.matrix, compressed=True)
470
+ write_npz_tlen(output_file, extraction_result.tlen)
466
471
  write_npz_metadata(
467
472
  output_file,
468
473
  {
@@ -39,15 +39,18 @@ Example:
39
39
  ... )
40
40
  >>>
41
41
  >>> # Extract methylation data
42
- >>> matrix = extract_methylation_data_from_bam(
42
+ >>> result = extract_methylation_data_from_bam(
43
43
  ... input_bam="sample.bam",
44
44
  ... genome_methylation_embedding=embedding,
45
45
  ... quality_limit=20,
46
46
  ... )
47
47
  >>>
48
- >>> print(f"Extracted {matrix.shape[0]} reads, {matrix.nnz} data points")
48
+ >>> print(f"Extracted {result.matrix.shape[0]} reads, {result.matrix.nnz} data points")
49
49
  """
50
50
 
51
+ from typing import NamedTuple
52
+
53
+ import numpy as np
51
54
  import scipy.sparse
52
55
  import pysam
53
56
  import bisect
@@ -55,6 +58,23 @@ import bisect
55
58
  from tqdm import tqdm
56
59
  from bam2tensor.embedding import GenomeMethylationEmbedding
57
60
 
61
+
62
+ class ExtractionResult(NamedTuple):
63
+ """Result of methylation extraction from a BAM file.
64
+
65
+ Attributes:
66
+ matrix: Sparse COO matrix of shape (n_reads, n_cpg_sites) with
67
+ methylation states: 1 (methylated), 0 (unmethylated), -1
68
+ (no data).
69
+ tlen: 1-D numpy array of shape (n_reads,) containing the signed
70
+ template length (TLEN from BAM) for each read. 0 for
71
+ single-end reads or reads with unmapped mates.
72
+ """
73
+
74
+ matrix: scipy.sparse.coo_matrix
75
+ tlen: np.ndarray
76
+
77
+
58
78
  # BAM flag bits for reads to skip: duplicate (0x400), qcfail (0x200),
59
79
  # secondary (0x100), supplementary (0x800).
60
80
  _SKIP_FLAGS = 0x400 | 0x200 | 0x100 | 0x800
@@ -180,7 +200,7 @@ def extract_methylation_data_from_bam(
180
200
  quality_limit: int = 20,
181
201
  verbose: bool = False,
182
202
  debug: bool = False,
183
- ) -> scipy.sparse.coo_matrix:
203
+ ) -> ExtractionResult:
184
204
  """Extract read-level CpG methylation data from a BAM file.
185
205
 
186
206
  Parses a bisulfite-sequencing or EM-seq BAM file and extracts methylation
@@ -225,14 +245,19 @@ def extract_methylation_data_from_bam(
225
245
  only processed once. Significantly slower.
226
246
 
227
247
  Returns:
228
- A scipy.sparse.coo_matrix with shape (n_reads, n_cpg_sites) where:
229
- - n_reads is the number of reads that passed filters and covered
230
- at least one CpG site
231
- - n_cpg_sites is genome_methylation_embedding.total_cpg_sites
232
- - Values are: 1 (methylated), 0 (unmethylated), -1 (no data)
233
-
234
- The matrix uses COO format for efficient construction. Convert to
235
- CSR (tocsr()) for row slicing or CSC (tocsc()) for column slicing.
248
+ An ExtractionResult named tuple with two fields:
249
+
250
+ - **matrix**: A scipy.sparse.coo_matrix with shape
251
+ (n_reads, n_cpg_sites) where n_reads is the number of reads
252
+ that passed filters and covered at least one CpG site,
253
+ n_cpg_sites is genome_methylation_embedding.total_cpg_sites,
254
+ and values are: 1 (methylated), 0 (unmethylated), -1 (no data).
255
+ The matrix uses COO format for efficient construction; convert
256
+ to CSR (tocsr()) for row slicing or CSC (tocsc()) for column
257
+ slicing.
258
+ - **tlen**: A 1-D numpy int32 array of shape (n_reads,) containing
259
+ the signed template length (BAM TLEN field) for each read.
260
+ Values are 0 for single-end reads or reads with unmapped mates.
236
261
 
237
262
  Raises:
238
263
  FileNotFoundError: If the BAM file index (.bam.bai) is missing.
@@ -252,7 +277,7 @@ def extract_methylation_data_from_bam(
252
277
  ... )
253
278
  >>>
254
279
  >>> # Extract methylation data
255
- >>> matrix = extract_methylation_data_from_bam(
280
+ >>> result = extract_methylation_data_from_bam(
256
281
  ... input_bam="sample.bam",
257
282
  ... genome_methylation_embedding=embedding,
258
283
  ... quality_limit=30, # Stricter quality filter
@@ -260,12 +285,12 @@ def extract_methylation_data_from_bam(
260
285
  ... )
261
286
  >>>
262
287
  >>> # Analyze results
263
- >>> print(f"Reads with CpG data: {matrix.shape[0]:,}")
264
- >>> print(f"Total CpG sites: {matrix.shape[1]:,}")
265
- >>> print(f"Data points: {matrix.nnz:,}")
288
+ >>> print(f"Reads with CpG data: {result.matrix.shape[0]:,}")
289
+ >>> print(f"Total CpG sites: {result.matrix.shape[1]:,}")
290
+ >>> print(f"Data points: {result.matrix.nnz:,}")
266
291
  >>>
267
292
  >>> # Save to file
268
- >>> scipy.sparse.save_npz("sample.methylation.npz", matrix)
293
+ >>> scipy.sparse.save_npz("sample.methylation.npz", result.matrix)
269
294
 
270
295
  Note:
271
296
  The function processes chromosomes in the order they appear in
@@ -304,6 +329,7 @@ def extract_methylation_data_from_bam(
304
329
  coo_row = [] # Read number
305
330
  coo_col = [] # CpG number (embedding)
306
331
  coo_data = [] # Methylation state
332
+ tlen_list: list[int] = [] # Template length (TLEN) per read
307
333
 
308
334
  # This is slow, but we only run it once and store the results for later
309
335
  for chrom, cpg_sites in tqdm(
@@ -398,6 +424,7 @@ def extract_methylation_data_from_bam(
398
424
  ), "Read seen twice!"
399
425
  debug_read_name_to_row_number[read_key] = read_number
400
426
  print("************************************************\n")
427
+ tlen_list.append(aligned_segment.template_length)
401
428
  read_number += 1
402
429
 
403
430
  continue # Skip the Biscuit/bwameth/gem3 path below
@@ -526,6 +553,7 @@ def extract_methylation_data_from_bam(
526
553
  f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
527
554
  )
528
555
 
556
+ tlen_list.append(aligned_segment.template_length)
529
557
  read_number += 1
530
558
 
531
559
  if debug:
@@ -557,6 +585,6 @@ def extract_methylation_data_from_bam(
557
585
  # Number of columns = number of CpG sites
558
586
  assert sparse_matrix.shape[1] == genome_methylation_embedding.total_cpg_sites
559
587
 
560
- return sparse_matrix
588
+ tlen_array = np.array(tlen_list, dtype=np.int32)
561
589
 
562
- # return scipy.sparse.coo_matrix((coo_data, (coo_row, coo_col)), shape=(len(read_name_to_row_number) + 1, total_cpg_sites))
590
+ return ExtractionResult(matrix=sparse_matrix, tlen=tlen_array)
@@ -21,7 +21,7 @@ import click
21
21
  import numpy as np
22
22
  import scipy.sparse
23
23
 
24
- from bam2tensor.metadata import read_npz_metadata
24
+ from bam2tensor.metadata import read_npz_metadata, read_npz_tlen
25
25
 
26
26
 
27
27
  def _format_size(nbytes: int) -> str:
@@ -103,6 +103,19 @@ def inspect_npz(npz_path: str) -> None:
103
103
  print(f" CpG sites: {n_cpgs:,}")
104
104
  print(f" Data points: {n_data:,} (sparsity: {sparsity:.2%})")
105
105
 
106
+ # TLEN / fragment length statistics
107
+ tlen = read_npz_tlen(npz_path)
108
+ if tlen is not None:
109
+ nonzero = np.abs(tlen)[np.abs(tlen) > 0]
110
+ if len(nonzero) > 0:
111
+ print(
112
+ f" Fragment len: median {np.median(nonzero):.0f}, "
113
+ f"mean {np.mean(nonzero):.0f}, "
114
+ f"range [{nonzero.min()}, {nonzero.max()}]"
115
+ )
116
+ else:
117
+ print(" Fragment len: all zero (single-end data)")
118
+
106
119
  if meta and "cpg_index_crc32" in meta:
107
120
  print(f" CpG index CRC32: {meta['cpg_index_crc32']}")
108
121
  if meta and "bam2tensor_version" in meta:
@@ -27,10 +27,13 @@ Example:
27
27
  hg38
28
28
  """
29
29
 
30
+ import io
30
31
  import json
31
32
  import zipfile
32
33
  import zlib
33
34
 
35
+ import numpy as np
36
+
34
37
  from bam2tensor.embedding import GenomeMethylationEmbedding
35
38
 
36
39
 
@@ -112,3 +115,48 @@ def read_npz_metadata(npz_path: str) -> dict | None:
112
115
  if "metadata.json" in zf.namelist():
113
116
  return json.loads(zf.read("metadata.json"))
114
117
  return None
118
+
119
+
120
+ def write_npz_tlen(npz_path: str, tlen: np.ndarray) -> None:
121
+ """Append a ``tlen.npy`` entry to an existing ``.npz`` file.
122
+
123
+ The array is serialised with :func:`numpy.save` and appended to the
124
+ ZIP archive. ``scipy.sparse.load_npz`` ignores this extra entry, so
125
+ the file remains compatible with existing sparse-matrix code.
126
+
127
+ Args:
128
+ npz_path: Path to the ``.npz`` file (must already exist).
129
+ tlen: A 1-D numpy array of per-read template lengths.
130
+
131
+ Example:
132
+ >>> # xdoctest: +SKIP
133
+ >>> write_npz_tlen("out.npz", np.array([300, -300, 0], dtype=np.int32))
134
+ """
135
+ buf = io.BytesIO()
136
+ np.save(buf, tlen)
137
+ with zipfile.ZipFile(npz_path, "a") as zf:
138
+ zf.writestr("tlen.npy", buf.getvalue())
139
+
140
+
141
+ def read_npz_tlen(npz_path: str) -> np.ndarray | None:
142
+ """Read the ``tlen.npy`` entry from a ``.npz`` file.
143
+
144
+ Args:
145
+ npz_path: Path to the ``.npz`` file.
146
+
147
+ Returns:
148
+ The per-read template-length array, or ``None`` if the file does
149
+ not contain a ``tlen.npy`` entry (e.g. files produced by older
150
+ versions).
151
+
152
+ Example:
153
+ >>> # xdoctest: +SKIP
154
+ >>> tlen = read_npz_tlen("sample.methylation.npz")
155
+ >>> if tlen is not None:
156
+ ... print(f"Median fragment length: {np.median(np.abs(tlen)):.0f}")
157
+ """
158
+ with zipfile.ZipFile(npz_path, "r") as zf:
159
+ if "tlen.npy" in zf.namelist():
160
+ buf = io.BytesIO(zf.read("tlen.npy"))
161
+ return np.load(buf, allow_pickle=False)
162
+ return None
@@ -54,10 +54,12 @@ def test_duplication_bug(tmp_path):
54
54
  pysam.index(str(bam_path))
55
55
 
56
56
  # 4. Run extraction
57
- matrix = functions.extract_methylation_data_from_bam(
57
+ result = functions.extract_methylation_data_from_bam(
58
58
  input_bam=str(bam_path), genome_methylation_embedding=emb, debug=True
59
59
  )
60
60
 
61
- print(f"Matrix shape: {matrix.shape}")
61
+ print(f"Matrix shape: {result.matrix.shape}")
62
62
  # We expect exactly 1 row because there is only 1 read
63
- assert matrix.shape[0] == 1, f"Expected 1 row (read), got {matrix.shape[0]}"
63
+ assert (
64
+ result.matrix.shape[0] == 1
65
+ ), f"Expected 1 row (read), got {result.matrix.shape[0]}"