bam2tensor 2.3__tar.gz → 2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/constraints.txt +1 -1
  2. {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/docs.yml +2 -2
  3. {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/labeler.yml +1 -1
  4. {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/release.yml +1 -1
  5. {bam2tensor-2.3 → bam2tensor-2.5}/CLAUDE.md +3 -1
  6. {bam2tensor-2.3 → bam2tensor-2.5}/PKG-INFO +40 -4
  7. {bam2tensor-2.3 → bam2tensor-2.5}/README.md +39 -3
  8. {bam2tensor-2.3 → bam2tensor-2.5}/docs/reference.md +8 -0
  9. {bam2tensor-2.3 → bam2tensor-2.5}/pyproject.toml +1 -1
  10. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/__init__.py +3 -3
  11. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/__main__.py +87 -6
  12. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/functions.py +298 -57
  13. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/inspect.py +33 -1
  14. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/metadata.py +48 -0
  15. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_duplication.py +5 -3
  16. bam2tensor-2.5/tests/test_filters.py +568 -0
  17. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_functions.py +233 -79
  18. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_inspect.py +44 -4
  19. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_main.py +33 -0
  20. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_metadata.py +62 -0
  21. {bam2tensor-2.3 → bam2tensor-2.5}/uv.lock +104 -108
  22. {bam2tensor-2.3 → bam2tensor-2.5}/.darglint +0 -0
  23. {bam2tensor-2.3 → bam2tensor-2.5}/.editorconfig +0 -0
  24. {bam2tensor-2.3 → bam2tensor-2.5}/.gitattributes +0 -0
  25. {bam2tensor-2.3 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  26. {bam2tensor-2.3 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  27. {bam2tensor-2.3 → bam2tensor-2.5}/.github/actions/setup-env/action.yml +0 -0
  28. {bam2tensor-2.3 → bam2tensor-2.5}/.github/dependabot.yml +0 -0
  29. {bam2tensor-2.3 → bam2tensor-2.5}/.github/labels.yml +0 -0
  30. {bam2tensor-2.3 → bam2tensor-2.5}/.github/release-drafter.yml +0 -0
  31. {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/tests.yml +0 -0
  32. {bam2tensor-2.3 → bam2tensor-2.5}/.gitignore +0 -0
  33. {bam2tensor-2.3 → bam2tensor-2.5}/.pre-commit-config.yaml +0 -0
  34. {bam2tensor-2.3 → bam2tensor-2.5}/CONTRIBUTING.md +0 -0
  35. {bam2tensor-2.3 → bam2tensor-2.5}/LICENSE +0 -0
  36. {bam2tensor-2.3 → bam2tensor-2.5}/SECURITY.md +0 -0
  37. {bam2tensor-2.3 → bam2tensor-2.5}/docs/Makefile +0 -0
  38. {bam2tensor-2.3 → bam2tensor-2.5}/docs/conf.py +0 -0
  39. {bam2tensor-2.3 → bam2tensor-2.5}/docs/contributing.md +0 -0
  40. {bam2tensor-2.3 → bam2tensor-2.5}/docs/index.md +0 -0
  41. {bam2tensor-2.3 → bam2tensor-2.5}/docs/license.md +0 -0
  42. {bam2tensor-2.3 → bam2tensor-2.5}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  43. {bam2tensor-2.3 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.afdesign +0 -0
  44. {bam2tensor-2.3 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.png +0 -0
  45. {bam2tensor-2.3 → bam2tensor-2.5}/docs/make.bat +0 -0
  46. {bam2tensor-2.3 → bam2tensor-2.5}/docs/nano-banana-overview-shrunk.png +0 -0
  47. {bam2tensor-2.3 → bam2tensor-2.5}/docs/templates/package.rst_t +0 -0
  48. {bam2tensor-2.3 → bam2tensor-2.5}/noxfile.py +0 -0
  49. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/embedding.py +0 -0
  50. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/py.typed +0 -0
  51. {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/reference.py +0 -0
  52. {bam2tensor-2.3 → bam2tensor-2.5}/tests/__init__.py +0 -0
  53. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_embedding.py +0 -0
  54. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_fasta.fa +0 -0
  55. {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_reference.py +0 -0
@@ -1,2 +1,2 @@
1
1
  nox==2026.2.9
2
- uv==0.10.7
2
+ uv==0.11.2
@@ -59,8 +59,8 @@ jobs:
59
59
  needs: build
60
60
  steps:
61
61
  - name: Setup Pages
62
- uses: actions/configure-pages@v5
62
+ uses: actions/configure-pages@v6
63
63
 
64
64
  - name: Deploy to GitHub Pages
65
65
  id: deployment
66
- uses: actions/deploy-pages@v4
66
+ uses: actions/deploy-pages@v5
@@ -20,6 +20,6 @@ jobs:
20
20
  uses: actions/checkout@v6
21
21
 
22
22
  - name: Run Labeler
23
- uses: crazy-max/ghaction-github-labeler@v5.3.0
23
+ uses: crazy-max/ghaction-github-labeler@v6.0.0
24
24
  with:
25
25
  skip-delete: true
@@ -76,7 +76,7 @@ jobs:
76
76
  repository-url: https://test.pypi.org/legacy/
77
77
 
78
78
  - name: Publish the release notes
79
- uses: release-drafter/release-drafter@v6.2.0
79
+ uses: release-drafter/release-drafter@v7.1.1
80
80
  with:
81
81
  publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
82
82
  tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
@@ -40,7 +40,7 @@ uv run mypy src
40
40
 
41
41
  ```
42
42
  src/bam2tensor/
43
- __init__.py # Package version (2.3)
43
+ __init__.py # Package version (2.5)
44
44
  __main__.py # Click CLI entry point (bam2tensor command)
45
45
  inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
46
46
  embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
@@ -117,6 +117,8 @@ xdoctest validates code examples in docstrings. Important rules:
117
117
  - Columns = CpG sites (ordered by genomic position, determined by reference genome)
118
118
  - Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
119
119
  - Each .npz file contains a `metadata.json` entry with provenance info (genome name, version, CpG index CRC32, expected chromosomes). Read via `bam2tensor.metadata.read_npz_metadata()`.
120
+ - Each .npz file contains a `tlen.npy` entry with per-read signed template length (BAM TLEN field) as int32. Read via `bam2tensor.metadata.read_npz_tlen()`. Returns `None` for files from older versions.
121
+ - `extract_methylation_data_from_bam()` returns an `ExtractionResult` NamedTuple with `.matrix` (sparse COO) and `.tlen` (numpy int32 array).
120
122
 
121
123
  ### Methylation Strand Detection
122
124
  - Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.3
3
+ Version: 2.5
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -74,6 +74,7 @@ Description-Content-Type: text/markdown
74
74
  - [Command-Line Options](#command-line-options)
75
75
  - [Inspecting Output Files](#inspecting-output-files)
76
76
  - [Output Data Structure](#output-data-structure)
77
+ - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
77
78
  - [Embedded Metadata](#embedded-metadata)
78
79
  - [Loading Output Files](#loading-output-files)
79
80
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
@@ -97,6 +98,7 @@ Description-Content-Type: text/markdown
97
98
  - **Batch Processing**: Process multiple BAM files with directory recursion
98
99
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
99
100
  - **Quality Filtering**: Configurable mapping quality thresholds
101
+ - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
100
102
 
101
103
  ## Requirements
102
104
 
@@ -299,8 +301,9 @@ sample.methylation.npz
299
301
  Reads: 1,423,891
300
302
  CpG sites: 28,217,448
301
303
  Data points: 12,847,322 (sparsity: 99.97%)
304
+ Fragment len: median 167, mean 182, range [50, 600]
302
305
  CpG index CRC32: a1b2c3d4
303
- bam2tensor: v2.3
306
+ bam2tensor: v2.4
304
307
  File size: 14.2 MB
305
308
  ```
306
309
 
@@ -333,6 +336,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
333
336
 
334
337
  Note: The matrix uses SciPy's COO sparse format, which stores only the entries explicitly written to it, and those entries may themselves be zero. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
335
338
 
339
+ ### Per-Read Fragment Length (TLEN)
340
+
341
+ Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
342
+
343
+ - One `int32` value per read (row), in the same order as the sparse matrix rows
344
+ - Signed: positive for the leftmost read in a pair, negative for the rightmost
345
+ - Zero for single-end reads, reads with unmapped mates, or pairs whose mates map to different reference sequences
346
+ - Use `abs(tlen)` to get fragment lengths
347
+
348
+ ```python
349
+ from bam2tensor.metadata import read_npz_tlen
350
+ import numpy as np
351
+
352
+ tlen = read_npz_tlen("sample.methylation.npz")
353
+ if tlen is not None:
354
+ frag_lengths = np.abs(tlen)
355
+ nonzero = frag_lengths[frag_lengths > 0]
356
+ print(f"Median fragment length: {np.median(nonzero):.0f}")
357
+ print(f"Mean fragment length: {np.mean(nonzero):.0f}")
358
+ ```
359
+
336
360
  ### Embedded Metadata
337
361
 
338
362
  Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
@@ -548,10 +572,22 @@ extract_methylation_data_from_bam(
548
572
  quality_limit: int = 20, # Minimum MAPQ
549
573
  verbose: bool = False, # Enable verbose output
550
574
  debug: bool = False # Enable debug output
551
- ) -> scipy.sparse.coo_matrix
575
+ ) -> ExtractionResult
576
+ ```
577
+
578
+ **Returns:** An `ExtractionResult` named tuple with two fields:
579
+ - `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
580
+ - `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
581
+
582
+ ### `bam2tensor.metadata.read_npz_tlen`
583
+
584
+ Read per-read template lengths from a `.methylation.npz` file.
585
+
586
+ ```python
587
+ read_npz_tlen(npz_path: str) -> np.ndarray | None
552
588
  ```
553
589
 
554
- **Returns:** A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites).
590
+ **Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
555
591
 
556
592
  ## Contributing
557
593
 
@@ -41,6 +41,7 @@
41
41
  - [Command-Line Options](#command-line-options)
42
42
  - [Inspecting Output Files](#inspecting-output-files)
43
43
  - [Output Data Structure](#output-data-structure)
44
+ - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
44
45
  - [Embedded Metadata](#embedded-metadata)
45
46
  - [Loading Output Files](#loading-output-files)
46
47
  - [Converting to Dense Arrays](#converting-to-dense-arrays)
@@ -64,6 +65,7 @@
64
65
  - **Batch Processing**: Process multiple BAM files with directory recursion
65
66
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
66
67
  - **Quality Filtering**: Configurable mapping quality thresholds
68
+ - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
67
69
 
68
70
  ## Requirements
69
71
 
@@ -266,8 +268,9 @@ sample.methylation.npz
266
268
  Reads: 1,423,891
267
269
  CpG sites: 28,217,448
268
270
  Data points: 12,847,322 (sparsity: 99.97%)
271
+ Fragment len: median 167, mean 182, range [50, 600]
269
272
  CpG index CRC32: a1b2c3d4
270
- bam2tensor: v2.3
273
+ bam2tensor: v2.4
271
274
  File size: 14.2 MB
272
275
  ```
273
276
 
@@ -300,6 +303,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
300
303
 
301
304
  Note: The matrix uses SciPy's COO sparse format, which stores only the entries explicitly written to it, and those entries may themselves be zero. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
302
305
 
306
+ ### Per-Read Fragment Length (TLEN)
307
+
308
+ Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
309
+
310
+ - One `int32` value per read (row), in the same order as the sparse matrix rows
311
+ - Signed: positive for the leftmost read in a pair, negative for the rightmost
312
+ - Zero for single-end reads, reads with unmapped mates, or pairs whose mates map to different reference sequences
313
+ - Use `abs(tlen)` to get fragment lengths
314
+
315
+ ```python
316
+ from bam2tensor.metadata import read_npz_tlen
317
+ import numpy as np
318
+
319
+ tlen = read_npz_tlen("sample.methylation.npz")
320
+ if tlen is not None:
321
+ frag_lengths = np.abs(tlen)
322
+ nonzero = frag_lengths[frag_lengths > 0]
323
+ print(f"Median fragment length: {np.median(nonzero):.0f}")
324
+ print(f"Mean fragment length: {np.mean(nonzero):.0f}")
325
+ ```
326
+
303
327
  ### Embedded Metadata
304
328
 
305
329
  Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
@@ -515,10 +539,22 @@ extract_methylation_data_from_bam(
515
539
  quality_limit: int = 20, # Minimum MAPQ
516
540
  verbose: bool = False, # Enable verbose output
517
541
  debug: bool = False # Enable debug output
518
- ) -> scipy.sparse.coo_matrix
542
+ ) -> ExtractionResult
543
+ ```
544
+
545
+ **Returns:** An `ExtractionResult` named tuple with two fields:
546
+ - `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
547
+ - `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
548
+
549
+ ### `bam2tensor.metadata.read_npz_tlen`
550
+
551
+ Read per-read template lengths from a `.methylation.npz` file.
552
+
553
+ ```python
554
+ read_npz_tlen(npz_path: str) -> np.ndarray | None
519
555
  ```
520
556
 
521
- **Returns:** A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites).
557
+ **Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
522
558
 
523
559
  ## Contributing
524
560
 
@@ -32,6 +32,14 @@ bam2tensor.functions module
32
32
  :show-inheritance:
33
33
  :undoc-members:
34
34
 
35
+ bam2tensor.metadata module
36
+ --------------------------
37
+
38
+ .. automodule:: bam2tensor.metadata
39
+ :members:
40
+ :show-inheritance:
41
+ :undoc-members:
42
+
35
43
  bam2tensor.reference module
36
44
  ---------------------------
37
45
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.3"
3
+ version = "2.5"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -30,14 +30,14 @@ Example:
30
30
  )
31
31
 
32
32
  # Extract methylation data
33
- sparse_matrix = extract_methylation_data_from_bam(
33
+ result = extract_methylation_data_from_bam(
34
34
  input_bam="/path/to/sample.bam",
35
35
  genome_methylation_embedding=embedding,
36
36
  )
37
37
 
38
38
  # Save to file
39
39
  import scipy.sparse
40
- scipy.sparse.save_npz("output.npz", sparse_matrix)
40
+ scipy.sparse.save_npz("output.npz", result.matrix)
41
41
 
42
42
  Output Format:
43
43
  The output is a SciPy sparse COO matrix where:
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.3"
53
+ __version__ = "2.5"
@@ -38,7 +38,11 @@ from bam2tensor.functions import (
38
38
  detect_aligner,
39
39
  extract_methylation_data_from_bam,
40
40
  )
41
- from bam2tensor.metadata import compute_cpg_index_crc32, write_npz_metadata
41
+ from bam2tensor.metadata import (
42
+ compute_cpg_index_crc32,
43
+ write_npz_metadata,
44
+ write_npz_tlen,
45
+ )
42
46
  from bam2tensor.reference import (
43
47
  KNOWN_GENOMES,
44
48
  download_reference as download_reference_fn,
@@ -225,6 +229,43 @@ def validate_input_output(
225
229
  default=20,
226
230
  type=int,
227
231
  )
232
+ @click.option(
233
+ "--filter-non-converted",
234
+ help=(
235
+ "Drop reads with >= --non-converted-threshold retained non-CpG "
236
+ "cytosines, the signature of incomplete bisulfite/EM-seq conversion "
237
+ "(port of nebiolabs/mark-nonconverted-reads). Default: off."
238
+ ),
239
+ is_flag=True,
240
+ )
241
+ @click.option(
242
+ "--non-converted-threshold",
243
+ help=(
244
+ "Minimum count of retained non-CpG cytosines to drop a read "
245
+ "(default = 3, matches NEB mark-nonconverted-reads)."
246
+ ),
247
+ default=3,
248
+ type=int,
249
+ )
250
+ @click.option(
251
+ "--filter-em-overconversion",
252
+ help=(
253
+ "Drop EM-seq reads whose covered CpGs are all called unmethylated "
254
+ "and cover at least --em-overconversion-min-cpgs sites (heuristic "
255
+ "for the fragment-level over-conversion artifact described in "
256
+ "Loyfer et al. bioRxiv 2026.03.24.713040). Default: off."
257
+ ),
258
+ is_flag=True,
259
+ )
260
+ @click.option(
261
+ "--em-overconversion-min-cpgs",
262
+ help=(
263
+ "Minimum covered CpG count required before the EM over-conversion "
264
+ "filter will drop a read (default = 3)."
265
+ ),
266
+ default=3,
267
+ type=int,
268
+ )
228
269
  @click.option("--verbose", help="Verbose output.", is_flag=True)
229
270
  @click.option("--skip-cache", help="De-novo generate CpG sites (slow).", is_flag=True)
230
271
  @click.option(
@@ -259,6 +300,10 @@ def main(
259
300
  expected_chromosomes: str | None,
260
301
  reference_fasta: str | None,
261
302
  quality_limit: int,
303
+ filter_non_converted: bool,
304
+ non_converted_threshold: int,
305
+ filter_em_overconversion: bool,
306
+ em_overconversion_min_cpgs: int,
262
307
  verbose: bool,
263
308
  skip_cache: bool,
264
309
  debug: bool,
@@ -296,6 +341,17 @@ def main(
296
341
  ``--download-reference`` is used.
297
342
  quality_limit: Minimum mapping quality (MAPQ) threshold. Reads below
298
343
  this quality are excluded.
344
+ filter_non_converted: If True, drop reads with at least
345
+ ``non_converted_threshold`` retained non-CpG cytosines —
346
+ indicating incomplete bisulfite/EM-seq conversion.
347
+ non_converted_threshold: Threshold used by the non-converted
348
+ read filter.
349
+ filter_em_overconversion: If True, drop reads whose covered CpGs
350
+ are all called unmethylated and cover at least
351
+ ``em_overconversion_min_cpgs`` sites — heuristic for EM-seq
352
+ fragment-level over-conversion (Loyfer et al. 2026).
353
+ em_overconversion_min_cpgs: Minimum covered CpG count required
354
+ before the over-conversion filter will drop a read.
299
355
  verbose: If True, print detailed progress information.
300
356
  skip_cache: If True, regenerate the CpG site index even if a cache
301
357
  file exists.
@@ -378,6 +434,16 @@ def main(
378
434
  print(f" Reference: {reference_fasta}")
379
435
  print(f" Chromosomes: {chrom_display}")
380
436
  print(f" Quality limit: MAPQ >= {quality_limit}")
437
+ if filter_non_converted:
438
+ print(
439
+ f" Filters: non-converted reads (>= "
440
+ f"{non_converted_threshold} retained non-CpG Cs)"
441
+ )
442
+ if filter_em_overconversion:
443
+ print(
444
+ f" EM over-conversion (all-unmethylated, >= "
445
+ f"{em_overconversion_min_cpgs} CpGs)"
446
+ )
381
447
  if output_dir:
382
448
  print(f" Output dir: {output_dir}")
383
449
  else:
@@ -440,10 +506,14 @@ def main(
440
506
  # Extract
441
507
  print(" Extracting methylation data...")
442
508
  try:
443
- methylation_data_coo = extract_methylation_data_from_bam(
509
+ extraction_result = extract_methylation_data_from_bam(
444
510
  input_bam=input_bam,
445
511
  genome_methylation_embedding=genome_methylation_embedding,
446
512
  quality_limit=quality_limit,
513
+ filter_non_converted=filter_non_converted,
514
+ non_converted_threshold=non_converted_threshold,
515
+ filter_em_overconversion=filter_em_overconversion,
516
+ em_overconversion_min_cpgs=em_overconversion_min_cpgs,
447
517
  verbose=verbose,
448
518
  debug=debug,
449
519
  )
@@ -453,16 +523,17 @@ def main(
453
523
  continue
454
524
 
455
525
  # Matrix stats
456
- n_reads = methylation_data_coo.shape[0]
457
- n_cpgs = methylation_data_coo.shape[1]
458
- n_data = methylation_data_coo.nnz
526
+ n_reads = extraction_result.matrix.shape[0]
527
+ n_cpgs = extraction_result.matrix.shape[1]
528
+ n_data = extraction_result.matrix.nnz
459
529
  print(
460
530
  f" Result: {n_reads:,} reads x {n_cpgs:,} CpG sites"
461
531
  f" ({n_data:,} data points)"
462
532
  )
463
533
 
464
534
  # Save
465
- scipy.sparse.save_npz(output_file, methylation_data_coo, compressed=True)
535
+ scipy.sparse.save_npz(output_file, extraction_result.matrix, compressed=True)
536
+ write_npz_tlen(output_file, extraction_result.tlen)
466
537
  write_npz_metadata(
467
538
  output_file,
468
539
  {
@@ -471,6 +542,16 @@ def main(
471
542
  "expected_chromosomes": chrom_list,
472
543
  "total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
473
544
  "cpg_index_crc32": cpg_crc32,
545
+ "filters": {
546
+ "non_converted_reads": {
547
+ "enabled": filter_non_converted,
548
+ "threshold": non_converted_threshold,
549
+ },
550
+ "em_overconversion": {
551
+ "enabled": filter_em_overconversion,
552
+ "min_cpgs": em_overconversion_min_cpgs,
553
+ },
554
+ },
474
555
  },
475
556
  )
476
557
  print(f" Output: {output_file}")