bam2tensor 2.3__tar.gz → 2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/constraints.txt +1 -1
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/docs.yml +2 -2
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/labeler.yml +1 -1
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/release.yml +1 -1
- {bam2tensor-2.3 → bam2tensor-2.5}/CLAUDE.md +3 -1
- {bam2tensor-2.3 → bam2tensor-2.5}/PKG-INFO +40 -4
- {bam2tensor-2.3 → bam2tensor-2.5}/README.md +39 -3
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/reference.md +8 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/pyproject.toml +1 -1
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/__init__.py +3 -3
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/__main__.py +87 -6
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/functions.py +298 -57
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/inspect.py +33 -1
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/metadata.py +48 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_duplication.py +5 -3
- bam2tensor-2.5/tests/test_filters.py +568 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_functions.py +233 -79
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_inspect.py +44 -4
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_main.py +33 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_metadata.py +62 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/uv.lock +104 -108
- {bam2tensor-2.3 → bam2tensor-2.5}/.darglint +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.editorconfig +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.gitattributes +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/dependabot.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/labels.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.gitignore +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/LICENSE +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/SECURITY.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/Makefile +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/conf.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/contributing.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/index.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/license.md +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/make.bat +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/noxfile.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/embedding.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/__init__.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_embedding.py +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.3 → bam2tensor-2.5}/tests/test_reference.py +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
nox==2026.2.9
|
|
2
|
-
uv==0.
|
|
2
|
+
uv==0.11.2
|
|
@@ -76,7 +76,7 @@ jobs:
|
|
|
76
76
|
repository-url: https://test.pypi.org/legacy/
|
|
77
77
|
|
|
78
78
|
- name: Publish the release notes
|
|
79
|
-
uses: release-drafter/release-drafter@
|
|
79
|
+
uses: release-drafter/release-drafter@v7.1.1
|
|
80
80
|
with:
|
|
81
81
|
publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
|
|
82
82
|
tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
|
|
@@ -40,7 +40,7 @@ uv run mypy src
|
|
|
40
40
|
|
|
41
41
|
```
|
|
42
42
|
src/bam2tensor/
|
|
43
|
-
__init__.py # Package version (2.
|
|
43
|
+
__init__.py # Package version (2.5)
|
|
44
44
|
__main__.py # Click CLI entry point (bam2tensor command)
|
|
45
45
|
inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
|
|
46
46
|
embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
|
|
@@ -117,6 +117,8 @@ xdoctest validates code examples in docstrings. Important rules:
|
|
|
117
117
|
- Columns = CpG sites (ordered by genomic position, determined by reference genome)
|
|
118
118
|
- Values: 1 (methylated), 0 (unmethylated), -1 (no data/indels/SNVs)
|
|
119
119
|
- Each .npz file contains a `metadata.json` entry with provenance info (genome name, version, CpG index CRC32, expected chromosomes). Read via `bam2tensor.metadata.read_npz_metadata()`.
|
|
120
|
+
- Each .npz file contains a `tlen.npy` entry with per-read signed template length (BAM TLEN field) as int32. Read via `bam2tensor.metadata.read_npz_tlen()`. Returns `None` for files from older versions.
|
|
121
|
+
- `extract_methylation_data_from_bam()` returns an `ExtractionResult` NamedTuple with `.matrix` (sparse COO) and `.tlen` (numpy int32 array).
|
|
120
122
|
|
|
121
123
|
### Methylation Strand Detection
|
|
122
124
|
- Bismark aligner: XM tag (Z/z for methylated/unmethylated CpG; no strand filtering needed)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bam2tensor
|
|
3
|
-
Version: 2.3
|
|
3
|
+
Version: 2.5
|
|
4
4
|
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
5
|
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
6
|
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
@@ -74,6 +74,7 @@ Description-Content-Type: text/markdown
|
|
|
74
74
|
- [Command-Line Options](#command-line-options)
|
|
75
75
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
76
76
|
- [Output Data Structure](#output-data-structure)
|
|
77
|
+
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
77
78
|
- [Embedded Metadata](#embedded-metadata)
|
|
78
79
|
- [Loading Output Files](#loading-output-files)
|
|
79
80
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
@@ -97,6 +98,7 @@ Description-Content-Type: text/markdown
|
|
|
97
98
|
- **Batch Processing**: Process multiple BAM files with directory recursion
|
|
98
99
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
99
100
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
101
|
+
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
100
102
|
|
|
101
103
|
## Requirements
|
|
102
104
|
|
|
@@ -299,8 +301,9 @@ sample.methylation.npz
|
|
|
299
301
|
Reads: 1,423,891
|
|
300
302
|
CpG sites: 28,217,448
|
|
301
303
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
304
|
+
Fragment len: median 167, mean 182, range [50, 600]
|
|
302
305
|
CpG index CRC32: a1b2c3d4
|
|
303
|
-
bam2tensor: v2.
|
|
306
|
+
bam2tensor: v2.4
|
|
304
307
|
File size: 14.2 MB
|
|
305
308
|
```
|
|
306
309
|
|
|
@@ -333,6 +336,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
|
|
|
333
336
|
|
|
334
337
|
Note: The matrix uses SciPy's COO sparse format, which explicitly stores all non-zero values. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
335
338
|
|
|
339
|
+
### Per-Read Fragment Length (TLEN)
|
|
340
|
+
|
|
341
|
+
Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
|
|
342
|
+
|
|
343
|
+
- One `int32` value per read (row), in the same order as the sparse matrix rows
|
|
344
|
+
- Signed: positive for the leftmost read in a pair, negative for the rightmost
|
|
345
|
+
- Zero for single-end reads or reads with unmapped mates
|
|
346
|
+
- Use `abs(tlen)` to get fragment lengths
|
|
347
|
+
|
|
348
|
+
```python
|
|
349
|
+
from bam2tensor.metadata import read_npz_tlen
|
|
350
|
+
import numpy as np
|
|
351
|
+
|
|
352
|
+
tlen = read_npz_tlen("sample.methylation.npz")
|
|
353
|
+
if tlen is not None:
|
|
354
|
+
frag_lengths = np.abs(tlen)
|
|
355
|
+
nonzero = frag_lengths[frag_lengths > 0]
|
|
356
|
+
print(f"Median fragment length: {np.median(nonzero):.0f}")
|
|
357
|
+
print(f"Mean fragment length: {np.mean(nonzero):.0f}")
|
|
358
|
+
```
|
|
359
|
+
|
|
336
360
|
### Embedded Metadata
|
|
337
361
|
|
|
338
362
|
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
@@ -548,10 +572,22 @@ extract_methylation_data_from_bam(
|
|
|
548
572
|
quality_limit: int = 20, # Minimum MAPQ
|
|
549
573
|
verbose: bool = False, # Enable verbose output
|
|
550
574
|
debug: bool = False # Enable debug output
|
|
551
|
-
) ->
|
|
575
|
+
) -> ExtractionResult
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
**Returns:** An `ExtractionResult` named tuple with two fields:
|
|
579
|
+
- `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
|
|
580
|
+
- `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
|
|
581
|
+
|
|
582
|
+
### `bam2tensor.metadata.read_npz_tlen`
|
|
583
|
+
|
|
584
|
+
Read per-read template lengths from a `.methylation.npz` file.
|
|
585
|
+
|
|
586
|
+
```python
|
|
587
|
+
read_npz_tlen(npz_path: str) -> np.ndarray | None
|
|
552
588
|
```
|
|
553
589
|
|
|
554
|
-
**Returns:**
|
|
590
|
+
**Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
|
|
555
591
|
|
|
556
592
|
## Contributing
|
|
557
593
|
|
|
@@ -41,6 +41,7 @@
|
|
|
41
41
|
- [Command-Line Options](#command-line-options)
|
|
42
42
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
43
43
|
- [Output Data Structure](#output-data-structure)
|
|
44
|
+
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
44
45
|
- [Embedded Metadata](#embedded-metadata)
|
|
45
46
|
- [Loading Output Files](#loading-output-files)
|
|
46
47
|
- [Converting to Dense Arrays](#converting-to-dense-arrays)
|
|
@@ -64,6 +65,7 @@
|
|
|
64
65
|
- **Batch Processing**: Process multiple BAM files with directory recursion
|
|
65
66
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
66
67
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
68
|
+
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
67
69
|
|
|
68
70
|
## Requirements
|
|
69
71
|
|
|
@@ -266,8 +268,9 @@ sample.methylation.npz
|
|
|
266
268
|
Reads: 1,423,891
|
|
267
269
|
CpG sites: 28,217,448
|
|
268
270
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
271
|
+
Fragment len: median 167, mean 182, range [50, 600]
|
|
269
272
|
CpG index CRC32: a1b2c3d4
|
|
270
|
-
bam2tensor: v2.
|
|
273
|
+
bam2tensor: v2.4
|
|
271
274
|
File size: 14.2 MB
|
|
272
275
|
```
|
|
273
276
|
|
|
@@ -300,6 +303,27 @@ The **column dimension is determined entirely by the reference genome**: it equa
|
|
|
300
303
|
|
|
301
304
|
Note: The matrix uses SciPy's COO sparse format, which explicitly stores all non-zero values. Unmethylated sites (value `0`) **are** stored as explicit entries. Positions not covered by a read are simply absent from the matrix (implicit zero, which is distinct from the explicit `0` = unmethylated).
|
|
302
305
|
|
|
306
|
+
### Per-Read Fragment Length (TLEN)
|
|
307
|
+
|
|
308
|
+
Each `.methylation.npz` file includes a `tlen.npy` entry inside the ZIP archive containing the signed BAM template length (TLEN) for every read in the matrix. This enables joint fragment-length and methylation analysis without re-processing the BAM.
|
|
309
|
+
|
|
310
|
+
- One `int32` value per read (row), in the same order as the sparse matrix rows
|
|
311
|
+
- Signed: positive for the leftmost read in a pair, negative for the rightmost
|
|
312
|
+
- Zero for single-end reads or reads with unmapped mates
|
|
313
|
+
- Use `abs(tlen)` to get fragment lengths
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
from bam2tensor.metadata import read_npz_tlen
|
|
317
|
+
import numpy as np
|
|
318
|
+
|
|
319
|
+
tlen = read_npz_tlen("sample.methylation.npz")
|
|
320
|
+
if tlen is not None:
|
|
321
|
+
frag_lengths = np.abs(tlen)
|
|
322
|
+
nonzero = frag_lengths[frag_lengths > 0]
|
|
323
|
+
print(f"Median fragment length: {np.median(nonzero):.0f}")
|
|
324
|
+
print(f"Mean fragment length: {np.mean(nonzero):.0f}")
|
|
325
|
+
```
|
|
326
|
+
|
|
303
327
|
### Embedded Metadata
|
|
304
328
|
|
|
305
329
|
Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP archive with provenance information:
|
|
@@ -515,10 +539,22 @@ extract_methylation_data_from_bam(
|
|
|
515
539
|
quality_limit: int = 20, # Minimum MAPQ
|
|
516
540
|
verbose: bool = False, # Enable verbose output
|
|
517
541
|
debug: bool = False # Enable debug output
|
|
518
|
-
) ->
|
|
542
|
+
) -> ExtractionResult
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
**Returns:** An `ExtractionResult` named tuple with two fields:
|
|
546
|
+
- `matrix`: A SciPy COO sparse matrix with shape (n_reads, n_cpg_sites)
|
|
547
|
+
- `tlen`: A 1-D numpy `int32` array of shape (n_reads,) containing the signed template length (BAM TLEN field) for each read
|
|
548
|
+
|
|
549
|
+
### `bam2tensor.metadata.read_npz_tlen`
|
|
550
|
+
|
|
551
|
+
Read per-read template lengths from a `.methylation.npz` file.
|
|
552
|
+
|
|
553
|
+
```python
|
|
554
|
+
read_npz_tlen(npz_path: str) -> np.ndarray | None
|
|
519
555
|
```
|
|
520
556
|
|
|
521
|
-
**Returns:**
|
|
557
|
+
**Returns:** The per-read template-length array, or `None` if the file was produced by an older version of bam2tensor.
|
|
522
558
|
|
|
523
559
|
## Contributing
|
|
524
560
|
|
|
@@ -32,6 +32,14 @@ bam2tensor.functions module
|
|
|
32
32
|
:show-inheritance:
|
|
33
33
|
:undoc-members:
|
|
34
34
|
|
|
35
|
+
bam2tensor.metadata module
|
|
36
|
+
--------------------------
|
|
37
|
+
|
|
38
|
+
.. automodule:: bam2tensor.metadata
|
|
39
|
+
:members:
|
|
40
|
+
:show-inheritance:
|
|
41
|
+
:undoc-members:
|
|
42
|
+
|
|
35
43
|
bam2tensor.reference module
|
|
36
44
|
---------------------------
|
|
37
45
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.3"
|
|
3
|
+
version = "2.5"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
@@ -30,14 +30,14 @@ Example:
|
|
|
30
30
|
)
|
|
31
31
|
|
|
32
32
|
# Extract methylation data
|
|
33
|
-
|
|
33
|
+
result = extract_methylation_data_from_bam(
|
|
34
34
|
input_bam="/path/to/sample.bam",
|
|
35
35
|
genome_methylation_embedding=embedding,
|
|
36
36
|
)
|
|
37
37
|
|
|
38
38
|
# Save to file
|
|
39
39
|
import scipy.sparse
|
|
40
|
-
scipy.sparse.save_npz("output.npz",
|
|
40
|
+
scipy.sparse.save_npz("output.npz", result.matrix)
|
|
41
41
|
|
|
42
42
|
Output Format:
|
|
43
43
|
The output is a SciPy sparse COO matrix where:
|
|
@@ -50,4 +50,4 @@ See Also:
|
|
|
50
50
|
- https://mcwdsi.github.io/bam2tensor for full documentation
|
|
51
51
|
"""
|
|
52
52
|
|
|
53
|
-
__version__ = "2.3"
|
|
53
|
+
__version__ = "2.5"
|
|
@@ -38,7 +38,11 @@ from bam2tensor.functions import (
|
|
|
38
38
|
detect_aligner,
|
|
39
39
|
extract_methylation_data_from_bam,
|
|
40
40
|
)
|
|
41
|
-
from bam2tensor.metadata import
|
|
41
|
+
from bam2tensor.metadata import (
|
|
42
|
+
compute_cpg_index_crc32,
|
|
43
|
+
write_npz_metadata,
|
|
44
|
+
write_npz_tlen,
|
|
45
|
+
)
|
|
42
46
|
from bam2tensor.reference import (
|
|
43
47
|
KNOWN_GENOMES,
|
|
44
48
|
download_reference as download_reference_fn,
|
|
@@ -225,6 +229,43 @@ def validate_input_output(
|
|
|
225
229
|
default=20,
|
|
226
230
|
type=int,
|
|
227
231
|
)
|
|
232
|
+
@click.option(
|
|
233
|
+
"--filter-non-converted",
|
|
234
|
+
help=(
|
|
235
|
+
"Drop reads with >= --non-converted-threshold retained non-CpG "
|
|
236
|
+
"cytosines, the signature of incomplete bisulfite/EM-seq conversion "
|
|
237
|
+
"(port of nebiolabs/mark-nonconverted-reads). Default: off."
|
|
238
|
+
),
|
|
239
|
+
is_flag=True,
|
|
240
|
+
)
|
|
241
|
+
@click.option(
|
|
242
|
+
"--non-converted-threshold",
|
|
243
|
+
help=(
|
|
244
|
+
"Minimum count of retained non-CpG cytosines to drop a read "
|
|
245
|
+
"(default = 3, matches NEB mark-nonconverted-reads)."
|
|
246
|
+
),
|
|
247
|
+
default=3,
|
|
248
|
+
type=int,
|
|
249
|
+
)
|
|
250
|
+
@click.option(
|
|
251
|
+
"--filter-em-overconversion",
|
|
252
|
+
help=(
|
|
253
|
+
"Drop EM-seq reads whose covered CpGs are all called unmethylated "
|
|
254
|
+
"and cover at least --em-overconversion-min-cpgs sites (heuristic "
|
|
255
|
+
"for the fragment-level over-conversion artifact described in "
|
|
256
|
+
"Loyfer et al. bioRxiv 2026.03.24.713040). Default: off."
|
|
257
|
+
),
|
|
258
|
+
is_flag=True,
|
|
259
|
+
)
|
|
260
|
+
@click.option(
|
|
261
|
+
"--em-overconversion-min-cpgs",
|
|
262
|
+
help=(
|
|
263
|
+
"Minimum covered CpG count required before the EM over-conversion "
|
|
264
|
+
"filter will drop a read (default = 3)."
|
|
265
|
+
),
|
|
266
|
+
default=3,
|
|
267
|
+
type=int,
|
|
268
|
+
)
|
|
228
269
|
@click.option("--verbose", help="Verbose output.", is_flag=True)
|
|
229
270
|
@click.option("--skip-cache", help="De-novo generate CpG sites (slow).", is_flag=True)
|
|
230
271
|
@click.option(
|
|
@@ -259,6 +300,10 @@ def main(
|
|
|
259
300
|
expected_chromosomes: str | None,
|
|
260
301
|
reference_fasta: str | None,
|
|
261
302
|
quality_limit: int,
|
|
303
|
+
filter_non_converted: bool,
|
|
304
|
+
non_converted_threshold: int,
|
|
305
|
+
filter_em_overconversion: bool,
|
|
306
|
+
em_overconversion_min_cpgs: int,
|
|
262
307
|
verbose: bool,
|
|
263
308
|
skip_cache: bool,
|
|
264
309
|
debug: bool,
|
|
@@ -296,6 +341,17 @@ def main(
|
|
|
296
341
|
``--download-reference`` is used.
|
|
297
342
|
quality_limit: Minimum mapping quality (MAPQ) threshold. Reads below
|
|
298
343
|
this quality are excluded.
|
|
344
|
+
filter_non_converted: If True, drop reads with at least
|
|
345
|
+
``non_converted_threshold`` retained non-CpG cytosines —
|
|
346
|
+
indicating incomplete bisulfite/EM-seq conversion.
|
|
347
|
+
non_converted_threshold: Threshold used by the non-converted
|
|
348
|
+
read filter.
|
|
349
|
+
filter_em_overconversion: If True, drop reads whose covered CpGs
|
|
350
|
+
are all called unmethylated and cover at least
|
|
351
|
+
``em_overconversion_min_cpgs`` sites — heuristic for EM-seq
|
|
352
|
+
fragment-level over-conversion (Loyfer et al. 2026).
|
|
353
|
+
em_overconversion_min_cpgs: Minimum covered CpG count required
|
|
354
|
+
before the over-conversion filter will drop a read.
|
|
299
355
|
verbose: If True, print detailed progress information.
|
|
300
356
|
skip_cache: If True, regenerate the CpG site index even if a cache
|
|
301
357
|
file exists.
|
|
@@ -378,6 +434,16 @@ def main(
|
|
|
378
434
|
print(f" Reference: {reference_fasta}")
|
|
379
435
|
print(f" Chromosomes: {chrom_display}")
|
|
380
436
|
print(f" Quality limit: MAPQ >= {quality_limit}")
|
|
437
|
+
if filter_non_converted:
|
|
438
|
+
print(
|
|
439
|
+
f" Filters: non-converted reads (>= "
|
|
440
|
+
f"{non_converted_threshold} retained non-CpG Cs)"
|
|
441
|
+
)
|
|
442
|
+
if filter_em_overconversion:
|
|
443
|
+
print(
|
|
444
|
+
f" EM over-conversion (all-unmethylated, >= "
|
|
445
|
+
f"{em_overconversion_min_cpgs} CpGs)"
|
|
446
|
+
)
|
|
381
447
|
if output_dir:
|
|
382
448
|
print(f" Output dir: {output_dir}")
|
|
383
449
|
else:
|
|
@@ -440,10 +506,14 @@ def main(
|
|
|
440
506
|
# Extract
|
|
441
507
|
print(" Extracting methylation data...")
|
|
442
508
|
try:
|
|
443
|
-
|
|
509
|
+
extraction_result = extract_methylation_data_from_bam(
|
|
444
510
|
input_bam=input_bam,
|
|
445
511
|
genome_methylation_embedding=genome_methylation_embedding,
|
|
446
512
|
quality_limit=quality_limit,
|
|
513
|
+
filter_non_converted=filter_non_converted,
|
|
514
|
+
non_converted_threshold=non_converted_threshold,
|
|
515
|
+
filter_em_overconversion=filter_em_overconversion,
|
|
516
|
+
em_overconversion_min_cpgs=em_overconversion_min_cpgs,
|
|
447
517
|
verbose=verbose,
|
|
448
518
|
debug=debug,
|
|
449
519
|
)
|
|
@@ -453,16 +523,17 @@ def main(
|
|
|
453
523
|
continue
|
|
454
524
|
|
|
455
525
|
# Matrix stats
|
|
456
|
-
n_reads =
|
|
457
|
-
n_cpgs =
|
|
458
|
-
n_data =
|
|
526
|
+
n_reads = extraction_result.matrix.shape[0]
|
|
527
|
+
n_cpgs = extraction_result.matrix.shape[1]
|
|
528
|
+
n_data = extraction_result.matrix.nnz
|
|
459
529
|
print(
|
|
460
530
|
f" Result: {n_reads:,} reads x {n_cpgs:,} CpG sites"
|
|
461
531
|
f" ({n_data:,} data points)"
|
|
462
532
|
)
|
|
463
533
|
|
|
464
534
|
# Save
|
|
465
|
-
scipy.sparse.save_npz(output_file,
|
|
535
|
+
scipy.sparse.save_npz(output_file, extraction_result.matrix, compressed=True)
|
|
536
|
+
write_npz_tlen(output_file, extraction_result.tlen)
|
|
466
537
|
write_npz_metadata(
|
|
467
538
|
output_file,
|
|
468
539
|
{
|
|
@@ -471,6 +542,16 @@ def main(
|
|
|
471
542
|
"expected_chromosomes": chrom_list,
|
|
472
543
|
"total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
|
|
473
544
|
"cpg_index_crc32": cpg_crc32,
|
|
545
|
+
"filters": {
|
|
546
|
+
"non_converted_reads": {
|
|
547
|
+
"enabled": filter_non_converted,
|
|
548
|
+
"threshold": non_converted_threshold,
|
|
549
|
+
},
|
|
550
|
+
"em_overconversion": {
|
|
551
|
+
"enabled": filter_em_overconversion,
|
|
552
|
+
"min_cpgs": em_overconversion_min_cpgs,
|
|
553
|
+
},
|
|
554
|
+
},
|
|
474
555
|
},
|
|
475
556
|
)
|
|
476
557
|
print(f" Output: {output_file}")
|