bam2tensor 2.6__tar.gz → 2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.6 → bam2tensor-2.7}/PKG-INFO +1 -1
- {bam2tensor-2.6 → bam2tensor-2.7}/pyproject.toml +1 -1
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/__init__.py +1 -1
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/embedding.py +70 -23
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/metadata.py +29 -2
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_embedding.py +221 -1
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_inspect.py +2 -2
- {bam2tensor-2.6 → bam2tensor-2.7}/uv.lock +1 -1
- {bam2tensor-2.6 → bam2tensor-2.7}/.darglint +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.editorconfig +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.gitattributes +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/dependabot.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/labels.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/constraints.txt +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/docs.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/labeler.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/release.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.gitignore +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/CLAUDE.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/LICENSE +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/README.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/SECURITY.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/Makefile +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/conf.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/contributing.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/index.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/license.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/make.bat +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/reference.md +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/noxfile.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/__main__.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/functions.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/inspect.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/__init__.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_duplication.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_filters.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_functions.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_main.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_metadata.py +0 -0
- {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_reference.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bam2tensor
|
|
3
|
-
Version: 2.6
|
|
3
|
+
Version: 2.7
|
|
4
4
|
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
5
|
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
6
|
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.6"
|
|
3
|
+
version = "2.7"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
@@ -45,6 +45,9 @@ import numpy as np
|
|
|
45
45
|
from tqdm import tqdm
|
|
46
46
|
from Bio import SeqIO
|
|
47
47
|
|
|
48
|
+
from bam2tensor import __version__
|
|
49
|
+
from bam2tensor.metadata import compute_fasta_sha256
|
|
50
|
+
|
|
48
51
|
|
|
49
52
|
class GenomeMethylationEmbedding:
|
|
50
53
|
"""Manages CpG site positions and coordinate conversions for a reference genome.
|
|
@@ -173,7 +176,12 @@ class GenomeMethylationEmbedding:
|
|
|
173
176
|
window_size == self.window_size
|
|
174
177
|
), "Window size does not match cached window size!"
|
|
175
178
|
except FileNotFoundError as e:
|
|
176
|
-
|
|
179
|
+
# Stale-cache rejections (version or FASTA SHA-256 mismatch)
|
|
180
|
+
# raise FileNotFoundError too — always surface those so users
|
|
181
|
+
# are not silently regenerating a cache they thought was valid.
|
|
182
|
+
if os.path.exists(self.cache_file):
|
|
183
|
+
print(f"Discarding stale embedding cache: {e}")
|
|
184
|
+
elif self.verbose:
|
|
177
185
|
print("Could not load methylation embedding from cache: " + str(e))
|
|
178
186
|
|
|
179
187
|
if not cache_available:
|
|
@@ -224,6 +232,9 @@ class GenomeMethylationEmbedding:
|
|
|
224
232
|
The cache file is named "{genome_name}.cache.json.gz" and contains:
|
|
225
233
|
- genome_name: The genome identifier
|
|
226
234
|
- fasta_source: Path to the original FASTA file
|
|
235
|
+
- fasta_sha256: SHA-256 of the FASTA file bytes (for cache validation)
|
|
236
|
+
- bam2tensor_version: Version that produced this cache
|
|
237
|
+
- total_cpg_sites: Total CpG count across all included chromosomes
|
|
227
238
|
- expected_chromosomes: List of included chromosomes
|
|
228
239
|
- window_size: The window_size parameter (for compatibility checking)
|
|
229
240
|
- cpg_sites_dict: Dictionary of chromosome -> list of CpG positions
|
|
@@ -241,9 +252,13 @@ class GenomeMethylationEmbedding:
|
|
|
241
252
|
|
|
242
253
|
assert len(self.cpg_sites_dict) > 0, "CpG sites dict is empty!"
|
|
243
254
|
|
|
255
|
+
total_cpg_sites = sum(len(v) for v in self.cpg_sites_dict.values())
|
|
244
256
|
cache_data = {
|
|
245
257
|
"genome_name": self.genome_name,
|
|
246
258
|
"fasta_source": self.fasta_source,
|
|
259
|
+
"fasta_sha256": compute_fasta_sha256(self.fasta_source),
|
|
260
|
+
"bam2tensor_version": __version__,
|
|
261
|
+
"total_cpg_sites": total_cpg_sites,
|
|
247
262
|
"expected_chromosomes": self.expected_chromosomes,
|
|
248
263
|
"window_size": self.window_size,
|
|
249
264
|
"cpg_sites_dict": self.cpg_sites_dict,
|
|
@@ -263,38 +278,66 @@ class GenomeMethylationEmbedding:
|
|
|
263
278
|
restore all CpG site data. If successful, this avoids the slow
|
|
264
279
|
FASTA parsing step.
|
|
265
280
|
|
|
281
|
+
Provenance is validated before the cached data is trusted: the
|
|
282
|
+
cache must have been written by the same major.minor of
|
|
283
|
+
bam2tensor and must reference a FASTA file with the same SHA-256
|
|
284
|
+
as the current ``fasta_source``. A stale cache is rejected with
|
|
285
|
+
a ``FileNotFoundError`` so the caller falls through to a fresh
|
|
286
|
+
FASTA parse and overwrites the stale cache on save.
|
|
287
|
+
|
|
266
288
|
Returns:
|
|
267
289
|
True if the cache was successfully loaded.
|
|
268
290
|
|
|
269
291
|
Raises:
|
|
270
|
-
FileNotFoundError: If the cache file does not exist
|
|
292
|
+
FileNotFoundError: If the cache file does not exist, or if
|
|
293
|
+
the cache is stale (version mismatch or FASTA SHA-256
|
|
294
|
+
mismatch).
|
|
271
295
|
|
|
272
296
|
Note:
|
|
273
|
-
After loading, the caller should verify that
|
|
274
|
-
and window_size match the current
|
|
275
|
-
overwrites those attributes
|
|
297
|
+
After loading, the caller should verify that
|
|
298
|
+
``expected_chromosomes`` and ``window_size`` match the current
|
|
299
|
+
configuration, as this method overwrites those attributes
|
|
300
|
+
with cached values.
|
|
276
301
|
"""
|
|
277
302
|
|
|
278
|
-
if os.path.exists(self.cache_file):
|
|
279
|
-
|
|
280
|
-
print(f"\tReading embedding from cache: {self.cache_file}")
|
|
303
|
+
if not os.path.exists(self.cache_file):
|
|
304
|
+
raise FileNotFoundError("No cache of embedding found.")
|
|
281
305
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
with gzip.open(self.cache_file, "rt") as f:
|
|
285
|
-
self.cache_data = json.load(f)
|
|
306
|
+
if self.verbose:
|
|
307
|
+
print(f"\tReading embedding from cache: {self.cache_file}")
|
|
286
308
|
|
|
287
|
-
|
|
288
|
-
self.
|
|
289
|
-
self.fasta_source = self.cache_data["fasta_source"]
|
|
290
|
-
self.expected_chromosomes = self.cache_data["expected_chromosomes"]
|
|
291
|
-
self.window_size = self.cache_data["window_size"]
|
|
292
|
-
self.cpg_sites_dict = self.cache_data["cpg_sites_dict"]
|
|
309
|
+
with gzip.open(self.cache_file, "rt") as f:
|
|
310
|
+
self.cache_data = json.load(f)
|
|
293
311
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
312
|
+
# Validate cache provenance: stale caches predating v2.7 used a
|
|
313
|
+
# case-sensitive CpG search that silently dropped roughly half
|
|
314
|
+
# the CpG sites in soft-masked FASTAs (e.g. UCSC's hg38.fa.gz).
|
|
315
|
+
cached_version = self.cache_data.get("bam2tensor_version")
|
|
316
|
+
if cached_version != __version__:
|
|
317
|
+
raise FileNotFoundError(
|
|
318
|
+
f"Stale cache {self.cache_file!r}: written by bam2tensor "
|
|
319
|
+
f"{cached_version!r}, current is {__version__!r}. "
|
|
320
|
+
"Regenerating."
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
cached_fasta_sha256 = self.cache_data.get("fasta_sha256")
|
|
324
|
+
current_fasta_sha256 = compute_fasta_sha256(self.fasta_source)
|
|
325
|
+
if cached_fasta_sha256 != current_fasta_sha256:
|
|
326
|
+
raise FileNotFoundError(
|
|
327
|
+
f"Stale cache {self.cache_file!r}: FASTA SHA-256 mismatch "
|
|
328
|
+
f"(cache={cached_fasta_sha256}, current={current_fasta_sha256}). "
|
|
329
|
+
"Regenerating."
|
|
330
|
+
)
|
|
331
|
+
|
|
332
|
+
# Load the cached data
|
|
333
|
+
self.genome_name = self.cache_data["genome_name"]
|
|
334
|
+
self.fasta_source = self.cache_data["fasta_source"]
|
|
335
|
+
self.expected_chromosomes = self.cache_data["expected_chromosomes"]
|
|
336
|
+
self.window_size = self.cache_data["window_size"]
|
|
337
|
+
self.cpg_sites_dict = self.cache_data["cpg_sites_dict"]
|
|
338
|
+
|
|
339
|
+
if self.verbose:
|
|
340
|
+
print(f"\tCached genome fasta source: {self.fasta_source}")
|
|
298
341
|
|
|
299
342
|
return True
|
|
300
343
|
|
|
@@ -350,7 +393,11 @@ class GenomeMethylationEmbedding:
|
|
|
350
393
|
if self.verbose:
|
|
351
394
|
tqdm.write(f"\tSkipping chromosome {seqrecord.id}")
|
|
352
395
|
continue
|
|
353
|
-
sequence
|
|
396
|
+
# Upper-case the sequence so soft-masked FASTAs (UCSC's default
|
|
397
|
+
# hg38.fa.gz uses lowercase for RepeatMasker/TRF regions) do not
|
|
398
|
+
# silently drop CpGs in repeats — that is roughly half of all
|
|
399
|
+
# CpGs in the human genome.
|
|
400
|
+
sequence = seqrecord.seq.upper()
|
|
354
401
|
|
|
355
402
|
# Find all CpG sites
|
|
356
403
|
# The pos+1 is because we want to store the 1-based position, because .bed is wild and arguably 1-based maybe:
|
|
@@ -27,17 +27,44 @@ Example:
|
|
|
27
27
|
hg38
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
|
+
import hashlib
|
|
30
31
|
import io
|
|
31
32
|
import json
|
|
32
33
|
import zipfile
|
|
33
34
|
import zlib
|
|
35
|
+
from typing import TYPE_CHECKING
|
|
34
36
|
|
|
35
37
|
import numpy as np
|
|
36
38
|
|
|
37
|
-
|
|
39
|
+
if TYPE_CHECKING:
|
|
40
|
+
# Avoid a runtime circular import: embedding.py imports compute_fasta_sha256
|
|
41
|
+
# from this module, and this module only needs the embedding type for
|
|
42
|
+
# annotations.
|
|
43
|
+
from bam2tensor.embedding import GenomeMethylationEmbedding
|
|
38
44
|
|
|
39
45
|
|
|
40
|
-
def
|
|
46
|
+
def compute_fasta_sha256(fasta_source: str) -> str:
|
|
47
|
+
"""Compute the SHA-256 of a FASTA file's bytes on disk.
|
|
48
|
+
|
|
49
|
+
Used to stamp the CpG-site cache (see
|
|
50
|
+
:py:class:`bam2tensor.embedding.GenomeMethylationEmbedding`) so a
|
|
51
|
+
cache can be rejected when the underlying FASTA changes (e.g. a
|
|
52
|
+
user swaps a soft-masked build for a hard-masked one).
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
fasta_source: Path to the reference FASTA file.
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
The hex-encoded SHA-256 digest of the file's bytes.
|
|
59
|
+
"""
|
|
60
|
+
h = hashlib.sha256()
|
|
61
|
+
with open(fasta_source, "rb") as f:
|
|
62
|
+
for chunk in iter(lambda: f.read(1024 * 1024), b""):
|
|
63
|
+
h.update(chunk)
|
|
64
|
+
return h.hexdigest()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def compute_cpg_index_crc32(embedding: "GenomeMethylationEmbedding") -> str:
|
|
41
68
|
"""Compute a CRC32 checksum over the CpG site positions in an embedding.
|
|
42
69
|
|
|
43
70
|
The checksum captures the exact column mapping of the sparse matrix:
|
|
@@ -1,6 +1,10 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
import json
|
|
1
3
|
import os
|
|
4
|
+
|
|
2
5
|
import pytest
|
|
3
|
-
|
|
6
|
+
|
|
7
|
+
from bam2tensor import __version__, embedding
|
|
4
8
|
|
|
5
9
|
# Generate a fresh, uncached embedding
|
|
6
10
|
test_embedding = embedding.GenomeMethylationEmbedding(
|
|
@@ -241,3 +245,219 @@ def test_save_cache_verbose(tmp_path, capsys) -> None:
|
|
|
241
245
|
assert "Saved embedding cache" in captured.out
|
|
242
246
|
finally:
|
|
243
247
|
os.chdir(original_cwd)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# -- Regression: soft-masked FASTAs must enumerate every CpG -----------------
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _write_softmasked_fasta(path: str) -> None:
|
|
254
|
+
"""Write a tiny mixed-case FASTA with CG/cg/Cg/cG to ``path``.
|
|
255
|
+
|
|
256
|
+
UCSC's default ``hg38.fa.gz`` is soft-masked (lowercase = RepeatMasker
|
|
257
|
+
/ Tandem Repeats Finder). Prior to v2.7 the CpG enumeration used a
|
|
258
|
+
case-sensitive search and silently dropped roughly half the CpGs in
|
|
259
|
+
soft-masked references — this fixture pins the regression.
|
|
260
|
+
"""
|
|
261
|
+
# 1-based positions of the C in each C[gG] (0-based + 1):
|
|
262
|
+
# CG @ pos 6, cg @ pos 11, Cg @ pos 17, cG @ pos 23
|
|
263
|
+
seq = "AAAAACGAAAcgAAAACgAAAAcGAAAAA"
|
|
264
|
+
with open(path, "w") as fh:
|
|
265
|
+
fh.write(">chr1\n")
|
|
266
|
+
fh.write(seq + "\n")
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def test_soft_masked_fasta_enumerates_all_cpgs(tmp_path) -> None:
|
|
270
|
+
"""All CpGs are found regardless of case (regression for v2.7 case fix).
|
|
271
|
+
|
|
272
|
+
Bug: prior to v2.7, ``sequence.find('CG')`` on a soft-masked FASTA
|
|
273
|
+
missed every lowercase or mixed-case ``cg``/``Cg``/``cG``. On real
|
|
274
|
+
UCSC hg38 that silently dropped ~53% of all CpG sites.
|
|
275
|
+
"""
|
|
276
|
+
fasta = tmp_path / "softmasked.fa"
|
|
277
|
+
_write_softmasked_fasta(str(fasta))
|
|
278
|
+
|
|
279
|
+
original_cwd = os.getcwd()
|
|
280
|
+
os.chdir(tmp_path)
|
|
281
|
+
try:
|
|
282
|
+
emb = embedding.GenomeMethylationEmbedding(
|
|
283
|
+
"softmasked_test",
|
|
284
|
+
expected_chromosomes=["chr1"],
|
|
285
|
+
fasta_source=str(fasta),
|
|
286
|
+
window_size=150,
|
|
287
|
+
skip_cache=True,
|
|
288
|
+
verbose=False,
|
|
289
|
+
)
|
|
290
|
+
finally:
|
|
291
|
+
os.chdir(original_cwd)
|
|
292
|
+
|
|
293
|
+
assert emb.total_cpg_sites == 4
|
|
294
|
+
assert emb.cpg_sites_dict["chr1"] == [6, 11, 17, 23]
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# -- Cache provenance: stamp + reject stale ---------------------------------
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _read_cache(path: str) -> dict:
|
|
301
|
+
"""Read and decode a gzipped JSON cache file."""
|
|
302
|
+
with gzip.open(path, "rt") as f:
|
|
303
|
+
return json.load(f)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def test_cache_stamps_provenance(tmp_path) -> None:
|
|
307
|
+
"""Saved cache embeds bam2tensor_version, fasta_sha256, total_cpg_sites."""
|
|
308
|
+
original_cwd = os.getcwd()
|
|
309
|
+
os.chdir(tmp_path)
|
|
310
|
+
try:
|
|
311
|
+
emb = embedding.GenomeMethylationEmbedding(
|
|
312
|
+
"provenance_test",
|
|
313
|
+
expected_chromosomes=["chr1", "chr2", "chr3"],
|
|
314
|
+
fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
|
|
315
|
+
window_size=150,
|
|
316
|
+
skip_cache=False,
|
|
317
|
+
verbose=False,
|
|
318
|
+
)
|
|
319
|
+
data = _read_cache("provenance_test.cache.json.gz")
|
|
320
|
+
finally:
|
|
321
|
+
os.chdir(original_cwd)
|
|
322
|
+
|
|
323
|
+
assert data["bam2tensor_version"] == __version__
|
|
324
|
+
assert data["total_cpg_sites"] == emb.total_cpg_sites
|
|
325
|
+
assert isinstance(data["fasta_sha256"], str)
|
|
326
|
+
assert len(data["fasta_sha256"]) == 64 # hex-encoded SHA-256
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def test_cache_rejected_on_fasta_sha256_mismatch(tmp_path, capsys) -> None:
|
|
330
|
+
"""A cache whose ``fasta_sha256`` no longer matches the current FASTA is discarded.
|
|
331
|
+
|
|
332
|
+
This is the primary safety net for the upgrade to v2.7: any user
|
|
333
|
+
with a pre-v2.7 cache (or any user who switches between soft- and
|
|
334
|
+
hard-masked FASTAs) will get a clean re-parse rather than silently
|
|
335
|
+
misaligned column indices.
|
|
336
|
+
"""
|
|
337
|
+
original_cwd = os.getcwd()
|
|
338
|
+
os.chdir(tmp_path)
|
|
339
|
+
try:
|
|
340
|
+
# First run: generate cache against the real test FASTA.
|
|
341
|
+
emb1 = embedding.GenomeMethylationEmbedding(
|
|
342
|
+
"sha_mismatch_test",
|
|
343
|
+
expected_chromosomes=["chr1", "chr2", "chr3"],
|
|
344
|
+
fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
|
|
345
|
+
window_size=150,
|
|
346
|
+
skip_cache=False,
|
|
347
|
+
verbose=False,
|
|
348
|
+
)
|
|
349
|
+
cache_path = "sha_mismatch_test.cache.json.gz"
|
|
350
|
+
assert os.path.exists(cache_path)
|
|
351
|
+
|
|
352
|
+
# Tamper with the cached fasta_sha256 so it can no longer match
|
|
353
|
+
# the on-disk FASTA. This simulates a user upgrading bam2tensor
|
|
354
|
+
# with an inherited v2.5 cache whose enumeration is stale.
|
|
355
|
+
data = _read_cache(cache_path)
|
|
356
|
+
data["fasta_sha256"] = "0" * 64
|
|
357
|
+
with gzip.open(cache_path, "wt") as f:
|
|
358
|
+
json.dump(data, f)
|
|
359
|
+
|
|
360
|
+
# Second run: the stale cache must be rejected and regenerated.
|
|
361
|
+
emb2 = embedding.GenomeMethylationEmbedding(
|
|
362
|
+
"sha_mismatch_test",
|
|
363
|
+
expected_chromosomes=["chr1", "chr2", "chr3"],
|
|
364
|
+
fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
|
|
365
|
+
window_size=150,
|
|
366
|
+
skip_cache=False,
|
|
367
|
+
verbose=False,
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
captured = capsys.readouterr()
|
|
371
|
+
assert "Discarding stale embedding cache" in captured.out
|
|
372
|
+
assert "FASTA SHA-256 mismatch" in captured.out
|
|
373
|
+
|
|
374
|
+
# Regenerated cache: same content as the first run.
|
|
375
|
+
assert emb2.total_cpg_sites == emb1.total_cpg_sites
|
|
376
|
+
# And the on-disk cache now stamps the real SHA-256, not the tampered one.
|
|
377
|
+
refreshed = _read_cache(cache_path)
|
|
378
|
+
assert refreshed["fasta_sha256"] != "0" * 64
|
|
379
|
+
finally:
|
|
380
|
+
os.chdir(original_cwd)
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def test_cache_rejected_on_version_mismatch(tmp_path, capsys) -> None:
|
|
384
|
+
"""A cache stamped with an older bam2tensor version is discarded."""
|
|
385
|
+
original_cwd = os.getcwd()
|
|
386
|
+
os.chdir(tmp_path)
|
|
387
|
+
try:
|
|
388
|
+
embedding.GenomeMethylationEmbedding(
|
|
389
|
+
"version_mismatch_test",
|
|
390
|
+
expected_chromosomes=["chr1", "chr2", "chr3"],
|
|
391
|
+
fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
|
|
392
|
+
window_size=150,
|
|
393
|
+
skip_cache=False,
|
|
394
|
+
verbose=False,
|
|
395
|
+
)
|
|
396
|
+
cache_path = "version_mismatch_test.cache.json.gz"
|
|
397
|
+
|
|
398
|
+
# Roll the version back to a release that predates the
|
|
399
|
+
# case-sensitivity fix; any non-current version should trigger
|
|
400
|
+
# rejection.
|
|
401
|
+
data = _read_cache(cache_path)
|
|
402
|
+
data["bam2tensor_version"] = "2.5"
|
|
403
|
+
with gzip.open(cache_path, "wt") as f:
|
|
404
|
+
json.dump(data, f)
|
|
405
|
+
|
|
406
|
+
embedding.GenomeMethylationEmbedding(
|
|
407
|
+
"version_mismatch_test",
|
|
408
|
+
expected_chromosomes=["chr1", "chr2", "chr3"],
|
|
409
|
+
fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
|
|
410
|
+
window_size=150,
|
|
411
|
+
skip_cache=False,
|
|
412
|
+
verbose=False,
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
captured = capsys.readouterr()
|
|
416
|
+
assert "Discarding stale embedding cache" in captured.out
|
|
417
|
+
assert "bam2tensor" in captured.out
|
|
418
|
+
assert "2.5" in captured.out
|
|
419
|
+
finally:
|
|
420
|
+
os.chdir(original_cwd)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def test_cache_rejected_when_provenance_absent(tmp_path, capsys) -> None:
|
|
424
|
+
"""A pre-v2.7 cache without provenance fields is treated as stale.
|
|
425
|
+
|
|
426
|
+
Users on v2.5 / v2.6 have caches with no ``bam2tensor_version`` or
|
|
427
|
+
``fasta_sha256``. After upgrading, those caches must be discarded so
|
|
428
|
+
the case-sensitivity fix actually takes effect on their data.
|
|
429
|
+
"""
|
|
430
|
+
original_cwd = os.getcwd()
|
|
431
|
+
os.chdir(tmp_path)
|
|
432
|
+
try:
|
|
433
|
+
embedding.GenomeMethylationEmbedding(
|
|
434
|
+
"legacy_cache_test",
|
|
435
|
+
expected_chromosomes=["chr1", "chr2", "chr3"],
|
|
436
|
+
fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
|
|
437
|
+
window_size=150,
|
|
438
|
+
skip_cache=False,
|
|
439
|
+
verbose=False,
|
|
440
|
+
)
|
|
441
|
+
cache_path = "legacy_cache_test.cache.json.gz"
|
|
442
|
+
|
|
443
|
+
# Strip provenance fields to mimic a v2.5-era cache file.
|
|
444
|
+
data = _read_cache(cache_path)
|
|
445
|
+
data.pop("bam2tensor_version", None)
|
|
446
|
+
data.pop("fasta_sha256", None)
|
|
447
|
+
data.pop("total_cpg_sites", None)
|
|
448
|
+
with gzip.open(cache_path, "wt") as f:
|
|
449
|
+
json.dump(data, f)
|
|
450
|
+
|
|
451
|
+
embedding.GenomeMethylationEmbedding(
|
|
452
|
+
"legacy_cache_test",
|
|
453
|
+
expected_chromosomes=["chr1", "chr2", "chr3"],
|
|
454
|
+
fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
|
|
455
|
+
window_size=150,
|
|
456
|
+
skip_cache=False,
|
|
457
|
+
verbose=False,
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
captured = capsys.readouterr()
|
|
461
|
+
assert "Discarding stale embedding cache" in captured.out
|
|
462
|
+
finally:
|
|
463
|
+
os.chdir(original_cwd)
|
|
@@ -6,7 +6,7 @@ import numpy as np
|
|
|
6
6
|
import scipy.sparse
|
|
7
7
|
from click.testing import CliRunner
|
|
8
8
|
|
|
9
|
-
from bam2tensor import __main__
|
|
9
|
+
from bam2tensor import __main__, __version__
|
|
10
10
|
from bam2tensor.inspect import _format_size
|
|
11
11
|
from bam2tensor.inspect import main as inspect_main
|
|
12
12
|
from bam2tensor.metadata import write_npz_metadata, write_npz_tlen
|
|
@@ -122,7 +122,7 @@ def test_inspect_end_to_end(tmp_path) -> None:
|
|
|
122
122
|
assert result.exit_code == 0
|
|
123
123
|
assert "test" in result.output # genome_name
|
|
124
124
|
assert "CpG index CRC32:" in result.output
|
|
125
|
-
assert "
|
|
125
|
+
assert f"v{__version__}" in result.output
|
|
126
126
|
|
|
127
127
|
|
|
128
128
|
def test_format_size_bytes() -> None:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{bam2tensor-2.6 → bam2tensor-2.7}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|