bam2tensor 2.6__tar.gz → 2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {bam2tensor-2.6 → bam2tensor-2.7}/PKG-INFO +1 -1
  2. {bam2tensor-2.6 → bam2tensor-2.7}/pyproject.toml +1 -1
  3. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/__init__.py +1 -1
  4. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/embedding.py +70 -23
  5. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/metadata.py +29 -2
  6. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_embedding.py +221 -1
  7. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_inspect.py +2 -2
  8. {bam2tensor-2.6 → bam2tensor-2.7}/uv.lock +1 -1
  9. {bam2tensor-2.6 → bam2tensor-2.7}/.darglint +0 -0
  10. {bam2tensor-2.6 → bam2tensor-2.7}/.editorconfig +0 -0
  11. {bam2tensor-2.6 → bam2tensor-2.7}/.gitattributes +0 -0
  12. {bam2tensor-2.6 → bam2tensor-2.7}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  13. {bam2tensor-2.6 → bam2tensor-2.7}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  14. {bam2tensor-2.6 → bam2tensor-2.7}/.github/actions/setup-env/action.yml +0 -0
  15. {bam2tensor-2.6 → bam2tensor-2.7}/.github/dependabot.yml +0 -0
  16. {bam2tensor-2.6 → bam2tensor-2.7}/.github/labels.yml +0 -0
  17. {bam2tensor-2.6 → bam2tensor-2.7}/.github/release-drafter.yml +0 -0
  18. {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/constraints.txt +0 -0
  19. {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/docs.yml +0 -0
  20. {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/labeler.yml +0 -0
  21. {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/release.yml +0 -0
  22. {bam2tensor-2.6 → bam2tensor-2.7}/.github/workflows/tests.yml +0 -0
  23. {bam2tensor-2.6 → bam2tensor-2.7}/.gitignore +0 -0
  24. {bam2tensor-2.6 → bam2tensor-2.7}/.pre-commit-config.yaml +0 -0
  25. {bam2tensor-2.6 → bam2tensor-2.7}/CLAUDE.md +0 -0
  26. {bam2tensor-2.6 → bam2tensor-2.7}/CONTRIBUTING.md +0 -0
  27. {bam2tensor-2.6 → bam2tensor-2.7}/LICENSE +0 -0
  28. {bam2tensor-2.6 → bam2tensor-2.7}/README.md +0 -0
  29. {bam2tensor-2.6 → bam2tensor-2.7}/SECURITY.md +0 -0
  30. {bam2tensor-2.6 → bam2tensor-2.7}/docs/Makefile +0 -0
  31. {bam2tensor-2.6 → bam2tensor-2.7}/docs/conf.py +0 -0
  32. {bam2tensor-2.6 → bam2tensor-2.7}/docs/contributing.md +0 -0
  33. {bam2tensor-2.6 → bam2tensor-2.7}/docs/index.md +0 -0
  34. {bam2tensor-2.6 → bam2tensor-2.7}/docs/license.md +0 -0
  35. {bam2tensor-2.6 → bam2tensor-2.7}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  36. {bam2tensor-2.6 → bam2tensor-2.7}/docs/logo/bam2tensor-logo.afdesign +0 -0
  37. {bam2tensor-2.6 → bam2tensor-2.7}/docs/logo/bam2tensor-logo.png +0 -0
  38. {bam2tensor-2.6 → bam2tensor-2.7}/docs/make.bat +0 -0
  39. {bam2tensor-2.6 → bam2tensor-2.7}/docs/nano-banana-overview-shrunk.png +0 -0
  40. {bam2tensor-2.6 → bam2tensor-2.7}/docs/reference.md +0 -0
  41. {bam2tensor-2.6 → bam2tensor-2.7}/docs/templates/package.rst_t +0 -0
  42. {bam2tensor-2.6 → bam2tensor-2.7}/noxfile.py +0 -0
  43. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/__main__.py +0 -0
  44. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/functions.py +0 -0
  45. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/inspect.py +0 -0
  46. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/py.typed +0 -0
  47. {bam2tensor-2.6 → bam2tensor-2.7}/src/bam2tensor/reference.py +0 -0
  48. {bam2tensor-2.6 → bam2tensor-2.7}/tests/__init__.py +0 -0
  49. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_duplication.py +0 -0
  50. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_fasta.fa +0 -0
  51. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_filters.py +0 -0
  52. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_functions.py +0 -0
  53. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_main.py +0 -0
  54. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_metadata.py +0 -0
  55. {bam2tensor-2.6 → bam2tensor-2.7}/tests/test_reference.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.6
3
+ Version: 2.7
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.6"
3
+ version = "2.7"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.6"
53
+ __version__ = "2.7"
@@ -45,6 +45,9 @@ import numpy as np
45
45
  from tqdm import tqdm
46
46
  from Bio import SeqIO
47
47
 
48
+ from bam2tensor import __version__
49
+ from bam2tensor.metadata import compute_fasta_sha256
50
+
48
51
 
49
52
  class GenomeMethylationEmbedding:
50
53
  """Manages CpG site positions and coordinate conversions for a reference genome.
@@ -173,7 +176,12 @@ class GenomeMethylationEmbedding:
173
176
  window_size == self.window_size
174
177
  ), "Window size does not match cached window size!"
175
178
  except FileNotFoundError as e:
176
- if self.verbose:
179
+ # Stale-cache rejections (version or FASTA SHA-256 mismatch)
180
+ # raise FileNotFoundError too — always surface those so users
181
+ # are not silently regenerating a cache they thought was valid.
182
+ if os.path.exists(self.cache_file):
183
+ print(f"Discarding stale embedding cache: {e}")
184
+ elif self.verbose:
177
185
  print("Could not load methylation embedding from cache: " + str(e))
178
186
 
179
187
  if not cache_available:
@@ -224,6 +232,9 @@ class GenomeMethylationEmbedding:
224
232
  The cache file is named "{genome_name}.cache.json.gz" and contains:
225
233
  - genome_name: The genome identifier
226
234
  - fasta_source: Path to the original FASTA file
235
+ - fasta_sha256: SHA-256 of the FASTA file bytes (for cache validation)
236
+ - bam2tensor_version: Version that produced this cache
237
+ - total_cpg_sites: Total CpG count across all included chromosomes
227
238
  - expected_chromosomes: List of included chromosomes
228
239
  - window_size: The window_size parameter (for compatibility checking)
229
240
  - cpg_sites_dict: Dictionary of chromosome -> list of CpG positions
@@ -241,9 +252,13 @@ class GenomeMethylationEmbedding:
241
252
 
242
253
  assert len(self.cpg_sites_dict) > 0, "CpG sites dict is empty!"
243
254
 
255
+ total_cpg_sites = sum(len(v) for v in self.cpg_sites_dict.values())
244
256
  cache_data = {
245
257
  "genome_name": self.genome_name,
246
258
  "fasta_source": self.fasta_source,
259
+ "fasta_sha256": compute_fasta_sha256(self.fasta_source),
260
+ "bam2tensor_version": __version__,
261
+ "total_cpg_sites": total_cpg_sites,
247
262
  "expected_chromosomes": self.expected_chromosomes,
248
263
  "window_size": self.window_size,
249
264
  "cpg_sites_dict": self.cpg_sites_dict,
@@ -263,38 +278,66 @@ class GenomeMethylationEmbedding:
263
278
  restore all CpG site data. If successful, this avoids the slow
264
279
  FASTA parsing step.
265
280
 
281
+ Provenance is validated before the cached data is trusted: the
282
+ cache must have been written by the exact same version of
283
+ bam2tensor and must reference a FASTA file with the same SHA-256
284
+ as the current ``fasta_source``. A stale cache is rejected with
285
+ a ``FileNotFoundError`` so the caller falls through to a fresh
286
+ FASTA parse and overwrites the stale cache on save.
287
+
266
288
  Returns:
267
289
  True if the cache was successfully loaded.
268
290
 
269
291
  Raises:
270
- FileNotFoundError: If the cache file does not exist.
292
+ FileNotFoundError: If the cache file does not exist, or if
293
+ the cache is stale (version mismatch or FASTA SHA-256
294
+ mismatch).
271
295
 
272
296
  Note:
273
- After loading, the caller should verify that expected_chromosomes
274
- and window_size match the current configuration, as this method
275
- overwrites those attributes with cached values.
297
+ After loading, the caller should verify that
298
+ ``expected_chromosomes`` and ``window_size`` match the current
299
+ configuration, as this method overwrites those attributes
300
+ with cached values.
276
301
  """
277
302
 
278
- if os.path.exists(self.cache_file):
279
- if self.verbose:
280
- print(f"\tReading embedding from cache: {self.cache_file}")
303
+ if not os.path.exists(self.cache_file):
304
+ raise FileNotFoundError("No cache of embedding found.")
281
305
 
282
- # TODO: Add type hinting via TypedDicts?
283
- # e.g. https://stackoverflow.com/questions/51291722/define-a-jsonable-type-using-mypy-pep-526
284
- with gzip.open(self.cache_file, "rt") as f:
285
- self.cache_data = json.load(f)
306
+ if self.verbose:
307
+ print(f"\tReading embedding from cache: {self.cache_file}")
286
308
 
287
- # Load the cached data
288
- self.genome_name = self.cache_data["genome_name"]
289
- self.fasta_source = self.cache_data["fasta_source"]
290
- self.expected_chromosomes = self.cache_data["expected_chromosomes"]
291
- self.window_size = self.cache_data["window_size"]
292
- self.cpg_sites_dict = self.cache_data["cpg_sites_dict"]
309
+ with gzip.open(self.cache_file, "rt") as f:
310
+ self.cache_data = json.load(f)
293
311
 
294
- if self.verbose:
295
- print(f"\tCached genome fasta source: {self.fasta_source}")
296
- else:
297
- raise FileNotFoundError("No cache of embedding found.")
312
+ # Validate cache provenance: stale caches predating v2.7 used a
313
+ # case-sensitive CpG search that silently dropped roughly half
314
+ # the CpG sites in soft-masked FASTAs (e.g. UCSC's hg38.fa.gz).
315
+ cached_version = self.cache_data.get("bam2tensor_version")
316
+ if cached_version != __version__:
317
+ raise FileNotFoundError(
318
+ f"Stale cache {self.cache_file!r}: written by bam2tensor "
319
+ f"{cached_version!r}, current is {__version__!r}. "
320
+ "Regenerating."
321
+ )
322
+
323
+ cached_fasta_sha256 = self.cache_data.get("fasta_sha256")
324
+ current_fasta_sha256 = compute_fasta_sha256(self.fasta_source)
325
+ if cached_fasta_sha256 != current_fasta_sha256:
326
+ raise FileNotFoundError(
327
+ f"Stale cache {self.cache_file!r}: FASTA SHA-256 mismatch "
328
+ f"(cache={cached_fasta_sha256}, current={current_fasta_sha256}). "
329
+ "Regenerating."
330
+ )
331
+
332
+ # Load the cached data
333
+ self.genome_name = self.cache_data["genome_name"]
334
+ self.fasta_source = self.cache_data["fasta_source"]
335
+ self.expected_chromosomes = self.cache_data["expected_chromosomes"]
336
+ self.window_size = self.cache_data["window_size"]
337
+ self.cpg_sites_dict = self.cache_data["cpg_sites_dict"]
338
+
339
+ if self.verbose:
340
+ print(f"\tCached genome fasta source: {self.fasta_source}")
298
341
 
299
342
  return True
300
343
 
@@ -350,7 +393,11 @@ class GenomeMethylationEmbedding:
350
393
  if self.verbose:
351
394
  tqdm.write(f"\tSkipping chromosome {seqrecord.id}")
352
395
  continue
353
- sequence = seqrecord.seq
396
+ # Upper-case the sequence so soft-masked FASTAs (UCSC's default
397
+ # hg38.fa.gz uses lowercase for RepeatMasker/TRF regions) do not
398
+ # silently drop CpGs in repeats — that is roughly half of all
399
+ # CpGs in the human genome.
400
+ sequence = seqrecord.seq.upper()
354
401
 
355
402
  # Find all CpG sites
356
403
  # The pos+1 is because we want to store the 1-based position, because .bed is wild and arguably 1-based maybe:
@@ -27,17 +27,44 @@ Example:
27
27
  hg38
28
28
  """
29
29
 
30
+ import hashlib
30
31
  import io
31
32
  import json
32
33
  import zipfile
33
34
  import zlib
35
+ from typing import TYPE_CHECKING
34
36
 
35
37
  import numpy as np
36
38
 
37
- from bam2tensor.embedding import GenomeMethylationEmbedding
39
+ if TYPE_CHECKING:
40
+ # Avoid a runtime circular import: embedding.py imports compute_fasta_sha256
41
+ # from this module, and this module only needs the embedding type for
42
+ # annotations.
43
+ from bam2tensor.embedding import GenomeMethylationEmbedding
38
44
 
39
45
 
40
- def compute_cpg_index_crc32(embedding: GenomeMethylationEmbedding) -> str:
46
+ def compute_fasta_sha256(fasta_source: str) -> str:
47
+ """Compute the SHA-256 of a FASTA file's bytes on disk.
48
+
49
+ Used to stamp the CpG-site cache (see
50
+ :py:class:`bam2tensor.embedding.GenomeMethylationEmbedding`) so a
51
+ cache can be rejected when the underlying FASTA changes (e.g. a
52
+ user swaps a soft-masked build for a hard-masked one).
53
+
54
+ Args:
55
+ fasta_source: Path to the reference FASTA file.
56
+
57
+ Returns:
58
+ The hex-encoded SHA-256 digest of the file's bytes.
59
+ """
60
+ h = hashlib.sha256()
61
+ with open(fasta_source, "rb") as f:
62
+ for chunk in iter(lambda: f.read(1024 * 1024), b""):
63
+ h.update(chunk)
64
+ return h.hexdigest()
65
+
66
+
67
+ def compute_cpg_index_crc32(embedding: "GenomeMethylationEmbedding") -> str:
41
68
  """Compute a CRC32 checksum over the CpG site positions in an embedding.
42
69
 
43
70
  The checksum captures the exact column mapping of the sparse matrix:
@@ -1,6 +1,10 @@
1
+ import gzip
2
+ import json
1
3
  import os
4
+
2
5
  import pytest
3
- from bam2tensor import embedding
6
+
7
+ from bam2tensor import __version__, embedding
4
8
 
5
9
  # Generate a fresh, uncached embedding
6
10
  test_embedding = embedding.GenomeMethylationEmbedding(
@@ -241,3 +245,219 @@ def test_save_cache_verbose(tmp_path, capsys) -> None:
241
245
  assert "Saved embedding cache" in captured.out
242
246
  finally:
243
247
  os.chdir(original_cwd)
248
+
249
+
250
+ # -- Regression: soft-masked FASTAs must enumerate every CpG -----------------
251
+
252
+
253
+ def _write_softmasked_fasta(path: str) -> None:
254
+ """Write a tiny mixed-case FASTA with CG/cg/Cg/cG to ``path``.
255
+
256
+ UCSC's default ``hg38.fa.gz`` is soft-masked (lowercase = RepeatMasker
257
+ / Tandem Repeats Finder). Prior to v2.7 the CpG enumeration used a
258
+ case-sensitive search and silently dropped roughly half the CpGs in
259
+ soft-masked references — this fixture pins the regression.
260
+ """
261
+ # 1-based positions of the C in each [cC][gG] (0-based + 1):
262
+ # CG @ pos 6, cg @ pos 11, Cg @ pos 17, cG @ pos 23
263
+ seq = "AAAAACGAAAcgAAAACgAAAAcGAAAAA"
264
+ with open(path, "w") as fh:
265
+ fh.write(">chr1\n")
266
+ fh.write(seq + "\n")
267
+
268
+
269
+ def test_soft_masked_fasta_enumerates_all_cpgs(tmp_path) -> None:
270
+ """All CpGs are found regardless of case (regression for v2.7 case fix).
271
+
272
+ Bug: prior to v2.7, ``sequence.find('CG')`` on a soft-masked FASTA
273
+ missed every lowercase or mixed-case ``cg``/``Cg``/``cG``. On real
274
+ UCSC hg38 that silently dropped ~53% of all CpG sites.
275
+ """
276
+ fasta = tmp_path / "softmasked.fa"
277
+ _write_softmasked_fasta(str(fasta))
278
+
279
+ original_cwd = os.getcwd()
280
+ os.chdir(tmp_path)
281
+ try:
282
+ emb = embedding.GenomeMethylationEmbedding(
283
+ "softmasked_test",
284
+ expected_chromosomes=["chr1"],
285
+ fasta_source=str(fasta),
286
+ window_size=150,
287
+ skip_cache=True,
288
+ verbose=False,
289
+ )
290
+ finally:
291
+ os.chdir(original_cwd)
292
+
293
+ assert emb.total_cpg_sites == 4
294
+ assert emb.cpg_sites_dict["chr1"] == [6, 11, 17, 23]
295
+
296
+
297
+ # -- Cache provenance: stamp + reject stale ---------------------------------
298
+
299
+
300
+ def _read_cache(path: str) -> dict:
301
+ """Read and decode a gzipped JSON cache file."""
302
+ with gzip.open(path, "rt") as f:
303
+ return json.load(f)
304
+
305
+
306
+ def test_cache_stamps_provenance(tmp_path) -> None:
307
+ """Saved cache embeds bam2tensor_version, fasta_sha256, total_cpg_sites."""
308
+ original_cwd = os.getcwd()
309
+ os.chdir(tmp_path)
310
+ try:
311
+ emb = embedding.GenomeMethylationEmbedding(
312
+ "provenance_test",
313
+ expected_chromosomes=["chr1", "chr2", "chr3"],
314
+ fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
315
+ window_size=150,
316
+ skip_cache=False,
317
+ verbose=False,
318
+ )
319
+ data = _read_cache("provenance_test.cache.json.gz")
320
+ finally:
321
+ os.chdir(original_cwd)
322
+
323
+ assert data["bam2tensor_version"] == __version__
324
+ assert data["total_cpg_sites"] == emb.total_cpg_sites
325
+ assert isinstance(data["fasta_sha256"], str)
326
+ assert len(data["fasta_sha256"]) == 64 # hex-encoded SHA-256
327
+
328
+
329
+ def test_cache_rejected_on_fasta_sha256_mismatch(tmp_path, capsys) -> None:
330
+ """A cache whose ``fasta_sha256`` no longer matches the current FASTA is discarded.
331
+
332
+ This is the primary safety net for the upgrade to v2.7: any user
333
+ with a pre-v2.7 cache (or any user who switches between soft- and
334
+ hard-masked FASTAs) will get a clean re-parse rather than silently
335
+ misaligned column indices.
336
+ """
337
+ original_cwd = os.getcwd()
338
+ os.chdir(tmp_path)
339
+ try:
340
+ # First run: generate cache against the real test FASTA.
341
+ emb1 = embedding.GenomeMethylationEmbedding(
342
+ "sha_mismatch_test",
343
+ expected_chromosomes=["chr1", "chr2", "chr3"],
344
+ fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
345
+ window_size=150,
346
+ skip_cache=False,
347
+ verbose=False,
348
+ )
349
+ cache_path = "sha_mismatch_test.cache.json.gz"
350
+ assert os.path.exists(cache_path)
351
+
352
+ # Tamper with the cached fasta_sha256 so it can no longer match
353
+ # the on-disk FASTA. This simulates a user upgrading bam2tensor
354
+ # with an inherited v2.5 cache whose enumeration is stale.
355
+ data = _read_cache(cache_path)
356
+ data["fasta_sha256"] = "0" * 64
357
+ with gzip.open(cache_path, "wt") as f:
358
+ json.dump(data, f)
359
+
360
+ # Second run: the stale cache must be rejected and regenerated.
361
+ emb2 = embedding.GenomeMethylationEmbedding(
362
+ "sha_mismatch_test",
363
+ expected_chromosomes=["chr1", "chr2", "chr3"],
364
+ fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
365
+ window_size=150,
366
+ skip_cache=False,
367
+ verbose=False,
368
+ )
369
+
370
+ captured = capsys.readouterr()
371
+ assert "Discarding stale embedding cache" in captured.out
372
+ assert "FASTA SHA-256 mismatch" in captured.out
373
+
374
+ # Regenerated cache: same content as the first run.
375
+ assert emb2.total_cpg_sites == emb1.total_cpg_sites
376
+ # And the on-disk cache now stamps the real SHA-256, not the tampered one.
377
+ refreshed = _read_cache(cache_path)
378
+ assert refreshed["fasta_sha256"] != "0" * 64
379
+ finally:
380
+ os.chdir(original_cwd)
381
+
382
+
383
+ def test_cache_rejected_on_version_mismatch(tmp_path, capsys) -> None:
384
+ """A cache stamped with an older bam2tensor version is discarded."""
385
+ original_cwd = os.getcwd()
386
+ os.chdir(tmp_path)
387
+ try:
388
+ embedding.GenomeMethylationEmbedding(
389
+ "version_mismatch_test",
390
+ expected_chromosomes=["chr1", "chr2", "chr3"],
391
+ fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
392
+ window_size=150,
393
+ skip_cache=False,
394
+ verbose=False,
395
+ )
396
+ cache_path = "version_mismatch_test.cache.json.gz"
397
+
398
+ # Roll the version back to a release that predates the
399
+ # case-sensitivity fix; any non-current version should trigger
400
+ # rejection.
401
+ data = _read_cache(cache_path)
402
+ data["bam2tensor_version"] = "2.5"
403
+ with gzip.open(cache_path, "wt") as f:
404
+ json.dump(data, f)
405
+
406
+ embedding.GenomeMethylationEmbedding(
407
+ "version_mismatch_test",
408
+ expected_chromosomes=["chr1", "chr2", "chr3"],
409
+ fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
410
+ window_size=150,
411
+ skip_cache=False,
412
+ verbose=False,
413
+ )
414
+
415
+ captured = capsys.readouterr()
416
+ assert "Discarding stale embedding cache" in captured.out
417
+ assert "bam2tensor" in captured.out
418
+ assert "2.5" in captured.out
419
+ finally:
420
+ os.chdir(original_cwd)
421
+
422
+
423
+ def test_cache_rejected_when_provenance_absent(tmp_path, capsys) -> None:
424
+ """A pre-v2.7 cache without provenance fields is treated as stale.
425
+
426
+ Users on v2.5 / v2.6 have caches with no ``bam2tensor_version`` or
427
+ ``fasta_sha256``. After upgrading, those caches must be discarded so
428
+ the case-sensitivity fix actually takes effect on their data.
429
+ """
430
+ original_cwd = os.getcwd()
431
+ os.chdir(tmp_path)
432
+ try:
433
+ embedding.GenomeMethylationEmbedding(
434
+ "legacy_cache_test",
435
+ expected_chromosomes=["chr1", "chr2", "chr3"],
436
+ fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
437
+ window_size=150,
438
+ skip_cache=False,
439
+ verbose=False,
440
+ )
441
+ cache_path = "legacy_cache_test.cache.json.gz"
442
+
443
+ # Strip provenance fields to mimic a v2.5-era cache file.
444
+ data = _read_cache(cache_path)
445
+ data.pop("bam2tensor_version", None)
446
+ data.pop("fasta_sha256", None)
447
+ data.pop("total_cpg_sites", None)
448
+ with gzip.open(cache_path, "wt") as f:
449
+ json.dump(data, f)
450
+
451
+ embedding.GenomeMethylationEmbedding(
452
+ "legacy_cache_test",
453
+ expected_chromosomes=["chr1", "chr2", "chr3"],
454
+ fasta_source=os.path.join(original_cwd, "tests/test_fasta.fa"),
455
+ window_size=150,
456
+ skip_cache=False,
457
+ verbose=False,
458
+ )
459
+
460
+ captured = capsys.readouterr()
461
+ assert "Discarding stale embedding cache" in captured.out
462
+ finally:
463
+ os.chdir(original_cwd)
@@ -6,7 +6,7 @@ import numpy as np
6
6
  import scipy.sparse
7
7
  from click.testing import CliRunner
8
8
 
9
- from bam2tensor import __main__
9
+ from bam2tensor import __main__, __version__
10
10
  from bam2tensor.inspect import _format_size
11
11
  from bam2tensor.inspect import main as inspect_main
12
12
  from bam2tensor.metadata import write_npz_metadata, write_npz_tlen
@@ -122,7 +122,7 @@ def test_inspect_end_to_end(tmp_path) -> None:
122
122
  assert result.exit_code == 0
123
123
  assert "test" in result.output # genome_name
124
124
  assert "CpG index CRC32:" in result.output
125
- assert "v2.6" in result.output
125
+ assert f"v{__version__}" in result.output
126
126
 
127
127
 
128
128
  def test_format_size_bytes() -> None:
@@ -63,7 +63,7 @@ wheels = [
63
63
 
64
64
  [[package]]
65
65
  name = "bam2tensor"
66
- version = "2.6"
66
+ version = "2.7"
67
67
  source = { editable = "." }
68
68
  dependencies = [
69
69
  { name = "biopython" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes