bio2zarr-0.1.4-py3-none-any.whl → bio2zarr-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. More details are available on the registry's page for this release (the original link was not preserved in this extract).

bio2zarr/vcf_utils.py CHANGED
@@ -7,12 +7,12 @@ import struct
7
7
  from collections.abc import Sequence
8
8
  from dataclasses import dataclass
9
9
  from enum import Enum
10
- from typing import IO, Any, Optional, Union
10
+ from typing import IO, Any
11
11
 
12
- import cyvcf2
13
12
  import humanfriendly
14
13
  import numpy as np
15
14
 
15
+ from bio2zarr import core
16
16
  from bio2zarr.typing import PathType
17
17
 
18
18
  logger = logging.getLogger(__name__)
@@ -33,7 +33,7 @@ def get_file_offset(vfp: int) -> int:
33
33
  return vfp >> 16 & address_mask
34
34
 
35
35
 
36
- def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> Any:
36
+ def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Any | None = None) -> Any:
37
37
  """Read bytes using a `struct` format string and return the unpacked data value.
38
38
 
39
39
  Parameters
@@ -85,11 +85,14 @@ class Region:
85
85
  """
86
86
 
87
87
  contig: str
88
- start: Optional[int] = None
89
- end: Optional[int] = None
88
+ start: int | None = None
89
+ end: int | None = None
90
90
 
91
91
  def __post_init__(self):
92
- if self.start is not None:
92
+ assert self.contig is not None
93
+ if self.start is None:
94
+ self.start = 1
95
+ else:
93
96
  self.start = int(self.start)
94
97
  assert self.start > 0
95
98
  if self.end is not None:
@@ -194,9 +197,7 @@ def get_first_locus_in_bin(csi: CSIIndex, bin: int) -> int:
194
197
  return (bin - first_bin_on_level) * (max_span // level_size) + 1
195
198
 
196
199
 
197
- def read_csi(
198
- file: PathType, storage_options: Optional[dict[str, str]] = None
199
- ) -> CSIIndex:
200
+ def read_csi(file: PathType, storage_options: dict[str, str] | None = None) -> CSIIndex:
200
201
  """Parse a CSI file into a `CSIIndex` object.
201
202
 
202
203
  Parameters
@@ -311,7 +312,7 @@ class TabixIndex:
311
312
 
312
313
 
313
314
  def read_tabix(
314
- file: PathType, storage_options: Optional[dict[str, str]] = None
315
+ file: PathType, storage_options: dict[str, str] | None = None
315
316
  ) -> TabixIndex:
316
317
  """Parse a tabix file into a `TabixIndex` object.
317
318
 
@@ -393,9 +394,15 @@ class VcfIndexType(Enum):
393
394
  TABIX = ".tbi"
394
395
 
395
396
 
396
- class IndexedVcf(contextlib.AbstractContextManager):
397
+ class VcfFile(contextlib.AbstractContextManager):
398
+ @core.requires_optional_dependency("cyvcf2", "vcf")
397
399
  def __init__(self, vcf_path, index_path=None):
400
+ import cyvcf2
401
+
398
402
  self.vcf = None
403
+ self.file_type = None
404
+ self.index_type = None
405
+
399
406
  vcf_path = pathlib.Path(vcf_path)
400
407
  if not vcf_path.exists():
401
408
  raise FileNotFoundError(vcf_path)
@@ -408,30 +415,34 @@ class IndexedVcf(contextlib.AbstractContextManager):
408
415
  vcf_path.suffix + VcfIndexType.CSI.value
409
416
  )
410
417
  if not index_path.exists():
411
- raise FileNotFoundError(
412
- f"Cannot find .tbi or .csi file for {vcf_path}"
413
- )
418
+ # No supported index found
419
+ index_path = None
414
420
  else:
415
421
  index_path = pathlib.Path(index_path)
422
+ if not index_path.exists():
423
+ raise FileNotFoundError(
424
+ f"Specified index path {index_path} does not exist"
425
+ )
416
426
 
417
427
  self.vcf_path = vcf_path
418
428
  self.index_path = index_path
419
- self.file_type = None
420
- self.index_type = None
421
-
422
- if index_path.suffix == VcfIndexType.CSI.value:
423
- self.index_type = VcfIndexType.CSI
424
- elif index_path.suffix == VcfIndexType.TABIX.value:
425
- self.index_type = VcfIndexType.TABIX
426
- self.file_type = VcfFileType.VCF
427
- else:
428
- raise ValueError("Only .tbi or .csi indexes are supported.")
429
+ if index_path is not None:
430
+ if index_path.suffix == VcfIndexType.CSI.value:
431
+ self.index_type = VcfIndexType.CSI
432
+ elif index_path.suffix == VcfIndexType.TABIX.value:
433
+ self.index_type = VcfIndexType.TABIX
434
+ self.file_type = VcfFileType.VCF
435
+ else:
436
+ raise ValueError("Only .tbi or .csi indexes are supported.")
429
437
 
430
438
  self.vcf = cyvcf2.VCF(vcf_path)
431
- self.vcf.set_index(str(self.index_path))
439
+ if self.index_path is not None:
440
+ self.vcf.set_index(str(self.index_path))
441
+
432
442
  logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
433
443
  self.sequence_names = None
434
444
 
445
+ self.index = None
435
446
  if self.index_type == VcfIndexType.CSI:
436
447
  # Determine the file-type based on the "aux" field.
437
448
  self.index = read_csi(self.index_path)
@@ -441,9 +452,17 @@ class IndexedVcf(contextlib.AbstractContextManager):
441
452
  self.sequence_names = self.index.parse_vcf_aux()
442
453
  else:
443
454
  self.sequence_names = self.vcf.seqnames
444
- else:
455
+ elif self.index_type == VcfIndexType.TABIX:
445
456
  self.index = read_tabix(self.index_path)
457
+ self.file_type = VcfFileType.VCF
446
458
  self.sequence_names = self.index.sequence_names
459
+ else:
460
+ assert self.index is None
461
+ var = next(self.vcf)
462
+ self.sequence_names = [var.CHROM]
463
+ self.vcf.close()
464
+ # There doesn't seem to be a way to reset the iterator
465
+ self.vcf = cyvcf2.VCF(vcf_path)
447
466
 
448
467
  def __exit__(self, exc_type, exc_val, exc_tb):
449
468
  if self.vcf is not None:
@@ -452,6 +471,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
452
471
  return False
453
472
 
454
473
  def contig_record_counts(self):
474
+ if self.index is None:
475
+ return {self.sequence_names[0]: RECORD_COUNT_UNKNOWN}
455
476
  d = dict(zip(self.sequence_names, self.index.record_counts))
456
477
  if self.file_type == VcfFileType.BCF:
457
478
  d = {k: v for k, v in d.items() if v > 0}
@@ -460,12 +481,21 @@ class IndexedVcf(contextlib.AbstractContextManager):
460
481
  def count_variants(self, region):
461
482
  return sum(1 for _ in self.variants(region))
462
483
 
463
- def variants(self, region):
464
- start = 1 if region.start is None else region.start
465
- for var in self.vcf(str(region)):
466
- # Need to filter because of indels overlapping the region
467
- if var.POS >= start:
484
+ def variants(self, region=None):
485
+ if self.index is None:
486
+ contig = self.sequence_names[0]
487
+ if region is not None:
488
+ assert region.contig == contig
489
+ for var in self.vcf:
490
+ if var.CHROM != contig:
491
+ raise ValueError("Multi-contig VCFs must be indexed")
468
492
  yield var
493
+ else:
494
+ start = 1 if region.start is None else region.start
495
+ for var in self.vcf(str(region)):
496
+ # Need to filter because of indels overlapping the region
497
+ if var.POS >= start:
498
+ yield var
469
499
 
470
500
  def _filter_empty_and_refine(self, regions):
471
501
  """
@@ -483,8 +513,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
483
513
 
484
514
  def partition_into_regions(
485
515
  self,
486
- num_parts: Optional[int] = None,
487
- target_part_size: Union[None, int, str] = None,
516
+ num_parts: int | None = None,
517
+ target_part_size: None | int | str = None,
488
518
  ):
489
519
  if num_parts is None and target_part_size is None:
490
520
  raise ValueError("One of num_parts or target_part_size must be specified")
@@ -505,6 +535,9 @@ class IndexedVcf(contextlib.AbstractContextManager):
505
535
  if target_part_size_bytes < 1:
506
536
  raise ValueError("target_part_size must be positive")
507
537
 
538
+ if self.index is None:
539
+ return [Region(self.sequence_names[0])]
540
+
508
541
  # Calculate the desired part file boundaries
509
542
  file_length = os.stat(self.vcf_path).st_size
510
543
  if num_parts is not None: