bio2zarr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +91 -24
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +314 -189
- bio2zarr/tskit.py +301 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +614 -118
- bio2zarr/vcf_utils.py +66 -33
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +544 -708
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/METADATA +19 -7
- bio2zarr-0.1.6.dist-info/RECORD +21 -0
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.4.dist-info/RECORD +0 -21
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info/licenses}/LICENSE +0 -0
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/top_level.txt +0 -0
bio2zarr/vcf_utils.py
CHANGED
|
@@ -7,12 +7,12 @@ import struct
|
|
|
7
7
|
from collections.abc import Sequence
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from enum import Enum
|
|
10
|
-
from typing import IO, Any
|
|
10
|
+
from typing import IO, Any
|
|
11
11
|
|
|
12
|
-
import cyvcf2
|
|
13
12
|
import humanfriendly
|
|
14
13
|
import numpy as np
|
|
15
14
|
|
|
15
|
+
from bio2zarr import core
|
|
16
16
|
from bio2zarr.typing import PathType
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
@@ -33,7 +33,7 @@ def get_file_offset(vfp: int) -> int:
|
|
|
33
33
|
return vfp >> 16 & address_mask
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def read_bytes_as_value(f: IO[Any], fmt: str, nodata:
|
|
36
|
+
def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Any | None = None) -> Any:
|
|
37
37
|
"""Read bytes using a `struct` format string and return the unpacked data value.
|
|
38
38
|
|
|
39
39
|
Parameters
|
|
@@ -85,11 +85,14 @@ class Region:
|
|
|
85
85
|
"""
|
|
86
86
|
|
|
87
87
|
contig: str
|
|
88
|
-
start:
|
|
89
|
-
end:
|
|
88
|
+
start: int | None = None
|
|
89
|
+
end: int | None = None
|
|
90
90
|
|
|
91
91
|
def __post_init__(self):
|
|
92
|
-
|
|
92
|
+
assert self.contig is not None
|
|
93
|
+
if self.start is None:
|
|
94
|
+
self.start = 1
|
|
95
|
+
else:
|
|
93
96
|
self.start = int(self.start)
|
|
94
97
|
assert self.start > 0
|
|
95
98
|
if self.end is not None:
|
|
@@ -194,9 +197,7 @@ def get_first_locus_in_bin(csi: CSIIndex, bin: int) -> int:
|
|
|
194
197
|
return (bin - first_bin_on_level) * (max_span // level_size) + 1
|
|
195
198
|
|
|
196
199
|
|
|
197
|
-
def read_csi(
|
|
198
|
-
file: PathType, storage_options: Optional[dict[str, str]] = None
|
|
199
|
-
) -> CSIIndex:
|
|
200
|
+
def read_csi(file: PathType, storage_options: dict[str, str] | None = None) -> CSIIndex:
|
|
200
201
|
"""Parse a CSI file into a `CSIIndex` object.
|
|
201
202
|
|
|
202
203
|
Parameters
|
|
@@ -311,7 +312,7 @@ class TabixIndex:
|
|
|
311
312
|
|
|
312
313
|
|
|
313
314
|
def read_tabix(
|
|
314
|
-
file: PathType, storage_options:
|
|
315
|
+
file: PathType, storage_options: dict[str, str] | None = None
|
|
315
316
|
) -> TabixIndex:
|
|
316
317
|
"""Parse a tabix file into a `TabixIndex` object.
|
|
317
318
|
|
|
@@ -393,9 +394,15 @@ class VcfIndexType(Enum):
|
|
|
393
394
|
TABIX = ".tbi"
|
|
394
395
|
|
|
395
396
|
|
|
396
|
-
class
|
|
397
|
+
class VcfFile(contextlib.AbstractContextManager):
|
|
398
|
+
@core.requires_optional_dependency("cyvcf2", "vcf")
|
|
397
399
|
def __init__(self, vcf_path, index_path=None):
|
|
400
|
+
import cyvcf2
|
|
401
|
+
|
|
398
402
|
self.vcf = None
|
|
403
|
+
self.file_type = None
|
|
404
|
+
self.index_type = None
|
|
405
|
+
|
|
399
406
|
vcf_path = pathlib.Path(vcf_path)
|
|
400
407
|
if not vcf_path.exists():
|
|
401
408
|
raise FileNotFoundError(vcf_path)
|
|
@@ -408,30 +415,34 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
408
415
|
vcf_path.suffix + VcfIndexType.CSI.value
|
|
409
416
|
)
|
|
410
417
|
if not index_path.exists():
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
)
|
|
418
|
+
# No supported index found
|
|
419
|
+
index_path = None
|
|
414
420
|
else:
|
|
415
421
|
index_path = pathlib.Path(index_path)
|
|
422
|
+
if not index_path.exists():
|
|
423
|
+
raise FileNotFoundError(
|
|
424
|
+
f"Specified index path {index_path} does not exist"
|
|
425
|
+
)
|
|
416
426
|
|
|
417
427
|
self.vcf_path = vcf_path
|
|
418
428
|
self.index_path = index_path
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
else:
|
|
428
|
-
raise ValueError("Only .tbi or .csi indexes are supported.")
|
|
429
|
+
if index_path is not None:
|
|
430
|
+
if index_path.suffix == VcfIndexType.CSI.value:
|
|
431
|
+
self.index_type = VcfIndexType.CSI
|
|
432
|
+
elif index_path.suffix == VcfIndexType.TABIX.value:
|
|
433
|
+
self.index_type = VcfIndexType.TABIX
|
|
434
|
+
self.file_type = VcfFileType.VCF
|
|
435
|
+
else:
|
|
436
|
+
raise ValueError("Only .tbi or .csi indexes are supported.")
|
|
429
437
|
|
|
430
438
|
self.vcf = cyvcf2.VCF(vcf_path)
|
|
431
|
-
self.
|
|
439
|
+
if self.index_path is not None:
|
|
440
|
+
self.vcf.set_index(str(self.index_path))
|
|
441
|
+
|
|
432
442
|
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
|
|
433
443
|
self.sequence_names = None
|
|
434
444
|
|
|
445
|
+
self.index = None
|
|
435
446
|
if self.index_type == VcfIndexType.CSI:
|
|
436
447
|
# Determine the file-type based on the "aux" field.
|
|
437
448
|
self.index = read_csi(self.index_path)
|
|
@@ -441,9 +452,17 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
441
452
|
self.sequence_names = self.index.parse_vcf_aux()
|
|
442
453
|
else:
|
|
443
454
|
self.sequence_names = self.vcf.seqnames
|
|
444
|
-
|
|
455
|
+
elif self.index_type == VcfIndexType.TABIX:
|
|
445
456
|
self.index = read_tabix(self.index_path)
|
|
457
|
+
self.file_type = VcfFileType.VCF
|
|
446
458
|
self.sequence_names = self.index.sequence_names
|
|
459
|
+
else:
|
|
460
|
+
assert self.index is None
|
|
461
|
+
var = next(self.vcf)
|
|
462
|
+
self.sequence_names = [var.CHROM]
|
|
463
|
+
self.vcf.close()
|
|
464
|
+
# There doesn't seem to be a way to reset the iterator
|
|
465
|
+
self.vcf = cyvcf2.VCF(vcf_path)
|
|
447
466
|
|
|
448
467
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
449
468
|
if self.vcf is not None:
|
|
@@ -452,6 +471,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
452
471
|
return False
|
|
453
472
|
|
|
454
473
|
def contig_record_counts(self):
|
|
474
|
+
if self.index is None:
|
|
475
|
+
return {self.sequence_names[0]: RECORD_COUNT_UNKNOWN}
|
|
455
476
|
d = dict(zip(self.sequence_names, self.index.record_counts))
|
|
456
477
|
if self.file_type == VcfFileType.BCF:
|
|
457
478
|
d = {k: v for k, v in d.items() if v > 0}
|
|
@@ -460,12 +481,21 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
460
481
|
def count_variants(self, region):
|
|
461
482
|
return sum(1 for _ in self.variants(region))
|
|
462
483
|
|
|
463
|
-
def variants(self, region):
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
484
|
+
def variants(self, region=None):
|
|
485
|
+
if self.index is None:
|
|
486
|
+
contig = self.sequence_names[0]
|
|
487
|
+
if region is not None:
|
|
488
|
+
assert region.contig == contig
|
|
489
|
+
for var in self.vcf:
|
|
490
|
+
if var.CHROM != contig:
|
|
491
|
+
raise ValueError("Multi-contig VCFs must be indexed")
|
|
468
492
|
yield var
|
|
493
|
+
else:
|
|
494
|
+
start = 1 if region.start is None else region.start
|
|
495
|
+
for var in self.vcf(str(region)):
|
|
496
|
+
# Need to filter because of indels overlapping the region
|
|
497
|
+
if var.POS >= start:
|
|
498
|
+
yield var
|
|
469
499
|
|
|
470
500
|
def _filter_empty_and_refine(self, regions):
|
|
471
501
|
"""
|
|
@@ -483,8 +513,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
483
513
|
|
|
484
514
|
def partition_into_regions(
|
|
485
515
|
self,
|
|
486
|
-
num_parts:
|
|
487
|
-
target_part_size:
|
|
516
|
+
num_parts: int | None = None,
|
|
517
|
+
target_part_size: None | int | str = None,
|
|
488
518
|
):
|
|
489
519
|
if num_parts is None and target_part_size is None:
|
|
490
520
|
raise ValueError("One of num_parts or target_part_size must be specified")
|
|
@@ -505,6 +535,9 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
505
535
|
if target_part_size_bytes < 1:
|
|
506
536
|
raise ValueError("target_part_size must be positive")
|
|
507
537
|
|
|
538
|
+
if self.index is None:
|
|
539
|
+
return [Region(self.sequence_names[0])]
|
|
540
|
+
|
|
508
541
|
# Calculate the desired part file boundaries
|
|
509
542
|
file_length = os.stat(self.vcf_path).st_size
|
|
510
543
|
if num_parts is not None:
|