bio2zarr 0.1.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/CHANGELOG.md +6 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/PKG-INFO +3 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/_version.py +2 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/cli.py +2 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/vcf2zarr/icf.py +20 -6
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/vcf_utils.py +54 -22
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr.egg-info/PKG-INFO +3 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/.github/workflows/cd.yml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/.github/workflows/ci.yml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/.github/workflows/docs.yml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/.gitignore +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/.pre-commit-config.yaml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/LICENSE +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/MANIFEST.in +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/README.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/__init__.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/__main__.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/constants.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/core.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/plink.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/provenance.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/typing.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/vcf2zarr/__init__.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/vcf2zarr/vcz.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/vcf2zarr/verification.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/zarr_utils.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr.egg-info/SOURCES.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr.egg-info/dependency_links.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr.egg-info/entry_points.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr.egg-info/requires.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr.egg-info/top_level.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/Makefile +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/_config.yml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/_static/asciinema-player.css +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/_static/asciinema-player.min.js +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/_static/custom.css +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/_toc.yml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/build.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/installation.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/intro.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/logo.png +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/requirements.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/vcf2zarr/cli_ref.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/vcf2zarr/overview.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/vcf2zarr/tutorial.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/vcfpartition/cli_ref.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/docs/vcfpartition/overview.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/pyproject.toml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/setup.cfg +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/validation-data/Makefile +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/validation-data/split.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/validation.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.5}/vcf_generator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
6
|
License: Apache License
|
|
@@ -241,6 +241,7 @@ Requires-Dist: pytest-coverage; extra == "dev"
|
|
|
241
241
|
Requires-Dist: pytest-xdist; extra == "dev"
|
|
242
242
|
Requires-Dist: sgkit>=0.8.0; extra == "dev"
|
|
243
243
|
Requires-Dist: tqdm; extra == "dev"
|
|
244
|
+
Dynamic: license-file
|
|
244
245
|
|
|
245
246
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
246
247
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
@@ -624,8 +624,8 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
|
|
|
624
624
|
num_parts_per_path = max(1, num_partitions // len(vcfs))
|
|
625
625
|
|
|
626
626
|
for vcf_path in vcfs:
|
|
627
|
-
|
|
628
|
-
regions =
|
|
627
|
+
vcf_file = vcf_utils.VcfFile(vcf_path)
|
|
628
|
+
regions = vcf_file.partition_into_regions(
|
|
629
629
|
num_parts=num_parts_per_path, target_part_size=partition_size
|
|
630
630
|
)
|
|
631
631
|
for region in regions:
|
|
@@ -228,8 +228,8 @@ def fixed_vcf_field_definitions():
|
|
|
228
228
|
|
|
229
229
|
|
|
230
230
|
def scan_vcf(path, target_num_partitions):
|
|
231
|
-
with vcf_utils.
|
|
232
|
-
vcf =
|
|
231
|
+
with vcf_utils.VcfFile(path) as vcf_file:
|
|
232
|
+
vcf = vcf_file.vcf
|
|
233
233
|
filters = []
|
|
234
234
|
pass_index = -1
|
|
235
235
|
for h in vcf.header_iter():
|
|
@@ -270,10 +270,10 @@ def scan_vcf(path, target_num_partitions):
|
|
|
270
270
|
filters=filters,
|
|
271
271
|
fields=fields,
|
|
272
272
|
partitions=[],
|
|
273
|
-
num_records=sum(
|
|
273
|
+
num_records=sum(vcf_file.contig_record_counts().values()),
|
|
274
274
|
)
|
|
275
275
|
|
|
276
|
-
regions =
|
|
276
|
+
regions = vcf_file.partition_into_regions(num_parts=target_num_partitions)
|
|
277
277
|
for region in regions:
|
|
278
278
|
metadata.partitions.append(
|
|
279
279
|
VcfPartition(
|
|
@@ -324,14 +324,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
|
|
|
324
324
|
# are compatible.
|
|
325
325
|
all_partitions = []
|
|
326
326
|
total_records = 0
|
|
327
|
+
contigs = {}
|
|
327
328
|
for metadata, _ in results:
|
|
328
329
|
for partition in metadata.partitions:
|
|
329
330
|
logger.debug(f"Scanned partition {partition}")
|
|
330
331
|
all_partitions.append(partition)
|
|
332
|
+
for contig in metadata.contigs:
|
|
333
|
+
if contig.id in contigs:
|
|
334
|
+
if contig != contigs[contig.id]:
|
|
335
|
+
raise ValueError(
|
|
336
|
+
"Incompatible contig definitions: "
|
|
337
|
+
f"{contig} != {contigs[contig.id]}"
|
|
338
|
+
)
|
|
339
|
+
else:
|
|
340
|
+
contigs[contig.id] = contig
|
|
331
341
|
total_records += metadata.num_records
|
|
332
342
|
metadata.num_records = 0
|
|
333
343
|
metadata.partitions = []
|
|
334
344
|
|
|
345
|
+
contig_union = list(contigs.values())
|
|
346
|
+
for metadata, _ in results:
|
|
347
|
+
metadata.contigs = contig_union
|
|
348
|
+
|
|
335
349
|
icf_metadata, header = results[0]
|
|
336
350
|
for metadata, _ in results[1:]:
|
|
337
351
|
if metadata != icf_metadata:
|
|
@@ -1079,9 +1093,9 @@ class IntermediateColumnarFormatWriter:
|
|
|
1079
1093
|
self.path,
|
|
1080
1094
|
partition_index,
|
|
1081
1095
|
) as tcw:
|
|
1082
|
-
with vcf_utils.
|
|
1096
|
+
with vcf_utils.VcfFile(partition.vcf_path) as vcf:
|
|
1083
1097
|
num_records = 0
|
|
1084
|
-
for variant in
|
|
1098
|
+
for variant in vcf.variants(partition.region):
|
|
1085
1099
|
num_records += 1
|
|
1086
1100
|
last_position = variant.POS
|
|
1087
1101
|
tcw.append("CHROM", variant.CHROM)
|
|
@@ -89,7 +89,10 @@ class Region:
|
|
|
89
89
|
end: Optional[int] = None
|
|
90
90
|
|
|
91
91
|
def __post_init__(self):
|
|
92
|
-
|
|
92
|
+
assert self.contig is not None
|
|
93
|
+
if self.start is None:
|
|
94
|
+
self.start = 1
|
|
95
|
+
else:
|
|
93
96
|
self.start = int(self.start)
|
|
94
97
|
assert self.start > 0
|
|
95
98
|
if self.end is not None:
|
|
@@ -393,9 +396,12 @@ class VcfIndexType(Enum):
|
|
|
393
396
|
TABIX = ".tbi"
|
|
394
397
|
|
|
395
398
|
|
|
396
|
-
class IndexedVcf(contextlib.AbstractContextManager):
|
|
399
|
+
class VcfFile(contextlib.AbstractContextManager):
|
|
397
400
|
def __init__(self, vcf_path, index_path=None):
|
|
398
401
|
self.vcf = None
|
|
402
|
+
self.file_type = None
|
|
403
|
+
self.index_type = None
|
|
404
|
+
|
|
399
405
|
vcf_path = pathlib.Path(vcf_path)
|
|
400
406
|
if not vcf_path.exists():
|
|
401
407
|
raise FileNotFoundError(vcf_path)
|
|
@@ -408,30 +414,34 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
408
414
|
vcf_path.suffix + VcfIndexType.CSI.value
|
|
409
415
|
)
|
|
410
416
|
if not index_path.exists():
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
)
|
|
417
|
+
# No supported index found
|
|
418
|
+
index_path = None
|
|
414
419
|
else:
|
|
415
420
|
index_path = pathlib.Path(index_path)
|
|
421
|
+
if not index_path.exists():
|
|
422
|
+
raise FileNotFoundError(
|
|
423
|
+
f"Specified index path {index_path} does not exist"
|
|
424
|
+
)
|
|
416
425
|
|
|
417
426
|
self.vcf_path = vcf_path
|
|
418
427
|
self.index_path = index_path
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
else:
|
|
428
|
-
raise ValueError("Only .tbi or .csi indexes are supported.")
|
|
428
|
+
if index_path is not None:
|
|
429
|
+
if index_path.suffix == VcfIndexType.CSI.value:
|
|
430
|
+
self.index_type = VcfIndexType.CSI
|
|
431
|
+
elif index_path.suffix == VcfIndexType.TABIX.value:
|
|
432
|
+
self.index_type = VcfIndexType.TABIX
|
|
433
|
+
self.file_type = VcfFileType.VCF
|
|
434
|
+
else:
|
|
435
|
+
raise ValueError("Only .tbi or .csi indexes are supported.")
|
|
429
436
|
|
|
430
437
|
self.vcf = cyvcf2.VCF(vcf_path)
|
|
431
|
-
self.
|
|
438
|
+
if self.index_path is not None:
|
|
439
|
+
self.vcf.set_index(str(self.index_path))
|
|
440
|
+
|
|
432
441
|
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
|
|
433
442
|
self.sequence_names = None
|
|
434
443
|
|
|
444
|
+
self.index = None
|
|
435
445
|
if self.index_type == VcfIndexType.CSI:
|
|
436
446
|
# Determine the file-type based on the "aux" field.
|
|
437
447
|
self.index = read_csi(self.index_path)
|
|
@@ -441,9 +451,17 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
441
451
|
self.sequence_names = self.index.parse_vcf_aux()
|
|
442
452
|
else:
|
|
443
453
|
self.sequence_names = self.vcf.seqnames
|
|
444
|
-
|
|
454
|
+
elif self.index_type == VcfIndexType.TABIX:
|
|
445
455
|
self.index = read_tabix(self.index_path)
|
|
456
|
+
self.file_type = VcfFileType.VCF
|
|
446
457
|
self.sequence_names = self.index.sequence_names
|
|
458
|
+
else:
|
|
459
|
+
assert self.index is None
|
|
460
|
+
var = next(self.vcf)
|
|
461
|
+
self.sequence_names = [var.CHROM]
|
|
462
|
+
self.vcf.close()
|
|
463
|
+
# There doesn't seem to be a way to reset the iterator
|
|
464
|
+
self.vcf = cyvcf2.VCF(vcf_path)
|
|
447
465
|
|
|
448
466
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
449
467
|
if self.vcf is not None:
|
|
@@ -452,6 +470,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
452
470
|
return False
|
|
453
471
|
|
|
454
472
|
def contig_record_counts(self):
|
|
473
|
+
if self.index is None:
|
|
474
|
+
return {self.sequence_names[0]: RECORD_COUNT_UNKNOWN}
|
|
455
475
|
d = dict(zip(self.sequence_names, self.index.record_counts))
|
|
456
476
|
if self.file_type == VcfFileType.BCF:
|
|
457
477
|
d = {k: v for k, v in d.items() if v > 0}
|
|
@@ -460,12 +480,21 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
460
480
|
def count_variants(self, region):
|
|
461
481
|
return sum(1 for _ in self.variants(region))
|
|
462
482
|
|
|
463
|
-
def variants(self, region):
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
483
|
+
def variants(self, region=None):
|
|
484
|
+
if self.index is None:
|
|
485
|
+
contig = self.sequence_names[0]
|
|
486
|
+
if region is not None:
|
|
487
|
+
assert region.contig == contig
|
|
488
|
+
for var in self.vcf:
|
|
489
|
+
if var.CHROM != contig:
|
|
490
|
+
raise ValueError("Multi-contig VCFs must be indexed")
|
|
468
491
|
yield var
|
|
492
|
+
else:
|
|
493
|
+
start = 1 if region.start is None else region.start
|
|
494
|
+
for var in self.vcf(str(region)):
|
|
495
|
+
# Need to filter because of indels overlapping the region
|
|
496
|
+
if var.POS >= start:
|
|
497
|
+
yield var
|
|
469
498
|
|
|
470
499
|
def _filter_empty_and_refine(self, regions):
|
|
471
500
|
"""
|
|
@@ -505,6 +534,9 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
505
534
|
if target_part_size_bytes < 1:
|
|
506
535
|
raise ValueError("target_part_size must be positive")
|
|
507
536
|
|
|
537
|
+
if self.index is None:
|
|
538
|
+
return [Region(self.sequence_names[0])]
|
|
539
|
+
|
|
508
540
|
# Calculate the desired part file boundaries
|
|
509
541
|
file_length = os.stat(self.vcf_path).st_size
|
|
510
542
|
if num_parts is not None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
6
|
License: Apache License
|
|
@@ -241,6 +241,7 @@ Requires-Dist: pytest-coverage; extra == "dev"
|
|
|
241
241
|
Requires-Dist: pytest-xdist; extra == "dev"
|
|
242
242
|
Requires-Dist: sgkit>=0.8.0; extra == "dev"
|
|
243
243
|
Requires-Dist: tqdm; extra == "dev"
|
|
244
|
+
Dynamic: license-file
|
|
244
245
|
|
|
245
246
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
246
247
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|