bio2zarr 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of bio2zarr might be problematic.
- bio2zarr/__init__.py +1 -1
- bio2zarr/__main__.py +2 -0
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +4 -8
- bio2zarr/core.py +5 -6
- bio2zarr/plink.py +6 -8
- bio2zarr/typing.py +1 -1
- bio2zarr/vcf.py +93 -77
- bio2zarr/vcf_utils.py +26 -8
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/METADATA +1 -1
- bio2zarr-0.0.5.dist-info/RECORD +16 -0
- bio2zarr-0.0.4.dist-info/RECORD +0 -16
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/top_level.txt +0 -0
bio2zarr/__init__.py
CHANGED

@@ -1 +1 @@
-from .
+from .provenance import __version__ # noqa F401
bio2zarr/__main__.py
CHANGED

@@ -2,11 +2,13 @@ import click
 
 from . import cli
 
+
 @cli.version
 @click.group()
 def bio2zarr():
     pass
 
+
 # Provide a single top-level interface to all of the functionality.
 # This probably isn't the recommended way of interacting, as we
 # install individual commands as console scripts. However, this
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED

@@ -4,15 +4,11 @@ import pathlib
 import shutil
 
 import click
-import tabulate
 import coloredlogs
 import numcodecs
+import tabulate
 
-from . import vcf
-from . import vcf_utils
-from . import plink
-from . import provenance
-
+from . import plink, provenance, vcf, vcf_utils
 
 logger = logging.getLogger(__name__)
 

@@ -75,7 +71,7 @@ compressor = click.option(
     "--compressor",
     type=click.Choice(["lz4", "zstd"]),
     default=None,
-    help="Codec to use for compressing column chunks (Default=zstd)."
+    help="Codec to use for compressing column chunks (Default=zstd).",
 )
 
 # Note: -l and -w were chosen when these were called "width" and "length".

@@ -207,7 +203,7 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=
+    vcf.explode_partition(icf_path, partition, show_progress=False)
 
 
 @click.command
bio2zarr/core.py
CHANGED

@@ -1,16 +1,15 @@
-import dataclasses
-import contextlib
 import concurrent.futures as cf
+import contextlib
+import dataclasses
+import logging
 import multiprocessing
 import threading
-import logging
 import time
 
-import
+import numcodecs
 import numpy as np
 import tqdm
-import
-
+import zarr
 
 logger = logging.getLogger(__name__)
 
bio2zarr/plink.py
CHANGED

@@ -1,14 +1,13 @@
 import logging
 
+import bed_reader
 import humanfriendly
+import numcodecs
 import numpy as np
 import zarr
-import bed_reader
-import numcodecs
 
 from . import core
 
-
 logger = logging.getLogger(__name__)
 
 

@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
     variants_chunk_size = gt.array.chunks[0]
-    n = gt.array.shape[1]
     assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")

@@ -96,7 +94,7 @@ def convert(
         chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug(
+    logger.debug("Encoded samples")
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file

@@ -108,7 +106,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug(
+    logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(

@@ -119,7 +117,7 @@ def convert(
         chunks=(variants_chunk_size,),
    )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug(
+    logger.debug("encoded variant_allele")
 
     # TODO remove this?
     a = root.empty(

@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
         elif bed_call == 2:
             assert list(zarr_call) == [1, 1]
         else:  # pragma no cover
-
+            raise AssertionError(f"Unexpected bed call {bed_call}")
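The new `raise AssertionError(...)` in `validate` replaces a bare assertion (truncated in the diff above). The distinction matters because `assert` statements are removed entirely when Python runs with the `-O` flag, while an explicit raise always fires. A minimal sketch of the pattern; the function and accepted values here are illustrative, not bio2zarr's API:

    def classify_call(bed_call):
        if bed_call in (0, 1, 2):
            return bed_call
        else:  # pragma no cover
            # Unlike `assert False`, this still fires under `python -O`,
            # where assert statements are stripped out.
            raise AssertionError(f"Unexpected bed call {bed_call}")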
bio2zarr/typing.py
CHANGED
bio2zarr/vcf.py
CHANGED

@@ -1,29 +1,27 @@
 import collections
+import contextlib
 import dataclasses
 import functools
+import json
 import logging
+import math
 import os
 import pathlib
 import pickle
-import sys
 import shutil
-import
-import math
+import sys
 import tempfile
-import contextlib
 from typing import Any, List
 
-import humanfriendly
 import cyvcf2
+import humanfriendly
 import numcodecs
 import numpy as np
 import numpy.testing as nt
 import tqdm
 import zarr
 
-from . import core
-from . import provenance
-from . import vcf_utils
+from . import core, provenance, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -284,9 +282,25 @@
     return metadata, vcf.raw_header
 
 
+def check_overlap(partitions):
+    for i in range(1, len(partitions)):
+        prev_partition = partitions[i - 1]
+        current_partition = partitions[i]
+        if (
+            prev_partition.region.contig == current_partition.region.contig
+            and prev_partition.region.end > current_partition.region.start
+        ):
+            raise ValueError(
+                f"Multiple VCFs have the region "
+                f"{prev_partition.region.contig}:{prev_partition.region.start}-"
+                f"{current_partition.region.end}"
+            )
+
+
 def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     logger.info(
-        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}
+        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
+        f" partitions."
     )
     # An easy mistake to make is to pass the same file twice. Check this early on.
     for path, count in collections.Counter(paths).items():

@@ -331,6 +345,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
+    check_overlap(all_partitions)
     icf_metadata.partitions = all_partitions
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
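`check_overlap` relies on the partitions having just been sorted by (contig, start), so a single linear pass comparing each region with its predecessor is enough to detect any overlap between the input VCFs. A self-contained sketch of the same idea, using an illustrative `Region` tuple rather than the library's own region class:

    from typing import NamedTuple

    class Region(NamedTuple):  # illustrative stand-in
        contig: str
        start: int
        end: int

    def check_overlap(regions):
        # Assumes regions are sorted by (contig, start): one pass suffices.
        for prev, cur in zip(regions, regions[1:]):
            if prev.contig == cur.contig and prev.end > cur.start:
                raise ValueError(f"Overlap at {prev.contig}:{cur.start}-{prev.end}")

    check_overlap([Region("1", 1, 100), Region("1", 100, 200)])  # fine: half-open
    try:
        check_overlap([Region("1", 1, 150), Region("1", 100, 200)])
    except ValueError as e:
        print(e)  # Overlap at 1:100-150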
@@ -791,6 +806,8 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         for vcf_field in icf_metadata.fields:
             field_path = get_vcf_field_path(out_path, vcf_field)
             field_partition_path = field_path / f"p{partition_index}"
+            # Should be robust to running explode_partition twice.
+            field_partition_path.mkdir(exist_ok=True)
             transformer = VcfValueTransformer.factory(vcf_field, num_samples)
             self.field_writers[vcf_field.full_name] = IcfFieldWriter(
                 vcf_field,
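Passing `exist_ok=True` is what makes re-running `explode_partition` safe: creating a directory that already exists becomes a no-op instead of raising `FileExistsError`. For example:

    import pathlib
    import tempfile

    part = pathlib.Path(tempfile.mkdtemp()) / "p0"
    part.mkdir(exist_ok=True)  # first run creates the directory
    part.mkdir(exist_ok=True)  # second run is a no-op, not FileExistsError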
@@ -832,7 +849,7 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
            partition.num_records for partition in self.metadata.partitions
        ]
        # Allow us to find which partition a given record is in
-        self.partition_record_index = np.cumsum([0
+        self.partition_record_index = np.cumsum([0, *partition_num_records])
        for field in self.metadata.fields:
            self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
        logger.info(
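Prefixing the cumulative sum with 0 turns the per-partition record counts into start offsets, so the partition containing a given global record index can be found with a binary search. A small sketch of that use; the `partition_of` helper is illustrative, since the diff only shows the offsets being built:

    import numpy as np

    partition_num_records = [100, 250, 50]
    # partition_record_index[i] is the first global record in partition i.
    partition_record_index = np.cumsum([0, *partition_num_records])
    # -> array([  0, 100, 350, 400])

    def partition_of(record_index):
        # side="right" puts record 100 in partition 1, not partition 0.
        return int(np.searchsorted(partition_record_index, record_index, side="right")) - 1

    assert partition_of(0) == 0
    assert partition_of(99) == 0
    assert partition_of(100) == 1
    assert partition_of(399) == 2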
@@ -842,7 +859,8 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)},
+            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )
 
@@ -890,15 +908,6 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         return len(self.columns)
 
 
-
-def mkdir_with_progress(path):
-    logger.debug(f"mkdir f{path}")
-    # NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
-    # parents=True will take care of it.
-    path.mkdir(parents=True)
-    core.update_progress(1)
-
-
 class IntermediateColumnarFormatWriter:
     def __init__(self, path):
         self.path = pathlib.Path(path)

@@ -941,45 +950,29 @@ class IntermediateColumnarFormatWriter:
         # dependencies as well.
         self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
 
-        self.mkdirs(
+        self.mkdirs()
 
         # Note: this is needed for the current version of the vcfzarr spec, but it's
         # probably going to be dropped.
         # https://github.com/pystatgen/vcf-zarr-spec/issues/15
         # May be useful to keep lying around still though?
-        logger.info(
+        logger.info("Writing VCF header")
         with open(self.path / "header.txt", "w") as f:
             f.write(header)
 
-        logger.info(
+        logger.info("Writing WIP metadata")
         with open(self.wip_path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
         return self.num_partitions
 
-    def mkdirs(self
-        num_dirs = len(self.metadata.fields)
-        logger.info(f"Creating {num_dirs} directories")
+    def mkdirs(self):
+        num_dirs = len(self.metadata.fields)
+        logger.info(f"Creating {num_dirs} field directories")
         self.path.mkdir()
         self.wip_path.mkdir()
-
-
-
-            total=num_dirs,
-            units="dirs",
-            title="Mkdirs",
-            show=show_progress,
-        )
-        with core.ParallelWorkManager(
-            worker_processes=worker_processes, progress_config=progress_config
-        ) as manager:
-            for field in self.metadata.fields:
-                col_path = get_vcf_field_path(self.path, field)
-                # Don't bother trying to count the intermediate directories towards
-                # progress
-                manager.submit(col_path.mkdir, parents=True)
-                for j in range(self.num_partitions):
-                    part_path = col_path / f"p{j}"
-                    manager.submit(mkdir_with_progress, part_path)
+        for field in self.metadata.fields:
+            col_path = get_vcf_field_path(self.path, field)
+            col_path.mkdir(parents=True)
 
     def load_partition_summaries(self):
         summaries = []

@@ -995,13 +988,14 @@ class IntermediateColumnarFormatWriter:
                 not_found.append(j)
         if len(not_found) > 0:
             raise FileNotFoundError(
-                f"Partition metadata not found for {len(not_found)}
+                f"Partition metadata not found for {len(not_found)}"
+                f" partitions: {not_found}"
             )
         return summaries
 
     def load_metadata(self):
         if self.metadata is None:
-            with open(self.wip_path /
+            with open(self.wip_path / "metadata.json") as f:
                 self.metadata = IcfMetadata.fromdict(json.load(f))
 
     def process_partition(self, partition_index):
@@ -1050,12 +1044,14 @@ class IntermediateColumnarFormatWriter:
             for field in format_fields:
                 val = variant.format(field.name)
                 tcw.append(field.full_name, val)
-            # Note: an issue with updating the progress per variant here like
-            # is that we get a significant pause at the end of the counter
-            # all the "small" fields get flushed. Possibly not much to be
+            # Note: an issue with updating the progress per variant here like
+            # this is that we get a significant pause at the end of the counter
+            # while all the "small" fields get flushed. Possibly not much to be
+            # done about it.
             core.update_progress(1)
         logger.info(
-            f"Finished reading VCF for partition {partition_index},
+            f"Finished reading VCF for partition {partition_index}, "
+            f"flushing buffers"
         )
 
         partition_metadata = {

@@ -1137,11 +1133,11 @@ class IntermediateColumnarFormatWriter:
         for summary in partition_summaries:
             field.summary.update(summary["field_summaries"][field.full_name])
 
-        logger.info(
+        logger.info("Finalising metadata")
         with open(self.path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
 
-        logger.debug(
+        logger.debug("Removing WIP directory")
         shutil.rmtree(self.wip_path)
 
 

@@ -1155,7 +1151,7 @@ def explode(
     compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
-
+    writer.init(
         vcfs,
         # Heuristic to get reasonable worker utilisation with lumpy partition sizing
         target_num_partitions=max(1, worker_processes * 4),
@@ -1226,20 +1222,25 @@ class ZarrColumnSpec:
     dtype: str
     shape: tuple
     chunks: tuple
-    dimensions:
+    dimensions: tuple
     description: str
     vcf_field: str
-    compressor: dict
-    filters: list
-    # TODO add filters
+    compressor: dict
+    filters: list
 
     def __post_init__(self):
+        # Ensure these are tuples for ease of comparison and consistency
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
-
-
-
+
+    @staticmethod
+    def new(**kwargs):
+        spec = ZarrColumnSpec(
+            **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
+        )
+        spec._choose_compressor_settings()
+        return spec
 
     def _choose_compressor_settings(self):
         """
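`ZarrColumnSpec.new` is a small factory: it injects the default compressor configuration and an empty filter list, then lets the freshly built spec adjust its own compressor settings, so the call sites below no longer repeat those defaults. A reduced sketch of the pattern with a toy dataclass; the constant and the tuning hook body are stand-ins:

    import dataclasses

    DEFAULT_COMPRESSOR_CONFIG = {"id": "blosc", "cname": "zstd"}  # stand-in

    @dataclasses.dataclass
    class ColumnSpec:
        name: str
        dtype: str
        compressor: dict
        filters: list

        @staticmethod
        def new(**kwargs):
            # Centralise the defaults, then post-process the instance.
            spec = ColumnSpec(
                **kwargs, compressor=dict(DEFAULT_COMPRESSOR_CONFIG), filters=[]
            )
            spec._choose_compressor_settings()
            return spec

        def _choose_compressor_settings(self):
            pass  # per-column tuning would go here

    spec = ColumnSpec.new(name="variant_position", dtype="i4")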
@@ -1315,7 +1316,7 @@ class VcfZarrSchema:
        def fixed_field_spec(
            name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
        ):
-            return ZarrColumnSpec(
+            return ZarrColumnSpec.new(
                vcf_field=vcf_field,
                name=name,
                dtype=dtype,
@@ -1383,14 +1384,23 @@ class VcfZarrSchema:
            if field.category == "FORMAT":
                prefix = "call_"
                shape.append(n)
-                chunks.append(samples_chunk_size)
+                chunks.append(samples_chunk_size)
                dimensions.append("samples")
            # TODO make an option to add in the empty extra dimension
            if field.summary.max_number > 1:
                shape.append(field.summary.max_number)
-                dimensions
+                # TODO we should really be checking this to see if the named dimensions
+                # are actually correct.
+                if field.vcf_number == "R":
+                    dimensions.append("alleles")
+                elif field.vcf_number == "A":
+                    dimensions.append("alt_alleles")
+                elif field.vcf_number == "G":
+                    dimensions.append("genotypes")
+                else:
+                    dimensions.append(f"{field.category}_{field.name}_dim")
            variable_name = prefix + field.name
-            colspec = ZarrColumnSpec(
+            colspec = ZarrColumnSpec.new(
                vcf_field=field.full_name,
                name=variable_name,
                dtype=field.smallest_dtype(),
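The `Number` codes in the VCF spec carry structural meaning: `R` means one value per allele (REF plus each ALT), `A` one per ALT allele, and `G` one per possible genotype; any other count falls back to a field-specific dimension name here. The mapping, pulled out as a standalone function for clarity (not part of the library's API):

    def dimension_name(category, name, vcf_number):
        # VCF Number codes: R = per allele (REF + ALTs), A = per ALT allele,
        # G = per genotype. Anything else gets a field-specific dimension.
        if vcf_number == "R":
            return "alleles"
        elif vcf_number == "A":
            return "alt_alleles"
        elif vcf_number == "G":
            return "genotypes"
        return f"{category}_{name}_dim"

    assert dimension_name("FORMAT", "AD", "R") == "alleles"
    assert dimension_name("INFO", "AC", "A") == "alt_alleles"
    assert dimension_name("FORMAT", "PL", "G") == "genotypes"
    assert dimension_name("FORMAT", "XX", "2") == "FORMAT_XX_dim"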
@@ -1408,7 +1418,7 @@ class VcfZarrSchema:
        dimensions = ["variants", "samples"]

        colspecs.append(
-            ZarrColumnSpec(
+            ZarrColumnSpec.new(
                vcf_field=None,
                name="call_genotype_phased",
                dtype="bool",

@@ -1421,7 +1431,7 @@ class VcfZarrSchema:
        shape += [ploidy]
        dimensions += ["ploidy"]
        colspecs.append(
-            ZarrColumnSpec(
+            ZarrColumnSpec.new(
                vcf_field=None,
                name="call_genotype",
                dtype=gt_field.smallest_dtype(),

@@ -1432,7 +1442,7 @@ class VcfZarrSchema:
            )
        )
        colspecs.append(
-            ZarrColumnSpec(
+            ZarrColumnSpec.new(
                vcf_field=None,
                name="call_genotype_mask",
                dtype="bool",

@@ -1514,7 +1524,9 @@ class VcfZarrWriter:
        self.schema = schema
        # Default to using nested directories following the Zarr v3 default.
        # This seems to require version 2.17+ to work properly
-        self.dimension_separator =
+        self.dimension_separator = (
+            "/" if dimension_separator is None else dimension_separator
+        )
        store = zarr.DirectoryStore(self.path)
        self.root = zarr.group(store=store)

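With "/" as the dimension separator, chunk keys are written as nested paths (`a/0/0`) rather than flat dotted names (`a/0.0`), matching the Zarr v3 layout; as the comment notes, this appears to need zarr 2.17+ to work properly. A hedged sketch of the effect, assuming a zarr 2.x where array creation accepts `dimension_separator`:

    import zarr

    store = zarr.DirectoryStore("example.zarr")
    root = zarr.group(store=store)
    # Chunks land at example.zarr/a/0/0, example.zarr/a/0/1, ...
    # instead of example.zarr/a/0.0, example.zarr/a/0.1, ...
    a = root.zeros("a", shape=(100, 100), chunks=(50, 50), dimension_separator="/")
    a[:] = 1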
@@ -1624,7 +1636,9 @@
                try:
                    var_filter.buff[j, lookup[f]] = True
                except KeyError:
-                    raise ValueError(
+                    raise ValueError(
+                        f"Filter '{f}' was not defined " f"in the header."
+                    ) from None
            var_filter.flush()
        logger.debug(f"Encoded FILTERS slice {start}:{stop}")

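The `from None` suffix suppresses implicit exception chaining, so users see only the friendly `ValueError` rather than the internal `KeyError` followed by "During handling of the above exception, another exception occurred". The same idiom in isolation:

    lookup = {"PASS": 0}

    def filter_index(name):
        try:
            return lookup[name]
        except KeyError:
            # Without "from None", the traceback would also show the KeyError.
            raise ValueError(f"Filter '{name}' was not defined in the header.") from None

    try:
        filter_index("q10")
    except ValueError as e:
        print(e)  # Filter 'q10' was not defined in the header.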
@@ -1727,7 +1741,8 @@
            variant_chunk_size = array.blocks[0].nbytes
            encoding_memory_requirements[col.name] = variant_chunk_size
            logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)}
+                f"{col.name} requires at least {display_size(variant_chunk_size)} "
+                f"per worker"
            )
            total_bytes += array.nbytes

@@ -1836,8 +1851,9 @@
                or len(future_to_work) > max_queued
            ):
                logger.debug(
-                    f"Wait: mem_required={used_memory + wp.memory}
-                    f"queued={len(future_to_work)}
+                    f"Wait: mem_required={used_memory + wp.memory} "
+                    f"max_mem={max_memory} queued={len(future_to_work)} "
+                    f"max_queued={max_queued}"
                )
                service_completed_futures()
            future = pwm.submit(wp.func, wp.start, wp.stop)

@@ -1881,7 +1897,7 @@ def encode(
            raise ValueError(
                "Cannot specify schema along with chunk sizes"
            )  # NEEDS TEST
-        with open(schema_path
+        with open(schema_path) as f:
            schema = VcfZarrSchema.fromjson(f.read())
    zarr_path = pathlib.Path(zarr_path)
    if zarr_path.exists():

@@ -1962,7 +1978,7 @@ def assert_all_fill(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_fill_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015


def assert_all_missing(zarr_val, vcf_type):

@@ -1975,7 +1991,7 @@ def assert_all_missing(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_missing_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015


def assert_info_val_missing(zarr_val, vcf_type):

@@ -2114,7 +2130,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
        assert vid[j] == ("." if row.ID is None else row.ID)
        assert allele[j, 0] == row.REF
        k = len(row.ALT)
-        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
+        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS

bio2zarr/vcf_utils.py
CHANGED

@@ -1,14 +1,14 @@
-from typing import IO, Any, Dict, Optional, Sequence, Union
 import contextlib
-import struct
-import pathlib
 import gzip
-from dataclasses import dataclass
 import os
+import pathlib
+import struct
+from dataclasses import dataclass
+from typing import IO, Any, Dict, Optional, Sequence, Union
 
-import numpy as np
 import cyvcf2
 import humanfriendly
+import numpy as np
 
 from bio2zarr.typing import PathType
 

@@ -38,7 +38,8 @@ def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> A
    fmt : str
        A Python `struct` format string.
    nodata : Optional[Any], optional
-        The value to return in case there is no further data in the stream,
+        The value to return in case there is no further data in the stream,
+        by default None

    Returns
    -------

@@ -277,7 +278,8 @@ class TabixIndex:
        # Create file offsets for each element in the linear index
        file_offsets = np.array([get_file_offset(vfp) for vfp in linear_index])

-        # Calculate corresponding contigs and positions or each element in
+        # Calculate corresponding contigs and positions or each element in
+        # the linear index
        contig_indexes = np.hstack(
            [np.full(len(li), i) for (i, li) in enumerate(linear_indexes)]
        )
@@ -433,6 +435,22 @@ class IndexedVcf(contextlib.AbstractContextManager):
                if var.POS >= start:
                    yield var

+    def _filter_empty(self, regions):
+        """
+        Return all regions in the specified list that have one or more records.
+
+        Sometimes with Tabix indexes these seem to crop up:
+
+        - https://github.com/sgkit-dev/bio2zarr/issues/45
+        - https://github.com/sgkit-dev/bio2zarr/issues/120
+        """
+        ret = []
+        for region in regions:
+            variants = self.variants(region)
+            if next(variants, None) is not None:
+                ret.append(region)
+        return ret
+
    def partition_into_regions(
        self,
        num_parts: Optional[int] = None,
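`_filter_empty` checks for emptiness by pulling at most one variant per region: `next(iterator, None)` returns `None` only when the iterator is exhausted, so a non-empty region costs a single record read. The peek idiom on its own:

    def has_records(iterator):
        # Consumes at most one item; None signals an empty iterator.
        return next(iterator, None) is not None

    assert has_records(iter([1, 2, 3]))
    assert not has_records(iter([]))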
@@ -509,4 +527,4 @@ class IndexedVcf(contextlib.AbstractContextManager):
            if self.index.record_counts[ri] > 0:
                regions.append(Region(self.sequence_names[ri]))

-        return regions
+        return self._filter_empty(regions)
bio2zarr-0.0.5.dist-info/RECORD
ADDED

@@ -0,0 +1,16 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=hO4vV-kPFgsYq0NQwG2r-WkserPL27oqae_tUvNB7yE,527
+bio2zarr/_version.py,sha256=EJB7__SNK9kQS_SWZB_U4DHJ3P8ftF6etZEihTYnuXE,411
+bio2zarr/cli.py,sha256=k63xex-tQkogAlJ3N68Ikx8LqZrksXbZB2s6Z7h-zXc,11446
+bio2zarr/core.py,sha256=reF9elN1dwmCoXXLgci-y5pXmAm3fTntmomHTRcG54g,8127
+bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+bio2zarr/vcf.py,sha256=GFnwR2YP-cHU4tfHloRjyiBK9-xXDgXcAM_tz-w2qck,74324
+bio2zarr/vcf_utils.py,sha256=r3NQXxWK1SYU7CcwDzSWXdX5Q8Ixk7gdCTEiFPzfUAk,17307
+bio2zarr-0.0.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.0.5.dist-info/METADATA,sha256=SasGYcKSRb7NqnYR98ODFvPEMdBNdpxWx5gqOt038QU,1077
+bio2zarr-0.0.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+bio2zarr-0.0.5.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
+bio2zarr-0.0.5.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.0.5.dist-info/RECORD,,
bio2zarr-0.0.4.dist-info/RECORD
DELETED

@@ -1,16 +0,0 @@
-bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
-bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
-bio2zarr/_version.py,sha256=yBVOKdXLEcTVc7YV7ZPqRXhRDRt-pKrfXxcgHkgPY5g,411
-bio2zarr/cli.py,sha256=QE0DfoZHbBbxq9K_im9y4tJ49_Wss0zzavSjjz-85Xw,11484
-bio2zarr/core.py,sha256=tZb9exfFmuzbA8tUpPY8avSm9YvfH31-vUCTM4fpj78,8128
-bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
-bio2zarr/vcf.py,sha256=MEskVTDq4QntzoawPz0sfmInV0aPkIPLXXNv7GmVcmY,73870
-bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
-bio2zarr-0.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.0.4.dist-info/METADATA,sha256=DISckjzZ0b6FpBTfBvpmJmEe00SIdTHyB3UTsTR8rws,1077
-bio2zarr-0.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-bio2zarr-0.0.4.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
-bio2zarr-0.0.4.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.0.4.dist-info/RECORD,,
{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/LICENSE
File without changes

{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/WHEEL
File without changes

{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/entry_points.txt
File without changes

{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.5.dist-info}/top_level.txt
File without changes