bio2zarr 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/__init__.py +1 -1
- bio2zarr/__main__.py +2 -0
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +42 -14
- bio2zarr/core.py +7 -7
- bio2zarr/plink.py +6 -8
- bio2zarr/typing.py +1 -1
- bio2zarr/vcf.py +136 -87
- bio2zarr/vcf_utils.py +26 -8
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/METADATA +2 -2
- bio2zarr-0.0.5.dist-info/RECORD +16 -0
- bio2zarr-0.0.3.dist-info/RECORD +0 -16
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/top_level.txt +0 -0
bio2zarr/__init__.py
CHANGED
@@ -1 +1 @@
-from .provenance import __version__
+from .provenance import __version__ # noqa F401
bio2zarr/__main__.py
CHANGED
@@ -2,11 +2,13 @@ import click
 
 from . import cli
 
+
 @cli.version
 @click.group()
 def bio2zarr():
     pass
 
+
 # Provide a single top-level interface to all of the functionality.
 # This probably isn't the recommended way of interacting, as we
 # install individual commands as console scripts. However, this
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
@@ -4,14 +4,11 @@ import pathlib
 import shutil
 
 import click
-import tabulate
 import coloredlogs
+import numcodecs
+import tabulate
 
-from . import vcf
-from . import vcf_utils
-from . import plink
-from . import provenance
-
+from . import plink, provenance, vcf, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -66,6 +63,17 @@ column_chunk_size = click.option(
     help="Approximate uncompressed size of exploded column chunks in MiB",
 )
 
+# We could provide the full flexiblity of numcodecs/Blosc here, but there
+# doesn't seem much point. Can always add more arguments here to control
+# compression level, etc.
+compressor = click.option(
+    "-C",
+    "--compressor",
+    type=click.Choice(["lz4", "zstd"]),
+    default=None,
+    help="Codec to use for compressing column chunks (Default=zstd).",
+)
+
 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
 variants_chunk_size = click.option(
@@ -113,24 +121,36 @@ def check_overwrite_dir(path, force):
         shutil.rmtree(tmp_delete_path)
 
 
+def get_compressor(cname):
+    if cname is None:
+        return None
+    config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+    config["cname"] = cname
+    return numcodecs.get_codec(config)
+
+
 @click.command
 @vcfs
 @new_icf_path
 @force
 @verbose
-@worker_processes
 @column_chunk_size
-def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
+@compressor
+@worker_processes
+def explode(
+    vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
+):
     """
     Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
     vcf.explode(
-        vcfs,
         icf_path,
+        vcfs,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
+        compressor=get_compressor(compressor),
        show_progress=True,
     )
 
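A note on the new -C/--compressor option: it deliberately exposes only the codec name, and get_compressor clones the configuration of vcf.ICF_DEFAULT_COMPRESSOR before swapping in the requested cname, so compression level and shuffle stay at their defaults. A minimal standalone sketch of that behaviour, inlining the default compressor that is defined later in this diff:

    import numcodecs

    # Mirrors ICF_DEFAULT_COMPRESSOR in bio2zarr/vcf.py as of 0.0.5.
    ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
        cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
    )

    def get_compressor(cname):
        if cname is None:
            return None  # callers substitute the default downstream
        config = ICF_DEFAULT_COMPRESSOR.get_config()
        config["cname"] = cname
        return numcodecs.get_codec(config)

    print(get_compressor("lz4"))
    # Blosc(cname='lz4', clevel=7, shuffle=NOSHUFFLE, blocksize=0)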
@@ -141,10 +161,18 @@ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size)
 @click.argument("num_partitions", type=click.IntRange(min=1))
 @force
 @column_chunk_size
+@compressor
 @verbose
 @worker_processes
 def dexplode_init(
-    vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
+    vcfs,
+    icf_path,
+    num_partitions,
+    force,
+    column_chunk_size,
+    compressor,
+    verbose,
+    worker_processes,
 ):
     """
     Initial step for distributed conversion of VCF(s) to intermediate columnar format
@@ -158,6 +186,7 @@ def dexplode_init(
         target_num_partitions=num_partitions,
         column_chunk_size=column_chunk_size,
         worker_processes=worker_processes,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
     click.echo(num_partitions)
@@ -174,7 +203,7 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=True)
+    vcf.explode_partition(icf_path, partition, show_progress=False)
 
 
 @click.command
@@ -232,9 +261,8 @@ def mkschema(icf_path):
 @click.option(
     "-M",
     "--max-memory",
-    type=int,
     default=None,
-    help="An approximate bound on overall memory usage
+    help="An approximate bound on overall memory usage (e.g. 10G),",
 )
 @worker_processes
 def encode(
@@ -250,7 +278,7 @@ def encode(
     worker_processes,
 ):
     """
-
+    Convert intermediate columnar format to vcfzarr.
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
bio2zarr/core.py
CHANGED
@@ -1,16 +1,15 @@
-import dataclasses
-import contextlib
 import concurrent.futures as cf
+import contextlib
+import dataclasses
+import logging
 import multiprocessing
 import threading
-import logging
 import time
 
-import zarr
+import numcodecs
 import numpy as np
 import tqdm
-import numcodecs
-
+import zarr
 
 logger = logging.getLogger(__name__)
 
@@ -50,7 +49,8 @@ def wait_on_futures(futures):
             cancel_futures(futures)
             if isinstance(exception, cf.process.BrokenProcessPool):
                 raise RuntimeError(
-                    "Worker process died: you may have run out of memory")
+                    "Worker process died: you may have run out of memory"
+                ) from exception
             else:
                 raise exception
 
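The only behavioural change in core.py is the added "from exception": chaining marks the BrokenProcessPool as the direct cause of the RuntimeError, so the original traceback is reported as the cause rather than as an incidental error during handling. A small illustration of the difference (hypothetical names, not bio2zarr code):

    def worker():
        raise MemoryError("worker died")

    try:
        try:
            worker()
        except MemoryError as exc:
            # With "from exc": "The above exception was the direct cause of ..."
            # Without it: "During handling of the above exception, another
            # exception occurred"
            raise RuntimeError("you may have run out of memory") from exc
    except RuntimeError as err:
        print(err.__cause__)  # worker died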
bio2zarr/plink.py
CHANGED
@@ -1,14 +1,13 @@
 import logging
 
+import bed_reader
 import humanfriendly
+import numcodecs
 import numpy as np
 import zarr
-import bed_reader
-import numcodecs
 
 from . import core
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
     variants_chunk_size = gt.array.chunks[0]
-    n = gt.array.shape[1]
     assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
         chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug(f"Encoded samples")
+    logger.debug("Encoded samples")
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug(f"encoded variant_position")
+    logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(
@@ -119,7 +117,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug(f"encoded variant_allele")
+    logger.debug("encoded variant_allele")
 
     # TODO remove this?
     a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
             elif bed_call == 2:
                 assert list(zarr_call) == [1, 1]
             else:  # pragma no cover
-                assert False
+                raise AssertionError(f"Unexpected bed call {bed_call}")
bio2zarr/typing.py
CHANGED
bio2zarr/vcf.py
CHANGED
@@ -1,29 +1,27 @@
 import collections
+import contextlib
 import dataclasses
 import functools
+import json
 import logging
+import math
 import os
 import pathlib
 import pickle
-import sys
 import shutil
-import json
-import math
+import sys
 import tempfile
-import contextlib
 from typing import Any, List
 
-import humanfriendly
 import cyvcf2
+import humanfriendly
 import numcodecs
 import numpy as np
 import numpy.testing as nt
 import tqdm
 import zarr
 
-from . import core
-from . import provenance
-from . import vcf_utils
+from . import core, provenance, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -151,8 +149,8 @@ class VcfPartition:
 
 ICF_METADATA_FORMAT_VERSION = "0.2"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
-    cname="
-)
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+)
 
 
 @dataclasses.dataclass
@@ -284,11 +282,25 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def scan_vcfs(
-    paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
-):
+def check_overlap(partitions):
+    for i in range(1, len(partitions)):
+        prev_partition = partitions[i - 1]
+        current_partition = partitions[i]
+        if (
+            prev_partition.region.contig == current_partition.region.contig
+            and prev_partition.region.end > current_partition.region.start
+        ):
+            raise ValueError(
+                f"Multiple VCFs have the region "
+                f"{prev_partition.region.contig}:{prev_partition.region.start}-"
+                f"{current_partition.region.end}"
+            )
+
+
+def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     logger.info(
-        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
+        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
+        f" partitions."
     )
     # An easy mistake to make is to pass the same file twice. Check this early on.
     for path, count in collections.Counter(paths).items():
@@ -333,13 +345,8 @@ def scan_vcfs(
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
+    check_overlap(all_partitions)
     icf_metadata.partitions = all_partitions
-    icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
-    icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
-    icf_metadata.column_chunk_size = column_chunk_size
-    # Bare minimum here for provenance - would be nice to include versions of key
-    # dependencies as well.
-    icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
 
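check_overlap relies on the partitions having just been sorted by (contig, start), so a single pass comparing each partition with its predecessor is enough to detect any overlap between the input VCFs. A toy reproduction with stand-in dataclasses (hypothetical, for illustration only):

    import dataclasses

    @dataclasses.dataclass
    class Region:
        contig: str
        start: int
        end: int

    @dataclasses.dataclass
    class Partition:
        region: Region

    parts = [
        Partition(Region("chr1", 1, 1000)),
        Partition(Region("chr1", 900, 2000)),  # starts before predecessor ends
    ]
    prev, cur = parts[0].region, parts[1].region
    assert prev.contig == cur.contig and prev.end > cur.start
    # bio2zarr raises: ValueError("Multiple VCFs have the region chr1:1-2000")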
@@ -799,6 +806,8 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         for vcf_field in icf_metadata.fields:
             field_path = get_vcf_field_path(out_path, vcf_field)
             field_partition_path = field_path / f"p{partition_index}"
+            # Should be robust to running explode_partition twice.
+            field_partition_path.mkdir(exist_ok=True)
             transformer = VcfValueTransformer.factory(vcf_field, num_samples)
             self.field_writers[vcf_field.full_name] = IcfFieldWriter(
                 vcf_field,
@@ -824,13 +833,7 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-# TODO rename to IntermediateColumnarFormat and move to icf.py
-
-
 class IntermediateColumnarFormat(collections.abc.Mapping):
-    # TODO Check if other compressors would give reasonable compression
-    # with significantly faster times
-
     def __init__(self, path):
         self.path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
@@ -846,7 +849,7 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             partition.num_records for partition in self.metadata.partitions
         ]
         # Allow us to find which partition a given record is in
-        self.partition_record_index = np.cumsum([0] + partition_num_records)
+        self.partition_record_index = np.cumsum([0, *partition_num_records])
         for field in self.metadata.fields:
             self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
         logger.info(
@@ -856,7 +859,8 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
+            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )
 
@@ -922,9 +926,12 @@ class IntermediateColumnarFormatWriter:
         worker_processes=1,
         target_num_partitions=None,
         show_progress=False,
+        compressor=None,
     ):
         if self.path.exists():
-
+            raise ValueError("ICF path already exists")
+        if compressor is None:
+            compressor = ICF_DEFAULT_COMPRESSOR
         vcfs = [pathlib.Path(vcf) for vcf in vcfs]
         target_num_partitions = max(target_num_partitions, len(vcfs))
 
@@ -934,39 +941,38 @@ class IntermediateColumnarFormatWriter:
             worker_processes=worker_processes,
             show_progress=show_progress,
             target_num_partitions=target_num_partitions,
-            column_chunk_size=column_chunk_size,
         )
         self.metadata = icf_metadata
+        self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
+        self.metadata.compressor = compressor.get_config()
+        self.metadata.column_chunk_size = column_chunk_size
+        # Bare minimum here for provenance - would be nice to include versions of key
+        # dependencies as well.
+        self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
 
         self.mkdirs()
 
         # Note: this is needed for the current version of the vcfzarr spec, but it's
-        # probably
+        # probably going to be dropped.
         # https://github.com/pystatgen/vcf-zarr-spec/issues/15
         # May be useful to keep lying around still though?
-        logger.info(f"Writing VCF header")
+        logger.info("Writing VCF header")
         with open(self.path / "header.txt", "w") as f:
             f.write(header)
 
-        logger.info(f"Writing WIP metadata")
+        logger.info("Writing WIP metadata")
        with open(self.wip_path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
         return self.num_partitions
 
     def mkdirs(self):
-
-        logger.info(
-            f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
-        )
+        num_dirs = len(self.metadata.fields)
+        logger.info(f"Creating {num_dirs} field directories")
         self.path.mkdir()
         self.wip_path.mkdir()
         for field in self.metadata.fields:
             col_path = get_vcf_field_path(self.path, field)
-            logger.debug(f"Make directories for {field.full_name} at {col_path}")
             col_path.mkdir(parents=True)
-            for j in range(self.num_partitions):
-                part_path = col_path / f"p{j}"
-                part_path.mkdir()
 
     def load_partition_summaries(self):
         summaries = []
@@ -982,13 +988,14 @@ class IntermediateColumnarFormatWriter:
                 not_found.append(j)
         if len(not_found) > 0:
             raise FileNotFoundError(
-                f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+                f"Partition metadata not found for {len(not_found)}"
+                f" partitions: {not_found}"
             )
         return summaries
 
     def load_metadata(self):
         if self.metadata is None:
-            with open(self.wip_path / "metadata.json", "r") as f:
+            with open(self.wip_path / "metadata.json") as f:
                 self.metadata = IcfMetadata.fromdict(json.load(f))
 
     def process_partition(self, partition_index):
@@ -1037,12 +1044,14 @@ class IntermediateColumnarFormatWriter:
                    for field in format_fields:
                         val = variant.format(field.name)
                         tcw.append(field.full_name, val)
-                    # Note: an issue with updating the progress per variant here like
-                    # is that we get a significant pause at the end of the counter
-                    # all the "small" fields get flushed. Possibly not much to be
+                    # Note: an issue with updating the progress per variant here like
+                    # this is that we get a significant pause at the end of the counter
+                    # while all the "small" fields get flushed. Possibly not much to be
+                    # done about it.
                    core.update_progress(1)
             logger.info(
-                f"Finished reading VCF for partition {partition_index}, flushing buffers"
+                f"Finished reading VCF for partition {partition_index}, "
+                f"flushing buffers"
             )
 
         partition_metadata = {
@@ -1124,30 +1133,32 @@ class IntermediateColumnarFormatWriter:
         for summary in partition_summaries:
             field.summary.update(summary["field_summaries"][field.full_name])
 
-        logger.info(f"Finalising metadata")
+        logger.info("Finalising metadata")
         with open(self.path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
 
-        logger.debug(f"Removing WIP directory")
+        logger.debug("Removing WIP directory")
         shutil.rmtree(self.wip_path)
 
 
 def explode(
-    vcfs,
     icf_path,
+    vcfs,
     *,
     column_chunk_size=16,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
-
+    writer.init(
         vcfs,
         # Heuristic to get reasonable worker utilisation with lumpy partition sizing
         target_num_partitions=max(1, worker_processes * 4),
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
     writer.explode(worker_processes=worker_processes, show_progress=show_progress)
     writer.finalise()
@@ -1162,6 +1173,7 @@ def explode_init(
     target_num_partitions=1,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
     return writer.init(
@@ -1170,6 +1182,7 @@ def explode_init(
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
 
 
@@ -1209,20 +1222,25 @@ class ZarrColumnSpec:
     dtype: str
     shape: tuple
     chunks: tuple
-    dimensions: list
+    dimensions: tuple
     description: str
     vcf_field: str
-    compressor: dict
-    filters: list
-    # TODO add filters
+    compressor: dict
+    filters: list
 
     def __post_init__(self):
+        # Ensure these are tuples for ease of comparison and consistency
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
-
-
-
+
+    @staticmethod
+    def new(**kwargs):
+        spec = ZarrColumnSpec(
+            **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
+        )
+        spec._choose_compressor_settings()
+        return spec
 
     def _choose_compressor_settings(self):
         """
@@ -1298,7 +1316,7 @@ class VcfZarrSchema:
     def fixed_field_spec(
         name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
     ):
-        return ZarrColumnSpec(
+        return ZarrColumnSpec.new(
             vcf_field=vcf_field,
             name=name,
             dtype=dtype,
@@ -1366,14 +1384,23 @@ class VcfZarrSchema:
             if field.category == "FORMAT":
                 prefix = "call_"
                 shape.append(n)
-                chunks.append(samples_chunk_size)
+                chunks.append(samples_chunk_size)
                 dimensions.append("samples")
             # TODO make an option to add in the empty extra dimension
             if field.summary.max_number > 1:
                 shape.append(field.summary.max_number)
-                dimensions
+                # TODO we should really be checking this to see if the named dimensions
+                # are actually correct.
+                if field.vcf_number == "R":
+                    dimensions.append("alleles")
+                elif field.vcf_number == "A":
+                    dimensions.append("alt_alleles")
+                elif field.vcf_number == "G":
+                    dimensions.append("genotypes")
+                else:
+                    dimensions.append(f"{field.category}_{field.name}_dim")
             variable_name = prefix + field.name
-            colspec = ZarrColumnSpec(
+            colspec = ZarrColumnSpec.new(
                 vcf_field=field.full_name,
                 name=variable_name,
                 dtype=field.smallest_dtype(),
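The new branch derives a Zarr dimension name from the field's declared VCF Number: R means one value per allele (REF included), A one per alternate allele, and G one per possible genotype; anything else falls back to a synthetic per-field name. Restated compactly:

    def extra_dimension(category, name, vcf_number):
        # Condensed from VcfZarrSchema in this diff; illustrative only.
        return {
            "R": "alleles",
            "A": "alt_alleles",
            "G": "genotypes",
        }.get(vcf_number, f"{category}_{name}_dim")

    print(extra_dimension("FORMAT", "AD", "R"))  # alleles
    print(extra_dimension("INFO", "AC", "A"))    # alt_alleles
    print(extra_dimension("FORMAT", "PL", "G"))  # genotypes
    print(extra_dimension("FORMAT", "XY", "2"))  # FORMAT_XY_dim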
@@ -1391,7 +1418,7 @@ class VcfZarrSchema:
         dimensions = ["variants", "samples"]
 
         colspecs.append(
-            ZarrColumnSpec(
+            ZarrColumnSpec.new(
                 vcf_field=None,
                 name="call_genotype_phased",
                 dtype="bool",
@@ -1404,7 +1431,7 @@ class VcfZarrSchema:
         shape += [ploidy]
         dimensions += ["ploidy"]
         colspecs.append(
-            ZarrColumnSpec(
+            ZarrColumnSpec.new(
                 vcf_field=None,
                 name="call_genotype",
                 dtype=gt_field.smallest_dtype(),
@@ -1415,7 +1442,7 @@ class VcfZarrSchema:
             )
         )
         colspecs.append(
-            ZarrColumnSpec(
+            ZarrColumnSpec.new(
                 vcf_field=None,
                 name="call_genotype_mask",
                 dtype="bool",
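Switching every call site to ZarrColumnSpec.new puts the compressor/filters defaults, and the _choose_compressor_settings pass, in one place instead of repeating them at each constructor call. The pattern in miniature, with a stand-in value for DEFAULT_ZARR_COMPRESSOR, whose definition is not shown in this diff:

    import dataclasses
    import numcodecs

    DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd")  # stand-in

    @dataclasses.dataclass
    class ColumnSpec:
        name: str
        dtype: str
        compressor: dict
        filters: list

        @staticmethod
        def new(**kwargs):
            # Call sites supply only what varies; defaults live here.
            return ColumnSpec(
                **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
            )

    spec = ColumnSpec.new(name="call_genotype", dtype="i1")
    print(spec.compressor["cname"])  # zstd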
@@ -1480,16 +1507,30 @@ class EncodingWork:
     memory: int = 0
 
 
+def parse_max_memory(max_memory):
+    if max_memory is None:
+        # Effectively unbounded
+        return 2**63
+    if isinstance(max_memory, str):
+        max_memory = humanfriendly.parse_size(max_memory)
+    logger.info(f"Set memory budget to {display_size(max_memory)}")
+    return max_memory
+
+
 class VcfZarrWriter:
-    def __init__(self, path, icf, schema):
+    def __init__(self, path, icf, schema, dimension_separator=None):
         self.path = pathlib.Path(path)
         self.icf = icf
         self.schema = schema
+        # Default to using nested directories following the Zarr v3 default.
+        # This seems to require version 2.17+ to work properly
+        self.dimension_separator = (
+            "/" if dimension_separator is None else dimension_separator
+        )
         store = zarr.DirectoryStore(self.path)
         self.root = zarr.group(store=store)
 
     def init_array(self, variable):
-        # print("CREATE", variable)
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
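parse_max_memory is what lets the CLI drop type=int from --max-memory: the budget is now given as a humanfriendly size string rather than a bare MiB count. Note that humanfriendly.parse_size treats "G" as decimal gigabytes and "GiB" as binary, which matters when sizing budgets:

    import humanfriendly

    print(humanfriendly.parse_size("10G"))    # 10000000000
    print(humanfriendly.parse_size("10GiB"))  # 10737418240
    print(2**63)  # the "effectively unbounded" sentinel used above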
@@ -1501,7 +1542,9 @@ class VcfZarrWriter:
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
+            dimension_separator=self.dimension_separator,
         )
+        # Dimension names are part of the spec in Zarr v3
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
 
     def get_array(self, name):
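Passing dimension_separator="/" changes how chunk keys map to storage: a chunk of a 2-D array is stored under a nested key like 0/0 rather than a flat 0.0, which matches the Zarr v3 layout and keeps directory listings small for heavily chunked arrays. A quick standalone check (plain zarr-python 2.17+, not bio2zarr code):

    import zarr

    for sep in (".", "/"):
        store = zarr.MemoryStore()
        a = zarr.open_array(
            store, mode="w", shape=(4, 4), chunks=(2, 2),
            dtype="i4", dimension_separator=sep,
        )
        a[:] = 1
        print(sep, sorted(k for k in store if not k.startswith(".")))
    # . ['0.0', '0.1', '1.0', '1.1']
    # / ['0/0', '0/1', '1/0', '1/1']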
@@ -1593,7 +1636,9 @@ class VcfZarrWriter:
                 try:
                     var_filter.buff[j, lookup[f]] = True
                 except KeyError:
-                    raise ValueError(f"Filter '{f}' was not defined in the header.")
+                    raise ValueError(
+                        f"Filter '{f}' was not defined " f"in the header."
+                    ) from None
         var_filter.flush()
         logger.debug(f"Encoded FILTERS slice {start}:{stop}")
 
@@ -1639,6 +1684,7 @@ class VcfZarrWriter:
             "contig_length",
             self.schema.contig_length,
             dtype=np.int64,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         return {v: j for j, v in enumerate(self.schema.contig_id)}
@@ -1661,8 +1707,6 @@ class VcfZarrWriter:
             self.init_array(column)
 
     def finalise(self):
-        # for column in self.schema.columns.values():
-        #     self.finalise_array(column)
         zarr.consolidate_metadata(self.path)
 
     def encode(
@@ -1672,12 +1716,7 @@ class VcfZarrWriter:
         show_progress=False,
         max_memory=None,
     ):
-        if max_memory is None:
-            # Unbounded
-            max_memory = 2**63
-        else:
-            # Value is specified in Mibibytes
-            max_memory *= 2**20  # NEEDS TEST
+        max_memory = parse_max_memory(max_memory)
 
         # TODO this will move into the setup logic later when we're making it possible
         # to split the work by slice
@@ -1702,7 +1741,8 @@ class VcfZarrWriter:
             variant_chunk_size = array.blocks[0].nbytes
             encoding_memory_requirements[col.name] = variant_chunk_size
             logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+                f"{col.name} requires at least {display_size(variant_chunk_size)} "
+                f"per worker"
             )
             total_bytes += array.nbytes
 
@@ -1764,8 +1804,8 @@ class VcfZarrWriter:
 
         # Fail early if we can't fit a particular column into memory
         for wp in work:
-            if wp.memory
-                raise ValueError(
+            if wp.memory > max_memory:
+                raise ValueError(
                     f"Insufficient memory for {wp.columns}: "
                     f"{display_size(wp.memory)} > {display_size(max_memory)}"
                 )
@@ -1778,6 +1818,8 @@ class VcfZarrWriter:
         )
 
         used_memory = 0
+        # We need to keep some bounds on the queue size or the memory bounds algorithm
+        # below doesn't really work.
         max_queued = 4 * max(1, worker_processes)
         encoded_slices = collections.Counter()
 
@@ -1804,10 +1846,15 @@ class VcfZarrWriter:
                     self.finalise_array(column)
 
         for wp in work:
-
+            while (
                 used_memory + wp.memory > max_memory
                 or len(future_to_work) > max_queued
             ):
+                logger.debug(
+                    f"Wait: mem_required={used_memory + wp.memory} "
+                    f"max_mem={max_memory} queued={len(future_to_work)} "
+                    f"max_queued={max_queued}"
+                )
                 service_completed_futures()
             future = pwm.submit(wp.func, wp.start, wp.stop)
             used_memory += wp.memory
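The rewritten loop makes the memory-bounded scheduling explicit: before each submission it waits until the summed memory estimates of in-flight work fit under max_memory and fewer than max_queued futures are outstanding (without the queue cap, a long run of tiny jobs would let the queue grow unboundedly and make the accounting meaningless). The shape of the pattern, reduced to essentials (hypothetical names, not the bio2zarr implementation):

    import concurrent.futures as cf

    def submit_bounded(executor, jobs, max_memory, max_queued):
        # jobs: iterable of (func, cost) pairs, cost being a memory estimate
        used, in_flight = 0, {}
        for func, cost in jobs:
            while used + cost > max_memory or len(in_flight) > max_queued:
                done = next(cf.as_completed(in_flight))  # block for a finisher
                done.result()
                used -= in_flight.pop(done)
            in_flight[executor.submit(func)] = cost
            used += cost
        for fut in cf.as_completed(in_flight):
            fut.result()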
@@ -1832,6 +1879,7 @@ def encode(
     variants_chunk_size=None,
     samples_chunk_size=None,
     max_v_chunks=None,
+    dimension_separator=None,
     max_memory=None,
     worker_processes=1,
     show_progress=False,
@@ -1849,13 +1897,13 @@ def encode(
            raise ValueError(
                 "Cannot specify schema along with chunk sizes"
             )  # NEEDS TEST
-        with open(schema_path, "r") as f:
+        with open(schema_path) as f:
             schema = VcfZarrSchema.fromjson(f.read())
     zarr_path = pathlib.Path(zarr_path)
     if zarr_path.exists():
         logger.warning(f"Deleting existing {zarr_path}")
         shutil.rmtree(zarr_path)
-    vzw = VcfZarrWriter(zarr_path, icf, schema)
+    vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
     vzw.init()
     vzw.encode(
         max_v_chunks=max_v_chunks,
@@ -1876,10 +1924,11 @@ def convert(
     show_progress=False,
     # TODO add arguments to control location of tmpdir
 ):
-    with tempfile.TemporaryDirectory(prefix="
+    with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
+        if_dir = pathlib.Path(tmp) / "if"
         explode(
-            vcfs,
             if_dir,
+            vcfs,
             worker_processes=worker_processes,
             show_progress=show_progress,
         )
@@ -1929,7 +1978,7 @@ def assert_all_fill(zarr_val, vcf_type):
     elif vcf_type == "Float":
         assert_all_fill_float(zarr_val)
     else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015
 
 
 def assert_all_missing(zarr_val, vcf_type):
@@ -1942,7 +1991,7 @@ def assert_all_missing(zarr_val, vcf_type):
     elif vcf_type == "Float":
         assert_all_missing_float(zarr_val)
     else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015
 
 
 def assert_info_val_missing(zarr_val, vcf_type):
@@ -2081,7 +2130,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
             assert vid[j] == ("." if row.ID is None else row.ID)
             assert allele[j, 0] == row.REF
             k = len(row.ALT)
-            nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
+            nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
             assert np.all(allele[j, k + 1 :] == "")
             # TODO FILTERS
bio2zarr/vcf_utils.py
CHANGED
@@ -1,14 +1,14 @@
-from typing import IO, Any, Dict, Optional, Sequence, Union
 import contextlib
-import struct
-import pathlib
 import gzip
-from dataclasses import dataclass
 import os
+import pathlib
+import struct
+from dataclasses import dataclass
+from typing import IO, Any, Dict, Optional, Sequence, Union
 
-import numpy as np
 import cyvcf2
 import humanfriendly
+import numpy as np
 
 from bio2zarr.typing import PathType
 
@@ -38,7 +38,8 @@ def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> Any
     fmt : str
         A Python `struct` format string.
     nodata : Optional[Any], optional
-        The value to return in case there is no further data in the stream, by default None
+        The value to return in case there is no further data in the stream,
+        by default None
 
     Returns
     -------
@@ -277,7 +278,8 @@ class TabixIndex:
         # Create file offsets for each element in the linear index
         file_offsets = np.array([get_file_offset(vfp) for vfp in linear_index])
 
-        # Calculate corresponding contigs and positions or each element in the linear index
+        # Calculate corresponding contigs and positions or each element in
+        # the linear index
         contig_indexes = np.hstack(
             [np.full(len(li), i) for (i, li) in enumerate(linear_indexes)]
         )
@@ -433,6 +435,22 @@ class IndexedVcf(contextlib.AbstractContextManager):
                 if var.POS >= start:
                     yield var
 
+    def _filter_empty(self, regions):
+        """
+        Return all regions in the specified list that have one or more records.
+
+        Sometimes with Tabix indexes these seem to crop up:
+
+        - https://github.com/sgkit-dev/bio2zarr/issues/45
+        - https://github.com/sgkit-dev/bio2zarr/issues/120
+        """
+        ret = []
+        for region in regions:
+            variants = self.variants(region)
+            if next(variants, None) is not None:
+                ret.append(region)
+        return ret
+
     def partition_into_regions(
         self,
         num_parts: Optional[int] = None,
@@ -509,4 +527,4 @@ class IndexedVcf(contextlib.AbstractContextManager):
             if self.index.record_counts[ri] > 0:
                 regions.append(Region(self.sequence_names[ri]))
 
-        return regions
+        return self._filter_empty(regions)
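_filter_empty probes each region with next(iterator, None), which pulls at most one variant, so the cost of dropping an empty index bin is a single fetch rather than a full scan. The idiom in isolation:

    def has_records(iterator):
        # Consumes at most one element; True if the iterator yields anything.
        return next(iterator, None) is not None

    print(has_records(iter([])))        # False
    print(has_records(x for x in [1]))  # True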
{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.3
+Version: 0.0.5
 Summary: Convert bioinformatics data to Zarr
 Home-page: https://github.com/pystatgen/bio2zarr
 Author: sgkit Developers
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: numpy
-Requires-Dist: zarr
+Requires-Dist: zarr >=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
 Requires-Dist: tqdm
bio2zarr-0.0.5.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=hO4vV-kPFgsYq0NQwG2r-WkserPL27oqae_tUvNB7yE,527
+bio2zarr/_version.py,sha256=EJB7__SNK9kQS_SWZB_U4DHJ3P8ftF6etZEihTYnuXE,411
+bio2zarr/cli.py,sha256=k63xex-tQkogAlJ3N68Ikx8LqZrksXbZB2s6Z7h-zXc,11446
+bio2zarr/core.py,sha256=reF9elN1dwmCoXXLgci-y5pXmAm3fTntmomHTRcG54g,8127
+bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+bio2zarr/vcf.py,sha256=GFnwR2YP-cHU4tfHloRjyiBK9-xXDgXcAM_tz-w2qck,74324
+bio2zarr/vcf_utils.py,sha256=r3NQXxWK1SYU7CcwDzSWXdX5Q8Ixk7gdCTEiFPzfUAk,17307
+bio2zarr-0.0.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.0.5.dist-info/METADATA,sha256=SasGYcKSRb7NqnYR98ODFvPEMdBNdpxWx5gqOt038QU,1077
+bio2zarr-0.0.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+bio2zarr-0.0.5.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
+bio2zarr-0.0.5.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.0.5.dist-info/RECORD,,
bio2zarr-0.0.3.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
-bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
-bio2zarr/_version.py,sha256=hB095avW4HuDZxn8qPHRG1UMzSSonb8ZDAsLxt9hmk8,411
-bio2zarr/cli.py,sha256=N_vEFj730p_TL7Dk9m9T3ceAhVV58BMYRDmBmoeKH7A,10766
-bio2zarr/core.py,sha256=sBlWmHjcb7tAn_7WQRBdrbGcEd_lT_3HTQ_JbzomVMg,8111
-bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
-bio2zarr/vcf.py,sha256=g2TqH9Lbp4Ds8kjOnjvHvoMAgnG6Kx8pKPN1bqBKKIQ,72201
-bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
-bio2zarr-0.0.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.0.3.dist-info/METADATA,sha256=dc2y5xrnkcvD1qmKGFL5GrsbM1_tiIlAYB2GrAlLunM,1106
-bio2zarr-0.0.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-bio2zarr-0.0.3.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
-bio2zarr-0.0.3.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.0.3.dist-info/RECORD,,
{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/LICENSE
File without changes

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/WHEEL
File without changes

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/entry_points.txt
File without changes

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.5.dist-info}/top_level.txt
File without changes