bio2zarr 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +89 -22
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +314 -189
- bio2zarr/tskit.py +301 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +594 -112
- bio2zarr/vcf_utils.py +12 -11
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +544 -708
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/METADATA +17 -6
- bio2zarr-0.1.6.dist-info/RECORD +21 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.5.dist-info/RECORD +0 -21
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/top_level.txt +0 -0
|
@@ -1,41 +1,29 @@
|
|
|
1
|
-
import
|
|
1
|
+
import abc
|
|
2
2
|
import dataclasses
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
-
import os.path
|
|
7
6
|
import pathlib
|
|
8
7
|
import shutil
|
|
9
|
-
import tempfile
|
|
10
8
|
|
|
11
|
-
import humanfriendly
|
|
12
9
|
import numcodecs
|
|
13
10
|
import numpy as np
|
|
14
11
|
import zarr
|
|
15
12
|
|
|
16
|
-
from bio2zarr
|
|
17
|
-
|
|
18
|
-
from .. import constants, core, provenance
|
|
19
|
-
from . import icf
|
|
13
|
+
from bio2zarr import constants, core, provenance, zarr_utils
|
|
20
14
|
|
|
21
15
|
logger = logging.getLogger(__name__)
|
|
22
16
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
if not path.exists():
|
|
27
|
-
raise ValueError(f"Path not found: {path}")
|
|
28
|
-
if (path / "metadata.json").exists():
|
|
29
|
-
obj = icf.IntermediateColumnarFormat(path)
|
|
30
|
-
# NOTE: this is too strict, we should support more general Zarrs, see #276
|
|
31
|
-
elif (path / ".zmetadata").exists():
|
|
32
|
-
obj = VcfZarr(path)
|
|
33
|
-
else:
|
|
34
|
-
raise ValueError(f"{path} not in ICF or VCF Zarr format")
|
|
35
|
-
return obj.summary_table()
|
|
36
|
-
|
|
37
|
-
|
|
17
|
+
ZARR_SCHEMA_FORMAT_VERSION = "0.6"
|
|
18
|
+
DEFAULT_VARIANT_CHUNK_SIZE = 1000
|
|
19
|
+
DEFAULT_SAMPLE_CHUNK_SIZE = 10_000
|
|
38
20
|
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
|
|
21
|
+
DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs.Blosc(
|
|
22
|
+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
|
|
23
|
+
)
|
|
24
|
+
DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs.Blosc(
|
|
25
|
+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
|
|
26
|
+
)
|
|
39
27
|
|
|
40
28
|
_fixed_field_descriptions = {
|
|
41
29
|
"variant_contig": "An identifier from the reference genome or an angle-bracketed ID"
|
|
@@ -49,128 +37,251 @@ _fixed_field_descriptions = {
|
|
|
49
37
|
}
|
|
50
38
|
|
|
51
39
|
|
|
40
|
+
@dataclasses.dataclass
|
|
41
|
+
class VariantData:
|
|
42
|
+
"""Represents variant data returned by iter_alleles_and_genotypes."""
|
|
43
|
+
|
|
44
|
+
variant_length: int
|
|
45
|
+
alleles: np.ndarray
|
|
46
|
+
genotypes: np.ndarray
|
|
47
|
+
phased: np.ndarray
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Source(abc.ABC):
|
|
51
|
+
@property
|
|
52
|
+
@abc.abstractmethod
|
|
53
|
+
def path(self):
|
|
54
|
+
pass
|
|
55
|
+
|
|
56
|
+
@property
|
|
57
|
+
@abc.abstractmethod
|
|
58
|
+
def num_records(self):
|
|
59
|
+
pass
|
|
60
|
+
|
|
61
|
+
@property
|
|
62
|
+
@abc.abstractmethod
|
|
63
|
+
def num_samples(self):
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
@property
|
|
67
|
+
@abc.abstractmethod
|
|
68
|
+
def samples(self):
|
|
69
|
+
pass
|
|
70
|
+
|
|
71
|
+
@property
|
|
72
|
+
def contigs(self):
|
|
73
|
+
return None
|
|
74
|
+
|
|
75
|
+
@property
|
|
76
|
+
def filters(self):
|
|
77
|
+
return None
|
|
78
|
+
|
|
79
|
+
@property
|
|
80
|
+
def root_attrs(self):
|
|
81
|
+
return {}
|
|
82
|
+
|
|
83
|
+
@abc.abstractmethod
|
|
84
|
+
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
|
|
85
|
+
pass
|
|
86
|
+
|
|
87
|
+
def iter_id(self, start, stop):
|
|
88
|
+
return
|
|
89
|
+
|
|
90
|
+
def iter_contig(self, start, stop):
|
|
91
|
+
return
|
|
92
|
+
|
|
93
|
+
@abc.abstractmethod
|
|
94
|
+
def iter_field(self, field_name, shape, start, stop):
|
|
95
|
+
pass
|
|
96
|
+
|
|
97
|
+
@abc.abstractmethod
|
|
98
|
+
def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
|
|
99
|
+
pass
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclasses.dataclass
|
|
103
|
+
class VcfZarrDimension:
|
|
104
|
+
size: int
|
|
105
|
+
chunk_size: int
|
|
106
|
+
|
|
107
|
+
def asdict(self):
|
|
108
|
+
return dataclasses.asdict(self)
|
|
109
|
+
|
|
110
|
+
@classmethod
|
|
111
|
+
def fromdict(cls, d):
|
|
112
|
+
return cls(**d)
|
|
113
|
+
|
|
114
|
+
@classmethod
|
|
115
|
+
def unchunked(cls, size):
|
|
116
|
+
return cls(size, max(size, 1))
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def standard_dimensions(
|
|
120
|
+
*,
|
|
121
|
+
variants_size,
|
|
122
|
+
samples_size,
|
|
123
|
+
variants_chunk_size=None,
|
|
124
|
+
samples_chunk_size=None,
|
|
125
|
+
alleles_size=None,
|
|
126
|
+
filters_size=None,
|
|
127
|
+
ploidy_size=None,
|
|
128
|
+
genotypes_size=None,
|
|
129
|
+
):
|
|
130
|
+
"""
|
|
131
|
+
Returns a dictionary mapping dimension names to definition for the standard
|
|
132
|
+
fields in a VCF.
|
|
133
|
+
"""
|
|
134
|
+
if variants_chunk_size is None:
|
|
135
|
+
variants_chunk_size = max(1, min(variants_size, DEFAULT_VARIANT_CHUNK_SIZE))
|
|
136
|
+
if samples_chunk_size is None:
|
|
137
|
+
samples_chunk_size = max(1, min(samples_size, DEFAULT_SAMPLE_CHUNK_SIZE))
|
|
138
|
+
|
|
139
|
+
dimensions = {
|
|
140
|
+
"variants": VcfZarrDimension(variants_size, variants_chunk_size),
|
|
141
|
+
"samples": VcfZarrDimension(samples_size, samples_chunk_size),
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
if alleles_size is not None:
|
|
145
|
+
dimensions["alleles"] = VcfZarrDimension.unchunked(alleles_size)
|
|
146
|
+
if alleles_size > 1:
|
|
147
|
+
dimensions["alt_alleles"] = VcfZarrDimension.unchunked(alleles_size - 1)
|
|
148
|
+
|
|
149
|
+
if filters_size is not None:
|
|
150
|
+
dimensions["filters"] = VcfZarrDimension.unchunked(filters_size)
|
|
151
|
+
|
|
152
|
+
if ploidy_size is not None:
|
|
153
|
+
dimensions["ploidy"] = VcfZarrDimension.unchunked(ploidy_size)
|
|
154
|
+
|
|
155
|
+
if genotypes_size is not None:
|
|
156
|
+
dimensions["genotypes"] = VcfZarrDimension.unchunked(genotypes_size)
|
|
157
|
+
|
|
158
|
+
return dimensions
|
|
159
|
+
|
|
160
|
+
|
|
52
161
|
@dataclasses.dataclass
|
|
53
162
|
class ZarrArraySpec:
|
|
54
163
|
name: str
|
|
55
164
|
dtype: str
|
|
56
|
-
shape: tuple
|
|
57
|
-
chunks: tuple
|
|
58
165
|
dimensions: tuple
|
|
59
166
|
description: str
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
167
|
+
compressor: dict = None
|
|
168
|
+
filters: list = None
|
|
169
|
+
source: str = None
|
|
63
170
|
|
|
64
171
|
def __post_init__(self):
|
|
65
172
|
if self.name in _fixed_field_descriptions:
|
|
66
173
|
self.description = self.description or _fixed_field_descriptions[self.name]
|
|
67
174
|
|
|
68
|
-
# Ensure these are tuples for ease of comparison and consistency
|
|
69
|
-
self.shape = tuple(self.shape)
|
|
70
|
-
self.chunks = tuple(self.chunks)
|
|
71
175
|
self.dimensions = tuple(self.dimensions)
|
|
72
|
-
self.filters = tuple(self.filters)
|
|
176
|
+
self.filters = tuple(self.filters) if self.filters is not None else None
|
|
73
177
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
|
|
178
|
+
def get_shape(self, schema):
|
|
179
|
+
return schema.get_shape(self.dimensions)
|
|
180
|
+
|
|
181
|
+
def get_chunks(self, schema):
|
|
182
|
+
return schema.get_chunks(self.dimensions)
|
|
183
|
+
|
|
184
|
+
def get_chunk_nbytes(self, schema):
|
|
185
|
+
element_size = np.dtype(self.dtype).itemsize
|
|
186
|
+
chunks = self.get_chunks(schema)
|
|
187
|
+
shape = self.get_shape(schema)
|
|
188
|
+
|
|
189
|
+
# Calculate actual chunk size accounting for dimension limits
|
|
190
|
+
items = 1
|
|
191
|
+
for i, chunk_size in enumerate(chunks):
|
|
192
|
+
items *= min(chunk_size, shape[i])
|
|
193
|
+
|
|
194
|
+
# Include sizes for extra dimensions (if any)
|
|
195
|
+
if len(shape) > len(chunks):
|
|
196
|
+
for size in shape[len(chunks) :]:
|
|
197
|
+
items *= size
|
|
198
|
+
|
|
199
|
+
return element_size * items
|
|
81
200
|
|
|
82
201
|
@staticmethod
|
|
83
202
|
def from_field(
|
|
84
203
|
vcf_field,
|
|
204
|
+
schema,
|
|
85
205
|
*,
|
|
86
|
-
num_variants,
|
|
87
|
-
num_samples,
|
|
88
|
-
variants_chunk_size,
|
|
89
|
-
samples_chunk_size,
|
|
90
206
|
array_name=None,
|
|
207
|
+
compressor=None,
|
|
208
|
+
filters=None,
|
|
91
209
|
):
|
|
92
|
-
shape = [num_variants]
|
|
93
210
|
prefix = "variant_"
|
|
94
211
|
dimensions = ["variants"]
|
|
95
|
-
chunks = [variants_chunk_size]
|
|
96
212
|
if vcf_field.category == "FORMAT":
|
|
97
213
|
prefix = "call_"
|
|
98
|
-
shape.append(num_samples)
|
|
99
|
-
chunks.append(samples_chunk_size)
|
|
100
214
|
dimensions.append("samples")
|
|
101
215
|
if array_name is None:
|
|
102
216
|
array_name = prefix + vcf_field.name
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
217
|
+
|
|
218
|
+
max_number = vcf_field.max_number
|
|
219
|
+
if vcf_field.vcf_number == "R":
|
|
220
|
+
max_alleles = schema.dimensions["alleles"].size
|
|
221
|
+
if max_number > max_alleles:
|
|
222
|
+
raise ValueError(
|
|
223
|
+
f"Max number of values {max_number} exceeds max alleles "
|
|
224
|
+
f"{max_alleles} for {vcf_field.full_name}"
|
|
225
|
+
)
|
|
226
|
+
if max_alleles > 0:
|
|
110
227
|
dimensions.append("alleles")
|
|
111
|
-
|
|
228
|
+
elif vcf_field.vcf_number == "A":
|
|
229
|
+
max_alt_alleles = schema.dimensions["alt_alleles"].size
|
|
230
|
+
if max_number > max_alt_alleles:
|
|
231
|
+
raise ValueError(
|
|
232
|
+
f"Max number of values {max_number} exceeds max alt alleles "
|
|
233
|
+
f"{max_alt_alleles} for {vcf_field.full_name}"
|
|
234
|
+
)
|
|
235
|
+
if max_alt_alleles > 0:
|
|
112
236
|
dimensions.append("alt_alleles")
|
|
113
|
-
|
|
237
|
+
elif vcf_field.vcf_number == "G":
|
|
238
|
+
max_genotypes = schema.dimensions["genotypes"].size
|
|
239
|
+
if max_number > max_genotypes:
|
|
240
|
+
raise ValueError(
|
|
241
|
+
f"Max number of values {max_number} exceeds max genotypes "
|
|
242
|
+
f"{max_genotypes} for {vcf_field.full_name}"
|
|
243
|
+
)
|
|
244
|
+
if max_genotypes > 0:
|
|
114
245
|
dimensions.append("genotypes")
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
246
|
+
elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
|
|
247
|
+
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
|
|
248
|
+
if dimensions[-1] not in schema.dimensions:
|
|
249
|
+
schema.dimensions[dimensions[-1]] = VcfZarrDimension.unchunked(
|
|
250
|
+
vcf_field.max_number
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
return ZarrArraySpec(
|
|
254
|
+
source=vcf_field.full_name,
|
|
119
255
|
name=array_name,
|
|
120
256
|
dtype=vcf_field.smallest_dtype(),
|
|
121
|
-
shape=shape,
|
|
122
|
-
chunks=chunks,
|
|
123
257
|
dimensions=dimensions,
|
|
124
258
|
description=vcf_field.description,
|
|
259
|
+
compressor=compressor,
|
|
260
|
+
filters=filters,
|
|
125
261
|
)
|
|
126
262
|
|
|
127
|
-
def
|
|
128
|
-
"""
|
|
129
|
-
Choose compressor and filter settings based on the size and
|
|
130
|
-
type of the array, plus some hueristics from observed properties
|
|
131
|
-
of VCFs.
|
|
132
|
-
|
|
133
|
-
See https://github.com/pystatgen/bio2zarr/discussions/74
|
|
134
|
-
"""
|
|
135
|
-
# Default is to not shuffle, because autoshuffle isn't recognised
|
|
136
|
-
# by many Zarr implementations, and shuffling can lead to worse
|
|
137
|
-
# performance in some cases anyway. Turning on shuffle should be a
|
|
138
|
-
# deliberate choice.
|
|
139
|
-
shuffle = numcodecs.Blosc.NOSHUFFLE
|
|
140
|
-
if self.name == "call_genotype" and self.dtype == "i1":
|
|
141
|
-
# call_genotype gets BITSHUFFLE by default as it gets
|
|
142
|
-
# significantly better compression (at a cost of slower
|
|
143
|
-
# decoding)
|
|
144
|
-
shuffle = numcodecs.Blosc.BITSHUFFLE
|
|
145
|
-
elif self.dtype == "bool":
|
|
146
|
-
shuffle = numcodecs.Blosc.BITSHUFFLE
|
|
147
|
-
|
|
148
|
-
self.compressor["shuffle"] = shuffle
|
|
149
|
-
|
|
150
|
-
@property
|
|
151
|
-
def chunk_nbytes(self):
|
|
263
|
+
def chunk_nbytes(self, schema):
|
|
152
264
|
"""
|
|
153
265
|
Returns the nbytes for a single chunk in this array.
|
|
154
266
|
"""
|
|
155
267
|
items = 1
|
|
156
268
|
dim = 0
|
|
157
|
-
for chunk_size in self.
|
|
158
|
-
size = min(chunk_size, self.
|
|
269
|
+
for chunk_size in self.get_chunks(schema):
|
|
270
|
+
size = min(chunk_size, self.get_shape(schema)[dim])
|
|
159
271
|
items *= size
|
|
160
272
|
dim += 1
|
|
161
273
|
# Include sizes for extra dimensions.
|
|
162
|
-
for size in self.
|
|
274
|
+
for size in self.get_shape(schema)[dim:]:
|
|
163
275
|
items *= size
|
|
164
276
|
dt = np.dtype(self.dtype)
|
|
165
277
|
return items * dt.itemsize
|
|
166
278
|
|
|
167
|
-
|
|
168
|
-
def variant_chunk_nbytes(self):
|
|
279
|
+
def variant_chunk_nbytes(self, schema):
|
|
169
280
|
"""
|
|
170
281
|
Returns the nbytes for a single variant chunk of this array.
|
|
171
282
|
"""
|
|
172
|
-
chunk_items = self.
|
|
173
|
-
for size in self.
|
|
283
|
+
chunk_items = self.get_chunks(schema)[0]
|
|
284
|
+
for size in self.get_shape(schema)[1:]:
|
|
174
285
|
chunk_items *= size
|
|
175
286
|
dt = np.dtype(self.dtype)
|
|
176
287
|
if dt.kind == "O" and "samples" in self.dimensions:
|
|
@@ -181,87 +292,71 @@ class ZarrArraySpec:
|
|
|
181
292
|
return chunk_items * dt.itemsize
|
|
182
293
|
|
|
183
294
|
|
|
184
|
-
|
|
295
|
+
@dataclasses.dataclass
|
|
296
|
+
class Contig:
|
|
297
|
+
id: str
|
|
298
|
+
length: int = None
|
|
185
299
|
|
|
186
300
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
stored.
|
|
197
|
-
"""
|
|
198
|
-
fields_by_name = {field.name: field for field in fields}
|
|
199
|
-
gt = fields_by_name["call_genotype"]
|
|
200
|
-
if gt.shape[-1] != 2:
|
|
201
|
-
raise ValueError("Local alleles only supported on diploid data")
|
|
202
|
-
|
|
203
|
-
# TODO check if LA is already in here
|
|
204
|
-
|
|
205
|
-
shape = gt.shape[:-1]
|
|
206
|
-
chunks = gt.chunks[:-1]
|
|
207
|
-
dimensions = gt.dimensions[:-1]
|
|
208
|
-
|
|
209
|
-
la = ZarrArraySpec.new(
|
|
210
|
-
vcf_field=None,
|
|
211
|
-
name="call_LA",
|
|
212
|
-
dtype="i1",
|
|
213
|
-
shape=gt.shape,
|
|
214
|
-
chunks=gt.chunks,
|
|
215
|
-
dimensions=(*dimensions, "local_alleles"),
|
|
216
|
-
description=(
|
|
217
|
-
"0-based indices into REF+ALT, indicating which alleles"
|
|
218
|
-
" are relevant (local) for the current sample"
|
|
219
|
-
),
|
|
220
|
-
)
|
|
221
|
-
ad = fields_by_name.get("call_AD", None)
|
|
222
|
-
if ad is not None:
|
|
223
|
-
# TODO check if call_LAD is in the list already
|
|
224
|
-
ad.name = "call_LAD"
|
|
225
|
-
ad.vcf_field = None
|
|
226
|
-
ad.shape = (*shape, 2)
|
|
227
|
-
ad.chunks = (*chunks, 2)
|
|
228
|
-
ad.dimensions = (*dimensions, "local_alleles")
|
|
229
|
-
ad.description += " (local-alleles)"
|
|
230
|
-
|
|
231
|
-
pl = fields_by_name.get("call_PL", None)
|
|
232
|
-
if pl is not None:
|
|
233
|
-
# TODO check if call_LPL is in the list already
|
|
234
|
-
pl.name = "call_LPL"
|
|
235
|
-
pl.vcf_field = None
|
|
236
|
-
pl.shape = (*shape, 3)
|
|
237
|
-
pl.chunks = (*chunks, 3)
|
|
238
|
-
pl.description += " (local-alleles)"
|
|
239
|
-
pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
|
|
240
|
-
return [*fields, la]
|
|
301
|
+
@dataclasses.dataclass
|
|
302
|
+
class Sample:
|
|
303
|
+
id: str
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
@dataclasses.dataclass
|
|
307
|
+
class Filter:
|
|
308
|
+
id: str
|
|
309
|
+
description: str = ""
|
|
241
310
|
|
|
242
311
|
|
|
243
312
|
@dataclasses.dataclass
|
|
244
313
|
class VcfZarrSchema(core.JsonDataclass):
|
|
245
314
|
format_version: str
|
|
246
|
-
|
|
247
|
-
variants_chunk_size: int
|
|
248
|
-
samples: list
|
|
249
|
-
contigs: list
|
|
250
|
-
filters: list
|
|
315
|
+
dimensions: dict
|
|
251
316
|
fields: list
|
|
317
|
+
defaults: dict
|
|
318
|
+
|
|
319
|
+
def __init__(
|
|
320
|
+
self,
|
|
321
|
+
format_version: str,
|
|
322
|
+
fields: list,
|
|
323
|
+
dimensions: dict,
|
|
324
|
+
defaults: dict = None,
|
|
325
|
+
):
|
|
326
|
+
self.format_version = format_version
|
|
327
|
+
self.fields = fields
|
|
328
|
+
defaults = defaults.copy() if defaults is not None else {}
|
|
329
|
+
if defaults.get("compressor", None) is None:
|
|
330
|
+
defaults["compressor"] = DEFAULT_ZARR_COMPRESSOR.get_config()
|
|
331
|
+
if defaults.get("filters", None) is None:
|
|
332
|
+
defaults["filters"] = []
|
|
333
|
+
self.defaults = defaults
|
|
334
|
+
self.dimensions = dimensions
|
|
335
|
+
|
|
336
|
+
def get_shape(self, dimensions):
|
|
337
|
+
return [self.dimensions[dim].size for dim in dimensions]
|
|
338
|
+
|
|
339
|
+
def get_chunks(self, dimensions):
|
|
340
|
+
return [self.dimensions[dim].chunk_size for dim in dimensions]
|
|
252
341
|
|
|
253
342
|
def validate(self):
|
|
254
343
|
"""
|
|
255
344
|
Checks that the schema is well-formed and within required limits.
|
|
256
345
|
"""
|
|
257
346
|
for field in self.fields:
|
|
347
|
+
for dim in field.dimensions:
|
|
348
|
+
if dim not in self.dimensions:
|
|
349
|
+
raise ValueError(
|
|
350
|
+
f"Dimension '{dim}' used in field '{field.name}' is "
|
|
351
|
+
"not defined in the schema"
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
chunk_nbytes = field.get_chunk_nbytes(self)
|
|
258
355
|
# This is the Blosc max buffer size
|
|
259
|
-
if
|
|
260
|
-
# TODO add some links to documentation here advising how to
|
|
261
|
-
# deal with PL values.
|
|
356
|
+
if chunk_nbytes > 2147483647:
|
|
262
357
|
raise ValueError(
|
|
263
358
|
f"Field {field.name} chunks are too large "
|
|
264
|
-
f"({
|
|
359
|
+
f"({chunk_nbytes} > 2**31 - 1 bytes). "
|
|
265
360
|
"Either generate a schema and drop this field (if you don't "
|
|
266
361
|
"need it) or reduce the variant or sample chunk sizes."
|
|
267
362
|
)
|
|
@@ -278,253 +373,30 @@ class VcfZarrSchema(core.JsonDataclass):
|
|
|
278
373
|
"Zarr schema format version mismatch: "
|
|
279
374
|
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
|
|
280
375
|
)
|
|
376
|
+
|
|
281
377
|
ret = VcfZarrSchema(**d)
|
|
282
|
-
ret.samples = [icf.Sample(**sd) for sd in d["samples"]]
|
|
283
|
-
ret.contigs = [icf.Contig(**sd) for sd in d["contigs"]]
|
|
284
|
-
ret.filters = [icf.Filter(**sd) for sd in d["filters"]]
|
|
285
378
|
ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
|
|
379
|
+
ret.dimensions = {
|
|
380
|
+
k: VcfZarrDimension.fromdict(v) for k, v in d["dimensions"].items()
|
|
381
|
+
}
|
|
382
|
+
|
|
286
383
|
return ret
|
|
287
384
|
|
|
288
385
|
@staticmethod
|
|
289
386
|
def fromjson(s):
|
|
290
387
|
return VcfZarrSchema.fromdict(json.loads(s))
|
|
291
388
|
|
|
292
|
-
@staticmethod
|
|
293
|
-
def generate(
|
|
294
|
-
icf, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
|
|
295
|
-
):
|
|
296
|
-
m = icf.num_records
|
|
297
|
-
n = icf.num_samples
|
|
298
|
-
if samples_chunk_size is None:
|
|
299
|
-
samples_chunk_size = 10_000
|
|
300
|
-
if variants_chunk_size is None:
|
|
301
|
-
variants_chunk_size = 1000
|
|
302
|
-
if local_alleles is None:
|
|
303
|
-
local_alleles = False
|
|
304
|
-
logger.info(
|
|
305
|
-
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
|
|
306
|
-
)
|
|
307
|
-
|
|
308
|
-
def spec_from_field(field, array_name=None):
|
|
309
|
-
return ZarrArraySpec.from_field(
|
|
310
|
-
field,
|
|
311
|
-
num_samples=n,
|
|
312
|
-
num_variants=m,
|
|
313
|
-
samples_chunk_size=samples_chunk_size,
|
|
314
|
-
variants_chunk_size=variants_chunk_size,
|
|
315
|
-
array_name=array_name,
|
|
316
|
-
)
|
|
317
|
-
|
|
318
|
-
def fixed_field_spec(
|
|
319
|
-
name,
|
|
320
|
-
dtype,
|
|
321
|
-
vcf_field=None,
|
|
322
|
-
shape=(m,),
|
|
323
|
-
dimensions=("variants",),
|
|
324
|
-
chunks=None,
|
|
325
|
-
):
|
|
326
|
-
return ZarrArraySpec.new(
|
|
327
|
-
vcf_field=vcf_field,
|
|
328
|
-
name=name,
|
|
329
|
-
dtype=dtype,
|
|
330
|
-
shape=shape,
|
|
331
|
-
description="",
|
|
332
|
-
dimensions=dimensions,
|
|
333
|
-
chunks=chunks or [variants_chunk_size],
|
|
334
|
-
)
|
|
335
|
-
|
|
336
|
-
alt_field = icf.fields["ALT"]
|
|
337
|
-
max_alleles = alt_field.vcf_field.summary.max_number + 1
|
|
338
|
-
|
|
339
|
-
array_specs = [
|
|
340
|
-
fixed_field_spec(
|
|
341
|
-
name="variant_contig",
|
|
342
|
-
dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
|
|
343
|
-
),
|
|
344
|
-
fixed_field_spec(
|
|
345
|
-
name="variant_filter",
|
|
346
|
-
dtype="bool",
|
|
347
|
-
shape=(m, icf.metadata.num_filters),
|
|
348
|
-
dimensions=["variants", "filters"],
|
|
349
|
-
chunks=(variants_chunk_size, icf.metadata.num_filters),
|
|
350
|
-
),
|
|
351
|
-
fixed_field_spec(
|
|
352
|
-
name="variant_allele",
|
|
353
|
-
dtype="O",
|
|
354
|
-
shape=(m, max_alleles),
|
|
355
|
-
dimensions=["variants", "alleles"],
|
|
356
|
-
chunks=(variants_chunk_size, max_alleles),
|
|
357
|
-
),
|
|
358
|
-
fixed_field_spec(
|
|
359
|
-
name="variant_id",
|
|
360
|
-
dtype="O",
|
|
361
|
-
),
|
|
362
|
-
fixed_field_spec(
|
|
363
|
-
name="variant_id_mask",
|
|
364
|
-
dtype="bool",
|
|
365
|
-
),
|
|
366
|
-
]
|
|
367
|
-
name_map = {field.full_name: field for field in icf.metadata.fields}
|
|
368
|
-
|
|
369
|
-
# Only three of the fixed fields have a direct one-to-one mapping.
|
|
370
|
-
array_specs.extend(
|
|
371
|
-
[
|
|
372
|
-
spec_from_field(name_map["QUAL"], array_name="variant_quality"),
|
|
373
|
-
spec_from_field(name_map["POS"], array_name="variant_position"),
|
|
374
|
-
spec_from_field(name_map["rlen"], array_name="variant_length"),
|
|
375
|
-
]
|
|
376
|
-
)
|
|
377
|
-
array_specs.extend(
|
|
378
|
-
[spec_from_field(field) for field in icf.metadata.info_fields]
|
|
379
|
-
)
|
|
380
|
-
|
|
381
|
-
gt_field = None
|
|
382
|
-
for field in icf.metadata.format_fields:
|
|
383
|
-
if field.name == "GT":
|
|
384
|
-
gt_field = field
|
|
385
|
-
continue
|
|
386
|
-
array_specs.append(spec_from_field(field))
|
|
387
|
-
|
|
388
|
-
if gt_field is not None and n > 0:
|
|
389
|
-
ploidy = max(gt_field.summary.max_number - 1, 1)
|
|
390
|
-
shape = [m, n]
|
|
391
|
-
chunks = [variants_chunk_size, samples_chunk_size]
|
|
392
|
-
dimensions = ["variants", "samples"]
|
|
393
|
-
array_specs.append(
|
|
394
|
-
ZarrArraySpec.new(
|
|
395
|
-
vcf_field=None,
|
|
396
|
-
name="call_genotype_phased",
|
|
397
|
-
dtype="bool",
|
|
398
|
-
shape=list(shape),
|
|
399
|
-
chunks=list(chunks),
|
|
400
|
-
dimensions=list(dimensions),
|
|
401
|
-
description="",
|
|
402
|
-
)
|
|
403
|
-
)
|
|
404
|
-
shape += [ploidy]
|
|
405
|
-
chunks += [ploidy]
|
|
406
|
-
dimensions += ["ploidy"]
|
|
407
|
-
array_specs.append(
|
|
408
|
-
ZarrArraySpec.new(
|
|
409
|
-
vcf_field=None,
|
|
410
|
-
name="call_genotype",
|
|
411
|
-
dtype=gt_field.smallest_dtype(),
|
|
412
|
-
shape=list(shape),
|
|
413
|
-
chunks=list(chunks),
|
|
414
|
-
dimensions=list(dimensions),
|
|
415
|
-
description="",
|
|
416
|
-
)
|
|
417
|
-
)
|
|
418
|
-
array_specs.append(
|
|
419
|
-
ZarrArraySpec.new(
|
|
420
|
-
vcf_field=None,
|
|
421
|
-
name="call_genotype_mask",
|
|
422
|
-
dtype="bool",
|
|
423
|
-
shape=list(shape),
|
|
424
|
-
chunks=list(chunks),
|
|
425
|
-
dimensions=list(dimensions),
|
|
426
|
-
description="",
|
|
427
|
-
)
|
|
428
|
-
)
|
|
429
|
-
|
|
430
|
-
if local_alleles:
|
|
431
|
-
array_specs = convert_local_allele_field_types(array_specs)
|
|
432
|
-
|
|
433
|
-
return VcfZarrSchema(
|
|
434
|
-
format_version=ZARR_SCHEMA_FORMAT_VERSION,
|
|
435
|
-
samples_chunk_size=samples_chunk_size,
|
|
436
|
-
variants_chunk_size=variants_chunk_size,
|
|
437
|
-
fields=array_specs,
|
|
438
|
-
samples=icf.metadata.samples,
|
|
439
|
-
contigs=icf.metadata.contigs,
|
|
440
|
-
filters=icf.metadata.filters,
|
|
441
|
-
)
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
class VcfZarr:
|
|
445
|
-
def __init__(self, path):
|
|
446
|
-
if not (path / ".zmetadata").exists():
|
|
447
|
-
raise ValueError("Not in VcfZarr format") # NEEDS TEST
|
|
448
|
-
self.path = path
|
|
449
|
-
self.root = zarr.open(path, mode="r")
|
|
450
|
-
|
|
451
|
-
def summary_table(self):
|
|
452
|
-
data = []
|
|
453
|
-
arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
|
|
454
|
-
arrays.sort(key=lambda x: x[0])
|
|
455
|
-
for stored, array in reversed(arrays):
|
|
456
|
-
d = {
|
|
457
|
-
"name": array.name,
|
|
458
|
-
"dtype": str(array.dtype),
|
|
459
|
-
"stored": core.display_size(stored),
|
|
460
|
-
"size": core.display_size(array.nbytes),
|
|
461
|
-
"ratio": core.display_number(array.nbytes / stored),
|
|
462
|
-
"nchunks": str(array.nchunks),
|
|
463
|
-
"chunk_size": core.display_size(array.nbytes / array.nchunks),
|
|
464
|
-
"avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
|
|
465
|
-
"shape": str(array.shape),
|
|
466
|
-
"chunk_shape": str(array.chunks),
|
|
467
|
-
"compressor": str(array.compressor),
|
|
468
|
-
"filters": str(array.filters),
|
|
469
|
-
}
|
|
470
|
-
data.append(d)
|
|
471
|
-
return data
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
def parse_max_memory(max_memory):
|
|
475
|
-
if max_memory is None:
|
|
476
|
-
# Effectively unbounded
|
|
477
|
-
return 2**63
|
|
478
|
-
if isinstance(max_memory, str):
|
|
479
|
-
max_memory = humanfriendly.parse_size(max_memory)
|
|
480
|
-
logger.info(f"Set memory budget to {core.display_size(max_memory)}")
|
|
481
|
-
return max_memory
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
@dataclasses.dataclass
|
|
485
|
-
class VcfZarrPartition:
|
|
486
|
-
start: int
|
|
487
|
-
stop: int
|
|
488
|
-
|
|
489
|
-
@staticmethod
|
|
490
|
-
def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
|
|
491
|
-
num_chunks = int(np.ceil(num_records / chunk_size))
|
|
492
|
-
if max_chunks is not None:
|
|
493
|
-
num_chunks = min(num_chunks, max_chunks)
|
|
494
|
-
partitions = []
|
|
495
|
-
splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
|
|
496
|
-
for chunk_slice in splits:
|
|
497
|
-
start_chunk = int(chunk_slice[0])
|
|
498
|
-
stop_chunk = int(chunk_slice[-1]) + 1
|
|
499
|
-
start_index = start_chunk * chunk_size
|
|
500
|
-
stop_index = min(stop_chunk * chunk_size, num_records)
|
|
501
|
-
partitions.append(VcfZarrPartition(start_index, stop_index))
|
|
502
|
-
return partitions
|
|
503
389
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
partitions: list
|
|
515
|
-
provenance: dict
|
|
516
|
-
|
|
517
|
-
@staticmethod
|
|
518
|
-
def fromdict(d):
|
|
519
|
-
if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
|
|
520
|
-
raise ValueError(
|
|
521
|
-
"VcfZarrWriter format version mismatch: "
|
|
522
|
-
f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
|
|
523
|
-
)
|
|
524
|
-
ret = VcfZarrWriterMetadata(**d)
|
|
525
|
-
ret.schema = VcfZarrSchema.fromdict(ret.schema)
|
|
526
|
-
ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
|
|
527
|
-
return ret
|
|
390
|
+
def sanitise_int_array(value, ndmin, dtype):
|
|
391
|
+
if isinstance(value, tuple):
|
|
392
|
+
value = [
|
|
393
|
+
constants.VCF_INT_MISSING if x is None else x for x in value
|
|
394
|
+
] # NEEDS TEST
|
|
395
|
+
value = np.array(value, ndmin=ndmin, copy=True)
|
|
396
|
+
value[value == constants.VCF_INT_MISSING] = -1
|
|
397
|
+
value[value == constants.VCF_INT_FILL] = -2
|
|
398
|
+
# TODO watch out for clipping here!
|
|
399
|
+
return value.astype(dtype)
|
|
528
400
|
|
|
529
401
|
|
|
530
402
|
def compute_la_field(genotypes):
|
|
@@ -597,14 +469,60 @@ class LocalisableFieldDescriptor:
|
|
|
597
469
|
|
|
598
470
|
localisable_fields = [
|
|
599
471
|
LocalisableFieldDescriptor(
|
|
600
|
-
"call_LAD", "FORMAT/AD",
|
|
472
|
+
"call_LAD", "FORMAT/AD", sanitise_int_array, compute_lad_field
|
|
601
473
|
),
|
|
602
474
|
LocalisableFieldDescriptor(
|
|
603
|
-
"call_LPL", "FORMAT/PL",
|
|
475
|
+
"call_LPL", "FORMAT/PL", sanitise_int_array, compute_lpl_field
|
|
604
476
|
),
|
|
605
477
|
]
|
|
606
478
|
|
|
607
479
|
|
|
480
|
+
@dataclasses.dataclass
|
|
481
|
+
class VcfZarrPartition:
|
|
482
|
+
start: int
|
|
483
|
+
stop: int
|
|
484
|
+
|
|
485
|
+
@staticmethod
|
|
486
|
+
def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
|
|
487
|
+
num_chunks = int(np.ceil(num_records / chunk_size))
|
|
488
|
+
if max_chunks is not None:
|
|
489
|
+
num_chunks = min(num_chunks, max_chunks)
|
|
490
|
+
partitions = []
|
|
491
|
+
splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
|
|
492
|
+
for chunk_slice in splits:
|
|
493
|
+
start_chunk = int(chunk_slice[0])
|
|
494
|
+
stop_chunk = int(chunk_slice[-1]) + 1
|
|
495
|
+
start_index = start_chunk * chunk_size
|
|
496
|
+
stop_index = min(stop_chunk * chunk_size, num_records)
|
|
497
|
+
partitions.append(VcfZarrPartition(start_index, stop_index))
|
|
498
|
+
return partitions
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
VZW_METADATA_FORMAT_VERSION = "0.1"
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
@dataclasses.dataclass
|
|
505
|
+
class VcfZarrWriterMetadata(core.JsonDataclass):
|
|
506
|
+
format_version: str
|
|
507
|
+
source_path: str
|
|
508
|
+
schema: VcfZarrSchema
|
|
509
|
+
dimension_separator: str
|
|
510
|
+
partitions: list
|
|
511
|
+
provenance: dict
|
|
512
|
+
|
|
513
|
+
@staticmethod
|
|
514
|
+
def fromdict(d):
|
|
515
|
+
if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
|
|
516
|
+
raise ValueError(
|
|
517
|
+
"VcfZarrWriter format version mismatch: "
|
|
518
|
+
f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
|
|
519
|
+
)
|
|
520
|
+
ret = VcfZarrWriterMetadata(**d)
|
|
521
|
+
ret.schema = VcfZarrSchema.fromdict(ret.schema)
|
|
522
|
+
ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
|
|
523
|
+
return ret
|
|
524
|
+
|
|
525
|
+
|
|
608
526
|
@dataclasses.dataclass
|
|
609
527
|
class VcfZarrWriteSummary(core.JsonDataclass):
|
|
610
528
|
num_partitions: int
|
|
@@ -615,13 +533,14 @@ class VcfZarrWriteSummary(core.JsonDataclass):
|
|
|
615
533
|
|
|
616
534
|
|
|
617
535
|
class VcfZarrWriter:
|
|
618
|
-
def __init__(self, path):
|
|
536
|
+
def __init__(self, source_type, path):
|
|
537
|
+
self.source_type = source_type
|
|
619
538
|
self.path = pathlib.Path(path)
|
|
620
539
|
self.wip_path = self.path / "wip"
|
|
621
540
|
self.arrays_path = self.wip_path / "arrays"
|
|
622
541
|
self.partitions_path = self.wip_path / "partitions"
|
|
623
542
|
self.metadata = None
|
|
624
|
-
self.
|
|
543
|
+
self.source = None
|
|
625
544
|
|
|
626
545
|
@property
|
|
627
546
|
def schema(self):
|
|
@@ -639,7 +558,7 @@ class VcfZarrWriter:
|
|
|
639
558
|
|
|
640
559
|
def has_local_alleles(self):
|
|
641
560
|
for field in self.schema.fields:
|
|
642
|
-
if field.name == "call_LA" and field.
|
|
561
|
+
if field.name == "call_LA" and field.source is None:
|
|
643
562
|
return True
|
|
644
563
|
return False
|
|
645
564
|
|
|
@@ -649,20 +568,20 @@ class VcfZarrWriter:
|
|
|
649
568
|
|
|
650
569
|
def init(
|
|
651
570
|
self,
|
|
652
|
-
|
|
571
|
+
source,
|
|
653
572
|
*,
|
|
654
573
|
target_num_partitions,
|
|
655
574
|
schema,
|
|
656
575
|
dimension_separator=None,
|
|
657
576
|
max_variant_chunks=None,
|
|
658
577
|
):
|
|
659
|
-
self.
|
|
578
|
+
self.source = source
|
|
660
579
|
if self.path.exists():
|
|
661
580
|
raise ValueError("Zarr path already exists") # NEEDS TEST
|
|
662
581
|
schema.validate()
|
|
663
582
|
partitions = VcfZarrPartition.generate_partitions(
|
|
664
|
-
self.
|
|
665
|
-
schema.
|
|
583
|
+
self.source.num_records,
|
|
584
|
+
schema.get_chunks(["variants"])[0],
|
|
666
585
|
target_num_partitions,
|
|
667
586
|
max_chunks=max_variant_chunks,
|
|
668
587
|
)
|
|
@@ -673,7 +592,7 @@ class VcfZarrWriter:
|
|
|
673
592
|
)
|
|
674
593
|
self.metadata = VcfZarrWriterMetadata(
|
|
675
594
|
format_version=VZW_METADATA_FORMAT_VERSION,
|
|
676
|
-
|
|
595
|
+
source_path=str(self.source.path),
|
|
677
596
|
schema=schema,
|
|
678
597
|
dimension_separator=dimension_separator,
|
|
679
598
|
partitions=partitions,
|
|
@@ -682,27 +601,32 @@ class VcfZarrWriter:
|
|
|
682
601
|
)
|
|
683
602
|
|
|
684
603
|
self.path.mkdir()
|
|
685
|
-
root = zarr.open(store=self.path, mode="a", **ZARR_FORMAT_KWARGS)
|
|
604
|
+
root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS)
|
|
686
605
|
root.attrs.update(
|
|
687
606
|
{
|
|
688
|
-
"vcf_zarr_version": "0.
|
|
689
|
-
"vcf_header": self.icf.vcf_header,
|
|
607
|
+
"vcf_zarr_version": "0.4",
|
|
690
608
|
"source": f"bio2zarr-{provenance.__version__}",
|
|
691
609
|
}
|
|
692
610
|
)
|
|
693
|
-
|
|
611
|
+
root.attrs.update(self.source.root_attrs)
|
|
612
|
+
|
|
613
|
+
# Doing this synchronously - this is fine surely
|
|
694
614
|
self.encode_samples(root)
|
|
695
|
-
self.
|
|
696
|
-
|
|
615
|
+
if self.source.filters is not None:
|
|
616
|
+
self.encode_filters(root)
|
|
617
|
+
if self.source.contigs is not None:
|
|
618
|
+
self.encode_contigs(root)
|
|
697
619
|
|
|
698
620
|
self.wip_path.mkdir()
|
|
699
621
|
self.arrays_path.mkdir()
|
|
700
622
|
self.partitions_path.mkdir()
|
|
701
|
-
root = zarr.open(
|
|
623
|
+
root = zarr.open(
|
|
624
|
+
store=self.arrays_path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS
|
|
625
|
+
)
|
|
702
626
|
|
|
703
627
|
total_chunks = 0
|
|
704
628
|
for field in self.schema.fields:
|
|
705
|
-
a = self.init_array(root, field, partitions[-1].stop)
|
|
629
|
+
a = self.init_array(root, self.metadata.schema, field, partitions[-1].stop)
|
|
706
630
|
total_chunks += a.nchunks
|
|
707
631
|
|
|
708
632
|
logger.info("Writing WIP metadata")
|
|
@@ -710,79 +634,97 @@ class VcfZarrWriter:
|
|
|
710
634
|
json.dump(self.metadata.asdict(), f, indent=4)
|
|
711
635
|
|
|
712
636
|
return VcfZarrWriteSummary(
|
|
713
|
-
num_variants=self.
|
|
714
|
-
num_samples=self.
|
|
637
|
+
num_variants=self.source.num_records,
|
|
638
|
+
num_samples=self.source.num_samples,
|
|
715
639
|
num_partitions=self.num_partitions,
|
|
716
640
|
num_chunks=total_chunks,
|
|
717
641
|
max_encoding_memory=core.display_size(self.get_max_encoding_memory()),
|
|
718
642
|
)
|
|
719
643
|
|
|
720
644
|
def encode_samples(self, root):
    """Store the source's sample IDs in root as the "sample_id" array."""
    source_samples = self.source.samples
    sample_ids = [entry.id for entry in source_samples]
    # The chunk length comes from the schema's "samples" dimension.
    samples_chunk = self.schema.get_chunks(["samples"])[0]
    sample_array = root.array(
        "sample_id",
        data=sample_ids,
        shape=len(source_samples),
        dtype="str",
        compressor=DEFAULT_ZARR_COMPRESSOR,
        chunks=(samples_chunk,),
    )
    sample_array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
    logger.debug("Samples done")
|
|
733
656
|
|
|
734
|
-
def
|
|
657
|
+
def encode_contigs(self, root):
    """Store contig IDs in root, plus contig lengths when all are known.

    "contig_id" is always written; "contig_length" is written only if no
    contig reports a length of None.
    """
    contigs = self.source.contigs
    id_array = root.array(
        "contig_id",
        data=[entry.id for entry in contigs],
        shape=len(contigs),
        dtype="str",
        compressor=DEFAULT_ZARR_COMPRESSOR,
    )
    id_array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

    # Skip the lengths array entirely if even one length is missing.
    if any(entry.length is None for entry in contigs):
        return

    length_array = root.array(
        "contig_length",
        data=[entry.length for entry in contigs],
        shape=len(contigs),
        dtype=np.int64,
        compressor=DEFAULT_ZARR_COMPRESSOR,
    )
    length_array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
|
|
752
676
|
|
|
753
|
-
def
|
|
754
|
-
|
|
755
|
-
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
|
|
677
|
+
def encode_filters(self, root):
    """Store the source's filter IDs and descriptions as arrays in root."""
    filters = self.source.filters
    # Each pair is (array name to create, attribute read from every filter).
    layout = (
        ("filter_id", "id"),
        ("filter_description", "description"),
    )
    for array_name, attribute in layout:
        created = root.array(
            array_name,
            data=[getattr(filt, attribute) for filt in filters],
            shape=len(filters),
            dtype="str",
            compressor=DEFAULT_ZARR_COMPRESSOR,
        )
        created.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
|
|
764
695
|
|
|
765
|
-
def init_array(self, root, array_spec, variants_dim_size):
|
|
766
|
-
kwargs = dict(ZARR_FORMAT_KWARGS)
|
|
767
|
-
filters =
|
|
696
|
+
def init_array(self, root, schema, array_spec, variants_dim_size):
|
|
697
|
+
kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
|
|
698
|
+
filters = (
|
|
699
|
+
array_spec.filters
|
|
700
|
+
if array_spec.filters is not None
|
|
701
|
+
else schema.defaults["filters"]
|
|
702
|
+
)
|
|
703
|
+
filters = [numcodecs.get_codec(filt) for filt in filters]
|
|
704
|
+
compressor = (
|
|
705
|
+
array_spec.compressor
|
|
706
|
+
if array_spec.compressor is not None
|
|
707
|
+
else schema.defaults["compressor"]
|
|
708
|
+
)
|
|
709
|
+
compressor = numcodecs.get_codec(compressor)
|
|
768
710
|
if array_spec.dtype == "O":
|
|
769
|
-
if zarr_v3():
|
|
711
|
+
if zarr_utils.zarr_v3():
|
|
770
712
|
filters = [*list(filters), numcodecs.VLenUTF8()]
|
|
771
713
|
else:
|
|
772
714
|
kwargs["object_codec"] = numcodecs.VLenUTF8()
|
|
773
715
|
|
|
774
|
-
if not zarr_v3():
|
|
716
|
+
if not zarr_utils.zarr_v3():
|
|
775
717
|
kwargs["dimension_separator"] = self.metadata.dimension_separator
|
|
776
718
|
|
|
777
|
-
shape =
|
|
778
|
-
# Truncate the variants dimension
|
|
719
|
+
shape = schema.get_shape(array_spec.dimensions)
|
|
720
|
+
# Truncate the variants dimension if max_variant_chunks was specified
|
|
779
721
|
shape[0] = variants_dim_size
|
|
780
722
|
a = root.empty(
|
|
781
723
|
name=array_spec.name,
|
|
782
724
|
shape=shape,
|
|
783
|
-
chunks=array_spec.
|
|
725
|
+
chunks=schema.get_chunks(array_spec.dimensions),
|
|
784
726
|
dtype=array_spec.dtype,
|
|
785
|
-
compressor=
|
|
727
|
+
compressor=compressor,
|
|
786
728
|
filters=filters,
|
|
787
729
|
**kwargs,
|
|
788
730
|
)
|
|
@@ -804,7 +746,7 @@ class VcfZarrWriter:
|
|
|
804
746
|
if self.metadata is None:
|
|
805
747
|
with open(self.wip_path / "metadata.json") as f:
|
|
806
748
|
self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
|
|
807
|
-
self.
|
|
749
|
+
self.source = self.source_type(self.metadata.source_path)
|
|
808
750
|
|
|
809
751
|
def partition_path(self, partition_index):
    """Return the path "p<partition_index>" beneath partitions_path."""
    entry_name = f"p{partition_index}"
    return self.partitions_path / entry_name
|
|
@@ -826,15 +768,18 @@ class VcfZarrWriter:
|
|
|
826
768
|
partition_path.mkdir(exist_ok=True)
|
|
827
769
|
logger.info(f"Encoding partition {partition_index} to {partition_path}")
|
|
828
770
|
|
|
829
|
-
self.
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
771
|
+
all_field_names = [field.name for field in self.schema.fields]
|
|
772
|
+
if "variant_id" in all_field_names:
|
|
773
|
+
self.encode_id_partition(partition_index)
|
|
774
|
+
if "variant_filter" in all_field_names:
|
|
775
|
+
self.encode_filters_partition(partition_index)
|
|
776
|
+
if "variant_contig" in all_field_names:
|
|
777
|
+
self.encode_contig_partition(partition_index)
|
|
778
|
+
self.encode_alleles_and_genotypes_partition(partition_index)
|
|
833
779
|
for array_spec in self.schema.fields:
|
|
834
|
-
if array_spec.
|
|
780
|
+
if array_spec.source is not None:
|
|
835
781
|
self.encode_array_partition(array_spec, partition_index)
|
|
836
782
|
if self.has_genotypes():
|
|
837
|
-
self.encode_genotypes_partition(partition_index)
|
|
838
783
|
self.encode_genotype_mask_partition(partition_index)
|
|
839
784
|
if self.has_local_alleles():
|
|
840
785
|
self.encode_local_alleles_partition(partition_index)
|
|
@@ -874,34 +819,48 @@ class VcfZarrWriter:
|
|
|
874
819
|
def encode_array_partition(self, array_spec, partition_index):
    """Fill one partition of array_spec's array with values from the source.

    Values are pulled from source.iter_field for the partition's
    [start, stop) range and written row by row into the buffered array.
    """
    bounds = self.metadata.partitions[partition_index]
    buffered = self.init_partition_array(partition_index, array_spec.name)
    row_shape = buffered.buff.shape[1:]
    values = self.source.iter_field(
        array_spec.source, row_shape, bounds.start, bounds.stop
    )
    for value in values:
        # Ask for the row first, then read .buff, in that order.
        row = buffered.next_buffer_row()
        buffered.buff[row] = value

    self.finalise_partition_array(partition_index, buffered)
|
|
886
832
|
|
|
887
|
-
def
|
|
833
|
+
def encode_alleles_and_genotypes_partition(self, partition_index):
    """Encode alleles, variant lengths and (if present) genotypes together.

    One pass over source.iter_alleles_and_genotypes fills the
    "variant_allele" and "variant_length" buffers and, when the writer has
    genotypes, the "call_genotype" and "call_genotype_phased" buffers too.
    """
    bounds = self.metadata.partitions[partition_index]
    allele_buf = self.init_partition_array(partition_index, "variant_allele")
    length_buf = self.init_partition_array(partition_index, "variant_length")
    with_genotypes = self.has_genotypes()
    # Stays None when there are no genotypes to request from the source.
    genotype_shape = None
    if with_genotypes:
        gt_buf = self.init_partition_array(partition_index, "call_genotype")
        phased_buf = self.init_partition_array(
            partition_index, "call_genotype_phased"
        )
        genotype_shape = gt_buf.buff.shape[1:]

    records = self.source.iter_alleles_and_genotypes(
        bounds.start, bounds.stop, genotype_shape, allele_buf.array.shape[1]
    )
    for record in records:
        allele_row = allele_buf.next_buffer_row()
        allele_buf.buff[allele_row] = record.alleles
        length_row = length_buf.next_buffer_row()
        length_buf.buff[length_row] = record.variant_length
        if not with_genotypes:
            continue
        gt_row = gt_buf.next_buffer_row()
        gt_buf.buff[gt_row] = record.genotypes
        phased_row = phased_buf.next_buffer_row()
        phased_buf.buff[phased_row] = record.phased

    self.finalise_partition_array(partition_index, allele_buf)
    self.finalise_partition_array(partition_index, length_buf)
    if with_genotypes:
        self.finalise_partition_array(partition_index, gt_buf)
        self.finalise_partition_array(partition_index, phased_buf)
|
|
905
864
|
|
|
906
865
|
def encode_genotype_mask_partition(self, partition_index):
|
|
907
866
|
partition = self.metadata.partitions[partition_index]
|
|
@@ -948,10 +907,10 @@ class VcfZarrWriter:
|
|
|
948
907
|
for descriptor in localisable_fields:
|
|
949
908
|
if descriptor.array_name not in field_map:
|
|
950
909
|
continue
|
|
951
|
-
assert field_map[descriptor.array_name].
|
|
910
|
+
assert field_map[descriptor.array_name].source is None
|
|
952
911
|
|
|
953
912
|
buff = self.init_partition_array(partition_index, descriptor.array_name)
|
|
954
|
-
source = self.
|
|
913
|
+
source = self.source.fields[descriptor.vcf_field].iter_values(
|
|
955
914
|
partition.start, partition.stop
|
|
956
915
|
)
|
|
957
916
|
for la in core.first_dim_slice_iter(
|
|
@@ -963,34 +922,17 @@ class VcfZarrWriter:
|
|
|
963
922
|
buff.buff[j] = descriptor.convert(value, la)
|
|
964
923
|
self.finalise_partition_array(partition_index, buff)
|
|
965
924
|
|
|
966
|
-
def encode_alleles_partition(self, partition_index):
|
|
967
|
-
alleles = self.init_partition_array(partition_index, "variant_allele")
|
|
968
|
-
partition = self.metadata.partitions[partition_index]
|
|
969
|
-
ref_field = self.icf.fields["REF"]
|
|
970
|
-
alt_field = self.icf.fields["ALT"]
|
|
971
|
-
|
|
972
|
-
for ref, alt in zip(
|
|
973
|
-
ref_field.iter_values(partition.start, partition.stop),
|
|
974
|
-
alt_field.iter_values(partition.start, partition.stop),
|
|
975
|
-
):
|
|
976
|
-
j = alleles.next_buffer_row()
|
|
977
|
-
alleles.buff[j, :] = constants.STR_FILL
|
|
978
|
-
alleles.buff[j, 0] = ref[0]
|
|
979
|
-
alleles.buff[j, 1 : 1 + len(alt)] = alt
|
|
980
|
-
self.finalise_partition_array(partition_index, alleles)
|
|
981
|
-
|
|
982
925
|
def encode_id_partition(self, partition_index):
|
|
983
926
|
vid = self.init_partition_array(partition_index, "variant_id")
|
|
984
927
|
vid_mask = self.init_partition_array(partition_index, "variant_id_mask")
|
|
985
928
|
partition = self.metadata.partitions[partition_index]
|
|
986
|
-
field = self.icf.fields["ID"]
|
|
987
929
|
|
|
988
|
-
for value in
|
|
930
|
+
for value in self.source.iter_id(partition.start, partition.stop):
|
|
989
931
|
j = vid.next_buffer_row()
|
|
990
932
|
k = vid_mask.next_buffer_row()
|
|
991
933
|
assert j == k
|
|
992
934
|
if value is not None:
|
|
993
|
-
vid.buff[j] = value
|
|
935
|
+
vid.buff[j] = value
|
|
994
936
|
vid_mask.buff[j] = False
|
|
995
937
|
else:
|
|
996
938
|
vid.buff[j] = constants.STR_MISSING
|
|
@@ -1000,37 +942,22 @@ class VcfZarrWriter:
|
|
|
1000
942
|
self.finalise_partition_array(partition_index, vid_mask)
|
|
1001
943
|
|
|
1002
944
|
def encode_filters_partition(self, partition_index):
    """Fill one partition of "variant_filter" from source.iter_filters."""
    filter_buf = self.init_partition_array(partition_index, "variant_filter")
    bounds = self.metadata.partitions[partition_index]

    rows = self.source.iter_filters(bounds.start, bounds.stop)
    for row_values in rows:
        # Ask for the row first, then read .buff, in that order.
        row = filter_buf.next_buffer_row()
        filter_buf.buff[row] = row_values

    self.finalise_partition_array(partition_index, filter_buf)
|
|
1020
953
|
|
|
1021
954
|
def encode_contig_partition(self, partition_index):
    """Fill one partition of "variant_contig" from source.iter_contig."""
    contig_buf = self.init_partition_array(partition_index, "variant_contig")
    bounds = self.metadata.partitions[partition_index]

    indexes = self.source.iter_contig(bounds.start, bounds.stop)
    for contig_index in indexes:
        # Ask for the row first, then read .buff, in that order.
        row = contig_buf.next_buffer_row()
        contig_buf.buff[row] = contig_index

    self.finalise_partition_array(partition_index, contig_buf)
|
|
1036
963
|
|
|
@@ -1109,60 +1036,8 @@ class VcfZarrWriter:
|
|
|
1109
1036
|
def create_index(self):
    """Create an index to support efficient region queries.

    The work is delegated to a VcfZarrIndexer built on this writer's path.
    """
    VcfZarrIndexer(self.path).create_index()
|
|
1166
1041
|
|
|
1167
1042
|
######################
|
|
1168
1043
|
# encode_all_partitions
|
|
@@ -1174,11 +1049,13 @@ class VcfZarrWriter:
|
|
|
1174
1049
|
"""
|
|
1175
1050
|
max_encoding_mem = 0
|
|
1176
1051
|
for array_spec in self.schema.fields:
|
|
1177
|
-
max_encoding_mem = max(
|
|
1052
|
+
max_encoding_mem = max(
|
|
1053
|
+
max_encoding_mem, array_spec.variant_chunk_nbytes(self.schema)
|
|
1054
|
+
)
|
|
1178
1055
|
gt_mem = 0
|
|
1179
1056
|
if self.has_genotypes:
|
|
1180
1057
|
gt_mem = sum(
|
|
1181
|
-
field.variant_chunk_nbytes
|
|
1058
|
+
field.variant_chunk_nbytes(self.schema)
|
|
1182
1059
|
for field in self.schema.fields
|
|
1183
1060
|
if field.name.startswith("call_genotype")
|
|
1184
1061
|
)
|
|
@@ -1187,7 +1064,7 @@ class VcfZarrWriter:
|
|
|
1187
1064
|
def encode_all_partitions(
|
|
1188
1065
|
self, *, worker_processes=1, show_progress=False, max_memory=None
|
|
1189
1066
|
):
|
|
1190
|
-
max_memory = parse_max_memory(max_memory)
|
|
1067
|
+
max_memory = core.parse_max_memory(max_memory)
|
|
1191
1068
|
self.load_metadata()
|
|
1192
1069
|
num_partitions = self.num_partitions
|
|
1193
1070
|
per_worker_memory = self.get_max_encoding_memory()
|
|
@@ -1229,147 +1106,106 @@ class VcfZarrWriter:
|
|
|
1229
1106
|
pwm.submit(self.encode_partition, partition_index)
|
|
1230
1107
|
|
|
1231
1108
|
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
):
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
worker_processes=worker_processes,
|
|
1279
|
-
show_progress=show_progress,
|
|
1280
|
-
max_memory=max_memory,
|
|
1281
|
-
)
|
|
1282
|
-
vzw.finalise(show_progress)
|
|
1283
|
-
vzw.create_index()
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
def encode_init(
|
|
1287
|
-
icf_path,
|
|
1288
|
-
zarr_path,
|
|
1289
|
-
target_num_partitions,
|
|
1290
|
-
*,
|
|
1291
|
-
schema_path=None,
|
|
1292
|
-
variants_chunk_size=None,
|
|
1293
|
-
samples_chunk_size=None,
|
|
1294
|
-
local_alleles=None,
|
|
1295
|
-
max_variant_chunks=None,
|
|
1296
|
-
dimension_separator=None,
|
|
1297
|
-
max_memory=None,
|
|
1298
|
-
worker_processes=1,
|
|
1299
|
-
show_progress=False,
|
|
1300
|
-
):
|
|
1301
|
-
icf_store = icf.IntermediateColumnarFormat(icf_path)
|
|
1302
|
-
if schema_path is None:
|
|
1303
|
-
schema = VcfZarrSchema.generate(
|
|
1304
|
-
icf_store,
|
|
1305
|
-
variants_chunk_size=variants_chunk_size,
|
|
1306
|
-
samples_chunk_size=samples_chunk_size,
|
|
1307
|
-
local_alleles=local_alleles,
|
|
1308
|
-
)
|
|
1309
|
-
else:
|
|
1310
|
-
logger.info(f"Reading schema from {schema_path}")
|
|
1311
|
-
if variants_chunk_size is not None or samples_chunk_size is not None:
|
|
1109
|
+
class VcfZarr:
    """Read-only handle on a VcfZarr store, used to summarise its arrays."""

    def __init__(self, path):
        """Open the store at path in read mode.

        Args:
            path: Location of the store; a string or any path-like object.

        Raises:
            ValueError: if path does not contain a ".zmetadata" file.
        """
        # Coerce so that plain strings work too; the "/" operator below
        # needs a Path.
        path = pathlib.Path(path)
        if not (path / ".zmetadata").exists():
            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
        self.path = path
        self.root = zarr.open(path, mode="r")

    def summary_table(self):
        """Return one dict of display-ready statistics per array in the store.

        Rows are ordered from the largest to the smallest value reported by
        core.du for the array's directory (presumably its on-disk size).
        """
        data = []
        arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
        arrays.sort(key=lambda x: x[0])
        for stored, array in reversed(arrays):
            d = {
                "name": array.name,
                "dtype": str(array.dtype),
                "stored": core.display_size(stored),
                "size": core.display_size(array.nbytes),
                "ratio": core.display_number(array.nbytes / stored),
                "nchunks": str(array.nchunks),
                "chunk_size": core.display_size(array.nbytes / array.nchunks),
                "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
                "shape": str(array.shape),
                "chunk_shape": str(array.chunks),
                "compressor": str(array.compressor),
                "filters": str(array.filters),
            }
            data.append(d)
        return data
|
|
1137
|
+
|
|
1138
|
+
|
|
1139
|
+
class VcfZarrIndexer:
    """
    Builds the "region_index" array used for efficient region queries in a
    VCF Zarr dataset.
    """

    def __init__(self, path):
        self.path = pathlib.Path(path)

    def create_index(self):
        """Create an index to support efficient region queries.

        Writes a "region_index" array with one row per run of equal contig
        IDs inside each variant chunk, then consolidates the Zarr metadata.

        Raises:
            ValueError: if variant_contig, variant_position or
                variant_length is missing from the store.
        """
        root = zarr.open_group(store=self.path, mode="r+")
        required = ("variant_contig", "variant_position", "variant_length")
        if any(name not in root for name in required):
            raise ValueError(
                "Cannot create index: variant_contig, "
                "variant_position and variant_length arrays are required"
            )

        contig = root["variant_contig"]
        pos = root["variant_position"]
        length = root["variant_length"]

        assert contig.cdata_shape == pos.cdata_shape

        rows = []

        logger.info("Creating region index")
        for chunk_index in range(pos.cdata_shape[0]):
            contig_block = contig.blocks[chunk_index]
            pos_block = pos.blocks[chunk_index]
            end_block = pos_block + length.blocks[chunk_index] - 1

            # Positions where the contig ID changes mark the end of a run;
            # appending -1 makes the final element count as a run end too.
            run_ends = np.nonzero(np.diff(contig_block, append=-1))[0]
            run_start = 0
            for run_end in run_ends:
                assert contig_block[run_start] == contig_block[run_end]
                rows.append(
                    (
                        chunk_index,  # chunk index
                        contig_block[run_start],  # contig ID
                        pos_block[run_start],  # start
                        pos_block[run_end],  # end
                        np.max(end_block[run_start : run_end + 1]),  # max end
                        run_end - run_start + 1,  # num records
                    )
                )
                run_start = run_end + 1

        index = np.array(rows, dtype=pos.dtype)
        # dimension_separator is only passed when not running under Zarr v3.
        kwargs = {} if zarr_utils.zarr_v3() else {"dimension_separator": "/"}
        array = root.array(
            "region_index",
            data=index,
            shape=index.shape,
            chunks=index.shape,
            dtype=index.dtype,
            compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
            fill_value=None,
            **kwargs,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = [
            "region_index_values",
            "region_index_fields",
        ]

        logger.info("Consolidating Zarr metadata")
        zarr.consolidate_metadata(self.path)
|