bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +16 -3
- bio2zarr/cli.py +102 -22
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +316 -189
- bio2zarr/tskit.py +296 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +606 -114
- bio2zarr/vcf_utils.py +12 -11
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +568 -739
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- bio2zarr/zarr_utils.py +169 -2
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/METADATA +23 -8
- bio2zarr-0.1.7.dist-info/RECORD +21 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.5.dist-info/RECORD +0 -21
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/top_level.txt +0 -0
bio2zarr/{vcf2zarr/vcz.py → vcz.py}

```diff
--- bio2zarr/vcf2zarr/vcz.py
+++ bio2zarr/vcz.py
@@ -1,41 +1,29 @@
-import
+import abc
 import dataclasses
 import json
 import logging
 import os
-import os.path
 import pathlib
 import shutil
-import tempfile
 
-import humanfriendly
 import numcodecs
 import numpy as np
 import zarr
 
-from bio2zarr
-
-from .. import constants, core, provenance
-from . import icf
+from bio2zarr import constants, core, provenance, zarr_utils
 
 logger = logging.getLogger(__name__)
 
-
-
-
-    if not path.exists():
-        raise ValueError(f"Path not found: {path}")
-    if (path / "metadata.json").exists():
-        obj = icf.IntermediateColumnarFormat(path)
-    # NOTE: this is too strict, we should support more general Zarrs, see #276
-    elif (path / ".zmetadata").exists():
-        obj = VcfZarr(path)
-    else:
-        raise ValueError(f"{path} not in ICF or VCF Zarr format")
-    return obj.summary_table()
-
-
+ZARR_SCHEMA_FORMAT_VERSION = "0.6"
+DEFAULT_VARIANT_CHUNK_SIZE = 1000
+DEFAULT_SAMPLE_CHUNK_SIZE = 10_000
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+)
+DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+)
 
 _fixed_field_descriptions = {
     "variant_contig": "An identifier from the reference genome or an angle-bracketed ID"
```
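The new genotype/bool compressor defaults make Blosc bit-shuffle explicit (previously shuffle was applied by a compressor-mutating method on `ZarrArraySpec`, removed further down). A small sketch of the config these constants serialise to; the printed dict is what `numcodecs` typically reports, so treat it as indicative:

```python
import numcodecs

# Same configuration as the new DEFAULT_ZARR_COMPRESSOR_GENOTYPES constant.
compressor = numcodecs.Blosc(
    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
)
# get_config() returns the JSON-serialisable form stored in schema defaults.
print(compressor.get_config())
# e.g. {'id': 'blosc', 'cname': 'zstd', 'clevel': 7, 'shuffle': 2, 'blocksize': 0}
```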
```diff
@@ -49,131 +37,254 @@ _fixed_field_descriptions = {
 }
 
 
+@dataclasses.dataclass
+class VariantData:
+    """Represents variant data returned by iter_alleles_and_genotypes."""
+
+    variant_length: int
+    alleles: np.ndarray
+    genotypes: np.ndarray
+    phased: np.ndarray
+
+
+class Source(abc.ABC):
+    @property
+    @abc.abstractmethod
+    def path(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def num_records(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def num_samples(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def samples(self):
+        pass
+
+    @property
+    def contigs(self):
+        return None
+
+    @property
+    def filters(self):
+        return None
+
+    @property
+    def root_attrs(self):
+        return {}
+
+    @abc.abstractmethod
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        pass
+
+    def iter_id(self, start, stop):
+        return
+
+    def iter_contig(self, start, stop):
+        return
+
+    @abc.abstractmethod
+    def iter_field(self, field_name, shape, start, stop):
+        pass
+
+    @abc.abstractmethod
+    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
+        pass
+
+
+@dataclasses.dataclass
+class VcfZarrDimension:
+    size: int
+    chunk_size: int
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @classmethod
+    def fromdict(cls, d):
+        return cls(**d)
+
+    @classmethod
+    def unchunked(cls, size):
+        return cls(size, max(size, 1))
+
+
+def standard_dimensions(
+    *,
+    variants_size,
+    samples_size,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    alleles_size=None,
+    filters_size=None,
+    ploidy_size=None,
+    genotypes_size=None,
+):
+    """
+    Returns a dictionary mapping dimension names to definition for the standard
+    fields in a VCF.
+    """
+    if variants_chunk_size is None:
+        variants_chunk_size = max(1, min(variants_size, DEFAULT_VARIANT_CHUNK_SIZE))
+    if samples_chunk_size is None:
+        samples_chunk_size = max(1, min(samples_size, DEFAULT_SAMPLE_CHUNK_SIZE))
+
+    dimensions = {
+        "variants": VcfZarrDimension(variants_size, variants_chunk_size),
+        "samples": VcfZarrDimension(samples_size, samples_chunk_size),
+    }
+
+    if alleles_size is not None:
+        dimensions["alleles"] = VcfZarrDimension.unchunked(alleles_size)
+        if alleles_size > 1:
+            dimensions["alt_alleles"] = VcfZarrDimension.unchunked(alleles_size - 1)
+
+    if filters_size is not None:
+        dimensions["filters"] = VcfZarrDimension.unchunked(filters_size)
+
+    if ploidy_size is not None:
+        dimensions["ploidy"] = VcfZarrDimension.unchunked(ploidy_size)
+
+    if genotypes_size is not None:
+        dimensions["genotypes"] = VcfZarrDimension.unchunked(genotypes_size)
+
+    return dimensions
+
+
 @dataclasses.dataclass
 class ZarrArraySpec:
     name: str
     dtype: str
-    shape: tuple
-    chunks: tuple
     dimensions: tuple
     description: str
-
-
-
+    compressor: dict = None
+    filters: list = None
+    source: str = None
 
     def __post_init__(self):
         if self.name in _fixed_field_descriptions:
             self.description = self.description or _fixed_field_descriptions[self.name]
 
-        # Ensure these are tuples for ease of comparison and consistency
-        self.shape = tuple(self.shape)
-        self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
-        self.filters = tuple(self.filters)
+        self.filters = tuple(self.filters) if self.filters is not None else None
 
-
-
-
-
-        )
-
-
+    def get_shape(self, schema):
+        return schema.get_shape(self.dimensions)
+
+    def get_chunks(self, schema):
+        return schema.get_chunks(self.dimensions)
+
+    def get_chunk_nbytes(self, schema):
+        element_size = np.dtype(self.dtype).itemsize
+        chunks = self.get_chunks(schema)
+        shape = self.get_shape(schema)
+
+        # Calculate actual chunk size accounting for dimension limits
+        items = 1
+        for i, chunk_size in enumerate(chunks):
+            items *= min(chunk_size, shape[i])
+
+        # Include sizes for extra dimensions (if any)
+        if len(shape) > len(chunks):
+            for size in shape[len(chunks) :]:
+                items *= size
+
+        return element_size * items
 
     @staticmethod
     def from_field(
         vcf_field,
+        schema,
         *,
-        num_variants,
-        num_samples,
-        variants_chunk_size,
-        samples_chunk_size,
         array_name=None,
+        compressor=None,
+        filters=None,
     ):
-        shape = [num_variants]
         prefix = "variant_"
         dimensions = ["variants"]
-        chunks = [variants_chunk_size]
         if vcf_field.category == "FORMAT":
             prefix = "call_"
-            shape.append(num_samples)
-            chunks.append(samples_chunk_size)
             dimensions.append("samples")
         if array_name is None:
             array_name = prefix + vcf_field.name
-
-
-
-
-
-
-
+
+        max_number = vcf_field.max_number
+        if vcf_field.vcf_number == "R":
+            max_alleles = schema.dimensions["alleles"].size
+            if max_number > max_alleles:
+                raise ValueError(
+                    f"Max number of values {max_number} exceeds max alleles "
+                    f"{max_alleles} for {vcf_field.full_name}"
+                )
+            if max_alleles > 0:
                 dimensions.append("alleles")
-
+        elif vcf_field.vcf_number == "A":
+            max_alt_alleles = schema.dimensions["alt_alleles"].size
+            if max_number > max_alt_alleles:
+                raise ValueError(
+                    f"Max number of values {max_number} exceeds max alt alleles "
+                    f"{max_alt_alleles} for {vcf_field.full_name}"
+                )
+            if max_alt_alleles > 0:
                 dimensions.append("alt_alleles")
-
+        elif vcf_field.vcf_number == "G":
+            max_genotypes = schema.dimensions["genotypes"].size
+            if max_number > max_genotypes:
+                raise ValueError(
+                    f"Max number of values {max_number} exceeds max genotypes "
+                    f"{max_genotypes} for {vcf_field.full_name}"
+                )
+            if max_genotypes > 0:
                 dimensions.append("genotypes")
-
-
-
-
+        elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
+            dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
+            if dimensions[-1] not in schema.dimensions:
+                schema.dimensions[dimensions[-1]] = VcfZarrDimension.unchunked(
+                    vcf_field.max_number
+                )
+
+        return ZarrArraySpec(
+            source=vcf_field.full_name,
             name=array_name,
             dtype=vcf_field.smallest_dtype(),
-            shape=shape,
-            chunks=chunks,
             dimensions=dimensions,
             description=vcf_field.description,
+            compressor=compressor,
+            filters=filters,
         )
 
-    def
-        """
-        Choose compressor and filter settings based on the size and
-        type of the array, plus some hueristics from observed properties
-        of VCFs.
-
-        See https://github.com/pystatgen/bio2zarr/discussions/74
-        """
-        # Default is to not shuffle, because autoshuffle isn't recognised
-        # by many Zarr implementations, and shuffling can lead to worse
-        # performance in some cases anyway. Turning on shuffle should be a
-        # deliberate choice.
-        shuffle = numcodecs.Blosc.NOSHUFFLE
-        if self.name == "call_genotype" and self.dtype == "i1":
-            # call_genotype gets BITSHUFFLE by default as it gets
-            # significantly better compression (at a cost of slower
-            # decoding)
-            shuffle = numcodecs.Blosc.BITSHUFFLE
-        elif self.dtype == "bool":
-            shuffle = numcodecs.Blosc.BITSHUFFLE
-
-        self.compressor["shuffle"] = shuffle
-
-    @property
-    def chunk_nbytes(self):
+    def chunk_nbytes(self, schema):
         """
         Returns the nbytes for a single chunk in this array.
         """
         items = 1
         dim = 0
-        for chunk_size in self.
-            size = min(chunk_size, self.
+        for chunk_size in self.get_chunks(schema):
+            size = min(chunk_size, self.get_shape(schema)[dim])
             items *= size
             dim += 1
         # Include sizes for extra dimensions.
-        for size in self.
+        for size in self.get_shape(schema)[dim:]:
             items *= size
         dt = np.dtype(self.dtype)
         return items * dt.itemsize
 
-
-    def variant_chunk_nbytes(self):
+    def variant_chunk_nbytes(self, schema):
         """
         Returns the nbytes for a single variant chunk of this array.
         """
-        chunk_items = self.
-        for size in self.
+        chunk_items = self.get_chunks(schema)[0]
+        for size in self.get_shape(schema)[1:]:
             chunk_items *= size
         dt = np.dtype(self.dtype)
-        if dt.kind ==
+        if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions:
             logger.warning(
                 f"Field {self.name} is a string; max memory usage may "
                 "be a significant underestimate"
```
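This hunk replaces per-array `shape`/`chunks` tuples with named dimensions resolved through the schema. A minimal sketch of the new dimension helpers, assuming the definitions above are importable as `bio2zarr.vcz`:

```python
from bio2zarr import vcz

dims = vcz.standard_dimensions(
    variants_size=20_000,
    samples_size=500,
    alleles_size=4,
    filters_size=2,
    ploidy_size=2,
)
# "variants" and "samples" get the default chunk sizes (1000 and 10_000),
# capped at the dimension size; the other dimensions are unchunked.
assert dims["variants"].chunk_size == 1000
assert dims["samples"].chunk_size == 500
assert dims["alt_alleles"].size == 3
```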
```diff
@@ -181,87 +292,71 @@ class ZarrArraySpec:
         return chunk_items * dt.itemsize
 
 
-
+@dataclasses.dataclass
+class Contig:
+    id: str
+    length: int = None
 
 
-
-
-
-
-
-
-
-
-
-    stored.
-    """
-    fields_by_name = {field.name: field for field in fields}
-    gt = fields_by_name["call_genotype"]
-    if gt.shape[-1] != 2:
-        raise ValueError("Local alleles only supported on diploid data")
-
-    # TODO check if LA is already in here
-
-    shape = gt.shape[:-1]
-    chunks = gt.chunks[:-1]
-    dimensions = gt.dimensions[:-1]
-
-    la = ZarrArraySpec.new(
-        vcf_field=None,
-        name="call_LA",
-        dtype="i1",
-        shape=gt.shape,
-        chunks=gt.chunks,
-        dimensions=(*dimensions, "local_alleles"),
-        description=(
-            "0-based indices into REF+ALT, indicating which alleles"
-            " are relevant (local) for the current sample"
-        ),
-    )
-    ad = fields_by_name.get("call_AD", None)
-    if ad is not None:
-        # TODO check if call_LAD is in the list already
-        ad.name = "call_LAD"
-        ad.vcf_field = None
-        ad.shape = (*shape, 2)
-        ad.chunks = (*chunks, 2)
-        ad.dimensions = (*dimensions, "local_alleles")
-        ad.description += " (local-alleles)"
-
-    pl = fields_by_name.get("call_PL", None)
-    if pl is not None:
-        # TODO check if call_LPL is in the list already
-        pl.name = "call_LPL"
-        pl.vcf_field = None
-        pl.shape = (*shape, 3)
-        pl.chunks = (*chunks, 3)
-        pl.description += " (local-alleles)"
-        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
-    return [*fields, la]
+@dataclasses.dataclass
+class Sample:
+    id: str
+
+
+@dataclasses.dataclass
+class Filter:
+    id: str
+    description: str = ""
 
 
 @dataclasses.dataclass
 class VcfZarrSchema(core.JsonDataclass):
     format_version: str
-
-    variants_chunk_size: int
-    samples: list
-    contigs: list
-    filters: list
+    dimensions: dict
     fields: list
+    defaults: dict
+
+    def __init__(
+        self,
+        format_version: str,
+        fields: list,
+        dimensions: dict,
+        defaults: dict = None,
+    ):
+        self.format_version = format_version
+        self.fields = fields
+        defaults = defaults.copy() if defaults is not None else {}
+        if defaults.get("compressor", None) is None:
+            defaults["compressor"] = DEFAULT_ZARR_COMPRESSOR.get_config()
+        if defaults.get("filters", None) is None:
+            defaults["filters"] = []
+        self.defaults = defaults
+        self.dimensions = dimensions
+
+    def get_shape(self, dimensions):
+        return [self.dimensions[dim].size for dim in dimensions]
+
+    def get_chunks(self, dimensions):
+        return [self.dimensions[dim].chunk_size for dim in dimensions]
 
     def validate(self):
         """
         Checks that the schema is well-formed and within required limits.
         """
         for field in self.fields:
+            for dim in field.dimensions:
+                if dim not in self.dimensions:
+                    raise ValueError(
+                        f"Dimension '{dim}' used in field '{field.name}' is "
+                        "not defined in the schema"
+                    )
+
+            chunk_nbytes = field.get_chunk_nbytes(self)
             # This is the Blosc max buffer size
-            if
-            # TODO add some links to documentation here advising how to
-            # deal with PL values.
+            if chunk_nbytes > 2147483647:
                 raise ValueError(
                     f"Field {field.name} chunks are too large "
-                    f"({
+                    f"({chunk_nbytes} > 2**31 - 1 bytes). "
                     "Either generate a schema and drop this field (if you don't "
                     "need it) or reduce the variant or sample chunk sizes."
                 )
```
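With dimensions centralised, shapes and chunking are resolved through the schema rather than stored per array. A sketch, under the same `bio2zarr.vcz` import assumption as above:

```python
from bio2zarr import vcz

dims = vcz.standard_dimensions(variants_size=20_000, samples_size=500)
schema = vcz.VcfZarrSchema(
    format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
    fields=[],
    dimensions=dims,
)
# Shapes and chunks are now looked up in the schema's dimension table.
assert schema.get_shape(["variants", "samples"]) == [20_000, 500]
assert schema.get_chunks(["variants", "samples"]) == [1000, 500]
# Unset defaults are filled with the module-level compressor and no filters.
assert schema.defaults["filters"] == []
```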
```diff
@@ -278,253 +373,30 @@ class VcfZarrSchema(core.JsonDataclass):
                 "Zarr schema format version mismatch: "
                 f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
             )
+
         ret = VcfZarrSchema(**d)
-        ret.samples = [icf.Sample(**sd) for sd in d["samples"]]
-        ret.contigs = [icf.Contig(**sd) for sd in d["contigs"]]
-        ret.filters = [icf.Filter(**sd) for sd in d["filters"]]
         ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
+        ret.dimensions = {
+            k: VcfZarrDimension.fromdict(v) for k, v in d["dimensions"].items()
+        }
+
         return ret
 
     @staticmethod
     def fromjson(s):
         return VcfZarrSchema.fromdict(json.loads(s))
 
-    @staticmethod
-    def generate(
-        icf, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
-    ):
-        m = icf.num_records
-        n = icf.num_samples
-        if samples_chunk_size is None:
-            samples_chunk_size = 10_000
-        if variants_chunk_size is None:
-            variants_chunk_size = 1000
-        if local_alleles is None:
-            local_alleles = False
-        logger.info(
-            f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
-        )
-
-        def spec_from_field(field, array_name=None):
-            return ZarrArraySpec.from_field(
-                field,
-                num_samples=n,
-                num_variants=m,
-                samples_chunk_size=samples_chunk_size,
-                variants_chunk_size=variants_chunk_size,
-                array_name=array_name,
-            )
 
-
-
-
-
-
-
-
-
-
-
-                name=name,
-                dtype=dtype,
-                shape=shape,
-                description="",
-                dimensions=dimensions,
-                chunks=chunks or [variants_chunk_size],
-            )
-
-        alt_field = icf.fields["ALT"]
-        max_alleles = alt_field.vcf_field.summary.max_number + 1
-
-        array_specs = [
-            fixed_field_spec(
-                name="variant_contig",
-                dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
-            ),
-            fixed_field_spec(
-                name="variant_filter",
-                dtype="bool",
-                shape=(m, icf.metadata.num_filters),
-                dimensions=["variants", "filters"],
-                chunks=(variants_chunk_size, icf.metadata.num_filters),
-            ),
-            fixed_field_spec(
-                name="variant_allele",
-                dtype="O",
-                shape=(m, max_alleles),
-                dimensions=["variants", "alleles"],
-                chunks=(variants_chunk_size, max_alleles),
-            ),
-            fixed_field_spec(
-                name="variant_id",
-                dtype="O",
-            ),
-            fixed_field_spec(
-                name="variant_id_mask",
-                dtype="bool",
-            ),
-        ]
-        name_map = {field.full_name: field for field in icf.metadata.fields}
-
-        # Only three of the fixed fields have a direct one-to-one mapping.
-        array_specs.extend(
-            [
-                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
-                spec_from_field(name_map["POS"], array_name="variant_position"),
-                spec_from_field(name_map["rlen"], array_name="variant_length"),
-            ]
-        )
-        array_specs.extend(
-            [spec_from_field(field) for field in icf.metadata.info_fields]
-        )
-
-        gt_field = None
-        for field in icf.metadata.format_fields:
-            if field.name == "GT":
-                gt_field = field
-                continue
-            array_specs.append(spec_from_field(field))
-
-        if gt_field is not None and n > 0:
-            ploidy = max(gt_field.summary.max_number - 1, 1)
-            shape = [m, n]
-            chunks = [variants_chunk_size, samples_chunk_size]
-            dimensions = ["variants", "samples"]
-            array_specs.append(
-                ZarrArraySpec.new(
-                    vcf_field=None,
-                    name="call_genotype_phased",
-                    dtype="bool",
-                    shape=list(shape),
-                    chunks=list(chunks),
-                    dimensions=list(dimensions),
-                    description="",
-                )
-            )
-            shape += [ploidy]
-            chunks += [ploidy]
-            dimensions += ["ploidy"]
-            array_specs.append(
-                ZarrArraySpec.new(
-                    vcf_field=None,
-                    name="call_genotype",
-                    dtype=gt_field.smallest_dtype(),
-                    shape=list(shape),
-                    chunks=list(chunks),
-                    dimensions=list(dimensions),
-                    description="",
-                )
-            )
-            array_specs.append(
-                ZarrArraySpec.new(
-                    vcf_field=None,
-                    name="call_genotype_mask",
-                    dtype="bool",
-                    shape=list(shape),
-                    chunks=list(chunks),
-                    dimensions=list(dimensions),
-                    description="",
-                )
-            )
-
-        if local_alleles:
-            array_specs = convert_local_allele_field_types(array_specs)
-
-        return VcfZarrSchema(
-            format_version=ZARR_SCHEMA_FORMAT_VERSION,
-            samples_chunk_size=samples_chunk_size,
-            variants_chunk_size=variants_chunk_size,
-            fields=array_specs,
-            samples=icf.metadata.samples,
-            contigs=icf.metadata.contigs,
-            filters=icf.metadata.filters,
-        )
-
-
-class VcfZarr:
-    def __init__(self, path):
-        if not (path / ".zmetadata").exists():
-            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
-        self.path = path
-        self.root = zarr.open(path, mode="r")
-
-    def summary_table(self):
-        data = []
-        arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
-        arrays.sort(key=lambda x: x[0])
-        for stored, array in reversed(arrays):
-            d = {
-                "name": array.name,
-                "dtype": str(array.dtype),
-                "stored": core.display_size(stored),
-                "size": core.display_size(array.nbytes),
-                "ratio": core.display_number(array.nbytes / stored),
-                "nchunks": str(array.nchunks),
-                "chunk_size": core.display_size(array.nbytes / array.nchunks),
-                "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
-                "shape": str(array.shape),
-                "chunk_shape": str(array.chunks),
-                "compressor": str(array.compressor),
-                "filters": str(array.filters),
-            }
-            data.append(d)
-        return data
-
-
-def parse_max_memory(max_memory):
-    if max_memory is None:
-        # Effectively unbounded
-        return 2**63
-    if isinstance(max_memory, str):
-        max_memory = humanfriendly.parse_size(max_memory)
-    logger.info(f"Set memory budget to {core.display_size(max_memory)}")
-    return max_memory
-
-
-@dataclasses.dataclass
-class VcfZarrPartition:
-    start: int
-    stop: int
-
-    @staticmethod
-    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
-        num_chunks = int(np.ceil(num_records / chunk_size))
-        if max_chunks is not None:
-            num_chunks = min(num_chunks, max_chunks)
-        partitions = []
-        splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
-        for chunk_slice in splits:
-            start_chunk = int(chunk_slice[0])
-            stop_chunk = int(chunk_slice[-1]) + 1
-            start_index = start_chunk * chunk_size
-            stop_index = min(stop_chunk * chunk_size, num_records)
-            partitions.append(VcfZarrPartition(start_index, stop_index))
-        return partitions
-
-
-VZW_METADATA_FORMAT_VERSION = "0.1"
-
-
-@dataclasses.dataclass
-class VcfZarrWriterMetadata(core.JsonDataclass):
-    format_version: str
-    icf_path: str
-    schema: VcfZarrSchema
-    dimension_separator: str
-    partitions: list
-    provenance: dict
-
-    @staticmethod
-    def fromdict(d):
-        if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
-            raise ValueError(
-                "VcfZarrWriter format version mismatch: "
-                f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
-            )
-        ret = VcfZarrWriterMetadata(**d)
-        ret.schema = VcfZarrSchema.fromdict(ret.schema)
-        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
-        return ret
+def sanitise_int_array(value, ndmin, dtype):
+    if isinstance(value, tuple):
+        value = [
+            constants.VCF_INT_MISSING if x is None else x for x in value
+        ]  # NEEDS TEST
+    value = np.array(value, ndmin=ndmin, copy=True)
+    value[value == constants.VCF_INT_MISSING] = -1
+    value[value == constants.VCF_INT_FILL] = -2
+    # TODO watch out for clipping here!
+    return value.astype(dtype)
 
 
 def compute_la_field(genotypes):
```
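The new `sanitise_int_array` helper maps the VCF missing/fill sentinels onto the -1/-2 encoding used in the Zarr output. A small sketch of the behaviour, assuming `constants.VCF_INT_MISSING` and `constants.VCF_INT_FILL` are the integer sentinels referenced above:

```python
import numpy as np

from bio2zarr import constants, vcz

raw = np.array(
    [10, constants.VCF_INT_MISSING, constants.VCF_INT_FILL], dtype=np.int32
)
clean = vcz.sanitise_int_array(raw, ndmin=1, dtype="i4")
# Missing becomes -1; fill (end-of-list padding) becomes -2.
assert clean.tolist() == [10, -1, -2]
```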
```diff
@@ -597,14 +469,60 @@ class LocalisableFieldDescriptor:
 
 localisable_fields = [
     LocalisableFieldDescriptor(
-        "call_LAD", "FORMAT/AD",
+        "call_LAD", "FORMAT/AD", sanitise_int_array, compute_lad_field
     ),
     LocalisableFieldDescriptor(
-        "call_LPL", "FORMAT/PL",
+        "call_LPL", "FORMAT/PL", sanitise_int_array, compute_lpl_field
     ),
 ]
 
 
+@dataclasses.dataclass
+class VcfZarrPartition:
+    start: int
+    stop: int
+
+    @staticmethod
+    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
+        num_chunks = int(np.ceil(num_records / chunk_size))
+        if max_chunks is not None:
+            num_chunks = min(num_chunks, max_chunks)
+        partitions = []
+        splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
+        for chunk_slice in splits:
+            start_chunk = int(chunk_slice[0])
+            stop_chunk = int(chunk_slice[-1]) + 1
+            start_index = start_chunk * chunk_size
+            stop_index = min(stop_chunk * chunk_size, num_records)
+            partitions.append(VcfZarrPartition(start_index, stop_index))
+        return partitions
+
+
+VZW_METADATA_FORMAT_VERSION = "0.1"
+
+
+@dataclasses.dataclass
+class VcfZarrWriterMetadata(core.JsonDataclass):
+    format_version: str
+    source_path: str
+    schema: VcfZarrSchema
+    dimension_separator: str
+    partitions: list
+    provenance: dict
+
+    @staticmethod
+    def fromdict(d):
+        if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
+            raise ValueError(
+                "VcfZarrWriter format version mismatch: "
+                f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
+            )
+        ret = VcfZarrWriterMetadata(**d)
+        ret.schema = VcfZarrSchema.fromdict(ret.schema)
+        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
+        return ret
+
+
 @dataclasses.dataclass
 class VcfZarrWriteSummary(core.JsonDataclass):
     num_partitions: int
```
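`VcfZarrPartition` itself is unchanged, just relocated (the writer metadata now records a generic `source_path` instead of `icf_path`). A worked example of `generate_partitions`, following the arithmetic in the code above:

```python
from bio2zarr import vcz

# 10 records in chunks of 3 -> 4 chunks, split across at most 2 partitions.
parts = vcz.VcfZarrPartition.generate_partitions(
    num_records=10, chunk_size=3, num_partitions=2
)
# Partition boundaries always fall on chunk boundaries, and the final
# partition is clipped to num_records.
assert [(p.start, p.stop) for p in parts] == [(0, 6), (6, 10)]
```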
```diff
@@ -615,13 +533,14 @@ class VcfZarrWriteSummary(core.JsonDataclass):
 
 
 class VcfZarrWriter:
-    def __init__(self, path):
+    def __init__(self, source_type, path):
+        self.source_type = source_type
         self.path = pathlib.Path(path)
         self.wip_path = self.path / "wip"
         self.arrays_path = self.wip_path / "arrays"
         self.partitions_path = self.wip_path / "partitions"
         self.metadata = None
-        self.
+        self.source = None
 
     @property
     def schema(self):
@@ -639,7 +558,7 @@ class VcfZarrWriter:
 
     def has_local_alleles(self):
         for field in self.schema.fields:
-            if field.name == "call_LA" and field.
+            if field.name == "call_LA" and field.source is None:
                 return True
         return False
 
@@ -649,20 +568,20 @@ class VcfZarrWriter:
 
     def init(
         self,
-
+        source,
         *,
         target_num_partitions,
         schema,
         dimension_separator=None,
         max_variant_chunks=None,
     ):
-        self.
+        self.source = source
         if self.path.exists():
             raise ValueError("Zarr path already exists")  # NEEDS TEST
         schema.validate()
         partitions = VcfZarrPartition.generate_partitions(
-            self.
-            schema.
+            self.source.num_records,
+            schema.get_chunks(["variants"])[0],
             target_num_partitions,
             max_chunks=max_variant_chunks,
         )
@@ -673,7 +592,7 @@ class VcfZarrWriter:
         )
         self.metadata = VcfZarrWriterMetadata(
             format_version=VZW_METADATA_FORMAT_VERSION,
-
+            source_path=str(self.source.path),
             schema=schema,
             dimension_separator=dimension_separator,
             partitions=partitions,
@@ -682,27 +601,32 @@ class VcfZarrWriter:
         )
 
         self.path.mkdir()
-        root = zarr.open(store=self.path, mode="a", **ZARR_FORMAT_KWARGS)
+        root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS)
         root.attrs.update(
             {
-                "vcf_zarr_version": "0.
-                "vcf_header": self.icf.vcf_header,
+                "vcf_zarr_version": "0.4",
                 "source": f"bio2zarr-{provenance.__version__}",
             }
         )
-
+        root.attrs.update(self.source.root_attrs)
+
+        # Doing this synchronously - this is fine surely
         self.encode_samples(root)
-        self.
-
+        if self.source.filters is not None:
+            self.encode_filters(root)
+        if self.source.contigs is not None:
+            self.encode_contigs(root)
 
         self.wip_path.mkdir()
         self.arrays_path.mkdir()
         self.partitions_path.mkdir()
-        root = zarr.open(
+        root = zarr.open(
+            store=self.arrays_path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS
+        )
 
         total_chunks = 0
         for field in self.schema.fields:
-            a = self.init_array(root, field, partitions[-1].stop)
+            a = self.init_array(root, self.metadata.schema, field, partitions[-1].stop)
             total_chunks += a.nchunks
 
         logger.info("Writing WIP metadata")
```
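The writer is now parameterised by the `Source` subclass it should rehydrate from metadata (see the `load_metadata` change further down), rather than being hard-wired to the ICF format. A hedged sketch of the shape of the new API; `MySource` is purely illustrative:

```python
from bio2zarr import vcz

class MySource(vcz.Source):
    # Hypothetical minimal source; a real implementation must provide every
    # abstract member of vcz.Source (path, num_records, num_samples, samples,
    # iter_alleles_and_genotypes, iter_field, generate_schema).
    ...

# The writer remembers which Source subclass to reconstruct from
# wip/metadata.json when encode_partition() later runs in a worker process.
writer = vcz.VcfZarrWriter(MySource, "out.vcz")
```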
```diff
@@ -710,89 +634,111 @@ class VcfZarrWriter:
             json.dump(self.metadata.asdict(), f, indent=4)
 
         return VcfZarrWriteSummary(
-            num_variants=self.
-            num_samples=self.
+            num_variants=self.source.num_records,
+            num_samples=self.source.num_samples,
             num_partitions=self.num_partitions,
             num_chunks=total_chunks,
             max_encoding_memory=core.display_size(self.get_max_encoding_memory()),
         )
 
     def encode_samples(self, root):
-
-
-
+        samples = self.source.samples
+        zarr_utils.create_group_array(
+            root,
             "sample_id",
-            data=[sample.id for sample in
-            shape=len(
+            data=[sample.id for sample in samples],
+            shape=len(samples),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
-            chunks=(self.schema.
+            chunks=(self.schema.get_chunks(["samples"])[0],),
+            dimension_names=["samples"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")
 
-    def
-
+    def encode_contigs(self, root):
+        contigs = self.source.contigs
+        zarr_utils.create_group_array(
+            root,
             "contig_id",
-            data=[contig.id for contig in
-            shape=len(
+            data=[contig.id for contig in contigs],
+            shape=len(contigs),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["contigs"],
         )
-
-
-
+        if all(contig.length is not None for contig in contigs):
+            zarr_utils.create_group_array(
+                root,
                 "contig_length",
-                data=[contig.length for contig in
-                shape=len(
+                data=[contig.length for contig in contigs],
+                shape=len(contigs),
                 dtype=np.int64,
                 compressor=DEFAULT_ZARR_COMPRESSOR,
+                dimension_names=["contigs"],
             )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
 
-    def
-
-
-
+    def encode_filters(self, root):
+        filters = self.source.filters
+        zarr_utils.create_group_array(
+            root,
             "filter_id",
-            data=[filt.id for filt in
-            shape=len(
+            data=[filt.id for filt in filters],
+            shape=len(filters),
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["filters"],
+        )
+        zarr_utils.create_group_array(
+            root,
+            "filter_description",
+            data=[filt.description for filt in filters],
+            shape=len(filters),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["filters"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
 
-    def init_array(self, root, array_spec, variants_dim_size):
-        kwargs = dict(ZARR_FORMAT_KWARGS)
-        filters =
-
-        if
+    def init_array(self, root, schema, array_spec, variants_dim_size):
+        kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
+        filters = (
+            array_spec.filters
+            if array_spec.filters is not None
+            else schema.defaults["filters"]
+        )
+        filters = [numcodecs.get_codec(filt) for filt in filters]
+        compressor = (
+            array_spec.compressor
+            if array_spec.compressor is not None
+            else schema.defaults["compressor"]
+        )
+        compressor = numcodecs.get_codec(compressor)
+        if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME:
+            if zarr_utils.zarr_v3():
                 filters = [*list(filters), numcodecs.VLenUTF8()]
             else:
                 kwargs["object_codec"] = numcodecs.VLenUTF8()
 
-        if
+        if zarr_utils.zarr_v3():
+            # see https://github.com/zarr-developers/zarr-python/issues/3197
+            kwargs["fill_value"] = None
+        else:
             kwargs["dimension_separator"] = self.metadata.dimension_separator
 
-        shape =
-        # Truncate the variants dimension
+        shape = schema.get_shape(array_spec.dimensions)
+        # Truncate the variants dimension if max_variant_chunks was specified
         shape[0] = variants_dim_size
-        a =
+        a = zarr_utils.create_empty_group_array(
+            root,
             name=array_spec.name,
             shape=shape,
-            chunks=array_spec.
+            chunks=schema.get_chunks(array_spec.dimensions),
             dtype=array_spec.dtype,
-            compressor=
+            compressor=compressor,
             filters=filters,
+            dimension_names=array_spec.dimensions,
             **kwargs,
         )
-        a.attrs.update(
-            {
-                "description": array_spec.description,
-                # Dimension names are part of the spec in Zarr v3
-                "_ARRAY_DIMENSIONS": array_spec.dimensions,
-            }
-        )
+        a.attrs.update({"description": array_spec.description})
        logger.debug(f"Initialised {a}")
         return a
 
```
|
|
|
804
750
|
if self.metadata is None:
|
|
805
751
|
with open(self.wip_path / "metadata.json") as f:
|
|
806
752
|
self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
|
|
807
|
-
self.
|
|
753
|
+
self.source = self.source_type(self.metadata.source_path)
|
|
808
754
|
|
|
809
755
|
def partition_path(self, partition_index):
|
|
810
756
|
return self.partitions_path / f"p{partition_index}"
|
|
@@ -826,15 +772,18 @@ class VcfZarrWriter:
|
|
|
826
772
|
partition_path.mkdir(exist_ok=True)
|
|
827
773
|
logger.info(f"Encoding partition {partition_index} to {partition_path}")
|
|
828
774
|
|
|
829
|
-
self.
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
775
|
+
all_field_names = [field.name for field in self.schema.fields]
|
|
776
|
+
if "variant_id" in all_field_names:
|
|
777
|
+
self.encode_id_partition(partition_index)
|
|
778
|
+
if "variant_filter" in all_field_names:
|
|
779
|
+
self.encode_filters_partition(partition_index)
|
|
780
|
+
if "variant_contig" in all_field_names:
|
|
781
|
+
self.encode_contig_partition(partition_index)
|
|
782
|
+
self.encode_alleles_and_genotypes_partition(partition_index)
|
|
833
783
|
for array_spec in self.schema.fields:
|
|
834
|
-
if array_spec.
|
|
784
|
+
if array_spec.source is not None:
|
|
835
785
|
self.encode_array_partition(array_spec, partition_index)
|
|
836
786
|
if self.has_genotypes():
|
|
837
|
-
self.encode_genotypes_partition(partition_index)
|
|
838
787
|
self.encode_genotype_mask_partition(partition_index)
|
|
839
788
|
if self.has_local_alleles():
|
|
840
789
|
self.encode_local_alleles_partition(partition_index)
|
|
@@ -874,34 +823,48 @@ class VcfZarrWriter:
|
|
|
874
823
|
def encode_array_partition(self, array_spec, partition_index):
|
|
875
824
|
partition = self.metadata.partitions[partition_index]
|
|
876
825
|
ba = self.init_partition_array(partition_index, array_spec.name)
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
826
|
+
for value in self.source.iter_field(
|
|
827
|
+
array_spec.source,
|
|
828
|
+
ba.buff.shape[1:],
|
|
829
|
+
partition.start,
|
|
830
|
+
partition.stop,
|
|
831
|
+
):
|
|
883
832
|
j = ba.next_buffer_row()
|
|
884
|
-
|
|
833
|
+
ba.buff[j] = value
|
|
834
|
+
|
|
885
835
|
self.finalise_partition_array(partition_index, ba)
|
|
886
836
|
|
|
887
|
-
def
|
|
837
|
+
def encode_alleles_and_genotypes_partition(self, partition_index):
|
|
888
838
|
partition = self.metadata.partitions[partition_index]
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
)
|
|
898
|
-
j = gt_phased.next_buffer_row()
|
|
899
|
-
icf.sanitise_value_int_1d(
|
|
900
|
-
gt_phased.buff, j, value[:, -1] if value is not None else None
|
|
839
|
+
alleles = self.init_partition_array(partition_index, "variant_allele")
|
|
840
|
+
variant_lengths = self.init_partition_array(partition_index, "variant_length")
|
|
841
|
+
has_gt = self.has_genotypes()
|
|
842
|
+
shape = None
|
|
843
|
+
if has_gt:
|
|
844
|
+
gt = self.init_partition_array(partition_index, "call_genotype")
|
|
845
|
+
gt_phased = self.init_partition_array(
|
|
846
|
+
partition_index, "call_genotype_phased"
|
|
901
847
|
)
|
|
848
|
+
shape = gt.buff.shape[1:]
|
|
902
849
|
|
|
903
|
-
self.
|
|
904
|
-
|
|
850
|
+
for variant_data in self.source.iter_alleles_and_genotypes(
|
|
851
|
+
partition.start, partition.stop, shape, alleles.array.shape[1]
|
|
852
|
+
):
|
|
853
|
+
j_alleles = alleles.next_buffer_row()
|
|
854
|
+
alleles.buff[j_alleles] = variant_data.alleles
|
|
855
|
+
j_variant_length = variant_lengths.next_buffer_row()
|
|
856
|
+
variant_lengths.buff[j_variant_length] = variant_data.variant_length
|
|
857
|
+
if has_gt:
|
|
858
|
+
j = gt.next_buffer_row()
|
|
859
|
+
gt.buff[j] = variant_data.genotypes
|
|
860
|
+
j_phased = gt_phased.next_buffer_row()
|
|
861
|
+
gt_phased.buff[j_phased] = variant_data.phased
|
|
862
|
+
|
|
863
|
+
self.finalise_partition_array(partition_index, alleles)
|
|
864
|
+
self.finalise_partition_array(partition_index, variant_lengths)
|
|
865
|
+
if has_gt:
|
|
866
|
+
self.finalise_partition_array(partition_index, gt)
|
|
867
|
+
self.finalise_partition_array(partition_index, gt_phased)
|
|
905
868
|
|
|
906
869
|
def encode_genotype_mask_partition(self, partition_index):
|
|
907
870
|
partition = self.metadata.partitions[partition_index]
|
|
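Per-variant data now arrives as `VariantData` records from `Source.iter_alleles_and_genotypes`, so a single pass fills alleles, lengths, genotypes, and phasing together. A sketch of the generator contract, with hypothetical toy values:

```python
import numpy as np

from bio2zarr import vcz

def toy_iter_alleles_and_genotypes(start, stop, shape, num_alleles):
    # Hypothetical Source.iter_alleles_and_genotypes: yield one VariantData
    # per record in [start, stop). `shape` is the per-variant genotype buffer
    # shape (samples, ploidy), or None when there is no GT field.
    for _ in range(start, stop):
        gt = np.zeros(shape, dtype=np.int8) if shape is not None else None
        phased = np.zeros(shape[0], dtype=bool) if shape is not None else None
        alleles = np.full(num_alleles, "", dtype=object)
        alleles[:2] = ["A", "T"]
        yield vcz.VariantData(
            variant_length=1, alleles=alleles, genotypes=gt, phased=phased
        )
```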
```diff
@@ -948,10 +911,10 @@ class VcfZarrWriter:
         for descriptor in localisable_fields:
             if descriptor.array_name not in field_map:
                 continue
-            assert field_map[descriptor.array_name].
+            assert field_map[descriptor.array_name].source is None
 
             buff = self.init_partition_array(partition_index, descriptor.array_name)
-            source = self.
+            source = self.source.fields[descriptor.vcf_field].iter_values(
                 partition.start, partition.stop
             )
             for la in core.first_dim_slice_iter(
@@ -963,34 +926,17 @@ class VcfZarrWriter:
                 buff.buff[j] = descriptor.convert(value, la)
             self.finalise_partition_array(partition_index, buff)
 
-    def encode_alleles_partition(self, partition_index):
-        alleles = self.init_partition_array(partition_index, "variant_allele")
-        partition = self.metadata.partitions[partition_index]
-        ref_field = self.icf.fields["REF"]
-        alt_field = self.icf.fields["ALT"]
-
-        for ref, alt in zip(
-            ref_field.iter_values(partition.start, partition.stop),
-            alt_field.iter_values(partition.start, partition.stop),
-        ):
-            j = alleles.next_buffer_row()
-            alleles.buff[j, :] = constants.STR_FILL
-            alleles.buff[j, 0] = ref[0]
-            alleles.buff[j, 1 : 1 + len(alt)] = alt
-        self.finalise_partition_array(partition_index, alleles)
-
     def encode_id_partition(self, partition_index):
         vid = self.init_partition_array(partition_index, "variant_id")
         vid_mask = self.init_partition_array(partition_index, "variant_id_mask")
         partition = self.metadata.partitions[partition_index]
-        field = self.icf.fields["ID"]
 
-        for value in
+        for value in self.source.iter_id(partition.start, partition.stop):
             j = vid.next_buffer_row()
             k = vid_mask.next_buffer_row()
             assert j == k
             if value is not None:
-                vid.buff[j] = value
+                vid.buff[j] = value
                 vid_mask.buff[j] = False
             else:
                 vid.buff[j] = constants.STR_MISSING
@@ -1000,37 +946,22 @@ class VcfZarrWriter:
         self.finalise_partition_array(partition_index, vid_mask)
 
     def encode_filters_partition(self, partition_index):
-        lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
         var_filter = self.init_partition_array(partition_index, "variant_filter")
         partition = self.metadata.partitions[partition_index]
 
-
-        for value in field.iter_values(partition.start, partition.stop):
+        for filter_values in self.source.iter_filters(partition.start, partition.stop):
             j = var_filter.next_buffer_row()
-            var_filter.buff[j] =
-            for f in value:
-                try:
-                    var_filter.buff[j, lookup[f]] = True
-                except KeyError:
-                    raise ValueError(
-                        f"Filter '{f}' was not defined in the header."
-                    ) from None
+            var_filter.buff[j] = filter_values
 
         self.finalise_partition_array(partition_index, var_filter)
 
     def encode_contig_partition(self, partition_index):
-        lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
         contig = self.init_partition_array(partition_index, "variant_contig")
         partition = self.metadata.partitions[partition_index]
-        field = self.icf.fields["CHROM"]
 
-        for
+        for contig_index in self.source.iter_contig(partition.start, partition.stop):
             j = contig.next_buffer_row()
-
-            # and we always have an index, it seems that we the contig lookup
-            # will always succeed. However, if anyone ever does hit a KeyError
-            # here, please do open an issue with a reproducible example!
-            contig.buff[j] = lookup[value[0]]
+            contig.buff[j] = contig_index
 
         self.finalise_partition_array(partition_index, contig)
 
@@ -1050,19 +981,7 @@ class VcfZarrWriter:
         if not src.exists():
             # Needs test
             raise ValueError(f"Partition {partition} of {name} does not exist")
-
-        # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
-        chunk_files = [
-            path for path in src.iterdir() if not path.name.startswith(".")
-        ]
-        # TODO check for a count of then number of files. If we require a
-        # dimension_separator of "/" then we could make stronger assertions
-        # here, as we'd always have num_variant_chunks
-        logger.debug(
-            f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
-        )
-        for chunk_file in chunk_files:
-            os.rename(chunk_file, dest / chunk_file.name)
+        zarr_utils.move_chunks(src, self.arrays_path, partition, name)
         # Finally, once all the chunks have moved into the arrays dir,
         # we move it out of wip
         os.rename(self.arrays_path / name, self.path / name)
```
```diff
@@ -1109,60 +1028,8 @@ class VcfZarrWriter:
     def create_index(self):
         """Create an index to support efficient region queries."""
 
-
-
-        contig = root["variant_contig"]
-        pos = root["variant_position"]
-        length = root["variant_length"]
-
-        assert contig.cdata_shape == pos.cdata_shape
-
-        index = []
-
-        logger.info("Creating region index")
-        for v_chunk in range(pos.cdata_shape[0]):
-            c = contig.blocks[v_chunk]
-            p = pos.blocks[v_chunk]
-            e = p + length.blocks[v_chunk] - 1
-
-            # create a row for each contig in the chunk
-            d = np.diff(c, append=-1)
-            c_start_idx = 0
-            for c_end_idx in np.nonzero(d)[0]:
-                assert c[c_start_idx] == c[c_end_idx]
-                index.append(
-                    (
-                        v_chunk,  # chunk index
-                        c[c_start_idx],  # contig ID
-                        p[c_start_idx],  # start
-                        p[c_end_idx],  # end
-                        np.max(e[c_start_idx : c_end_idx + 1]),  # max end
-                        c_end_idx - c_start_idx + 1,  # num records
-                    )
-                )
-                c_start_idx = c_end_idx + 1
-
-        index = np.array(index, dtype=pos.dtype)
-        kwargs = {}
-        if not zarr_v3():
-            kwargs["dimension_separator"] = self.metadata.dimension_separator
-        array = root.array(
-            "region_index",
-            data=index,
-            shape=index.shape,
-            chunks=index.shape,
-            dtype=index.dtype,
-            compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
-            fill_value=None,
-            **kwargs,
-        )
-        array.attrs["_ARRAY_DIMENSIONS"] = [
-            "region_index_values",
-            "region_index_fields",
-        ]
-
-        logger.info("Consolidating Zarr metadata")
-        zarr.consolidate_metadata(self.path)
+        indexer = VcfZarrIndexer(self.path)
+        indexer.create_index()
 
     ######################
     # encode_all_partitions
```
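Index creation is factored out of the writer into the new `VcfZarrIndexer` (defined in the final hunk below), so it can be run, or re-run, against an existing store. A short usage sketch; the `"out.vcz"` path is illustrative:

```python
from bio2zarr import vcz

# Builds the region_index array and consolidates the Zarr metadata; raises
# if variant_contig/variant_position/variant_length are missing.
indexer = vcz.VcfZarrIndexer("out.vcz")
indexer.create_index()
```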
```diff
@@ -1174,11 +1041,13 @@ class VcfZarrWriter:
         """
         max_encoding_mem = 0
         for array_spec in self.schema.fields:
-            max_encoding_mem = max(
+            max_encoding_mem = max(
+                max_encoding_mem, array_spec.variant_chunk_nbytes(self.schema)
+            )
         gt_mem = 0
         if self.has_genotypes:
             gt_mem = sum(
-                field.variant_chunk_nbytes
+                field.variant_chunk_nbytes(self.schema)
                 for field in self.schema.fields
                 if field.name.startswith("call_genotype")
             )
@@ -1187,7 +1056,7 @@ class VcfZarrWriter:
     def encode_all_partitions(
         self, *, worker_processes=1, show_progress=False, max_memory=None
     ):
-        max_memory = parse_max_memory(max_memory)
+        max_memory = core.parse_max_memory(max_memory)
         self.load_metadata()
         num_partitions = self.num_partitions
         per_worker_memory = self.get_max_encoding_memory()
```
```diff
@@ -1229,147 +1098,107 @@ class VcfZarrWriter:
             pwm.submit(self.encode_partition, partition_index)
 
 
-
-
-
-
-
-
-
-):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        worker_processes=worker_processes,
-        show_progress=show_progress,
-        max_memory=max_memory,
-    )
-    vzw.finalise(show_progress)
-    vzw.create_index()
-
-
-def encode_init(
-    icf_path,
-    zarr_path,
-    target_num_partitions,
-    *,
-    schema_path=None,
-    variants_chunk_size=None,
-    samples_chunk_size=None,
-    local_alleles=None,
-    max_variant_chunks=None,
-    dimension_separator=None,
-    max_memory=None,
-    worker_processes=1,
-    show_progress=False,
-):
-    icf_store = icf.IntermediateColumnarFormat(icf_path)
-    if schema_path is None:
-        schema = VcfZarrSchema.generate(
-            icf_store,
-            variants_chunk_size=variants_chunk_size,
-            samples_chunk_size=samples_chunk_size,
-            local_alleles=local_alleles,
-        )
-    else:
-        logger.info(f"Reading schema from {schema_path}")
-        if variants_chunk_size is not None or samples_chunk_size is not None:
+class VcfZarr:
+    def __init__(self, path):
+        if not zarr_utils.zarr_exists(path):
+            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+        self.path = path
+        self.root = zarr.open(path, mode="r")
+
+    def summary_table(self):
+        data = []
+        arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
+        arrays.sort(key=lambda x: x[0])
+        for stored, array in reversed(arrays):
+            d = {
+                "name": array.name,
+                "dtype": str(array.dtype),
+                "stored": core.display_size(stored),
+                "size": core.display_size(array.nbytes),
+                "ratio": core.display_number(array.nbytes / stored),
+                "nchunks": str(array.nchunks),
+                "chunk_size": core.display_size(array.nbytes / array.nchunks),
+                "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
+                "shape": str(array.shape),
+                "chunk_shape": str(array.chunks),
+                "compressor": str(zarr_utils.get_compressor(array)),
+                "filters": str(array.filters),
+            }
+            data.append(d)
+        return data
+
+
+class VcfZarrIndexer:
+    """
+    Creates an index for efficient region queries in a VCF Zarr dataset.
+    """
+
+    def __init__(self, path):
+        self.path = pathlib.Path(path)
+
+    def create_index(self):
+        """Create an index to support efficient region queries."""
+        root = zarr.open_group(store=self.path, mode="r+")
+        if (
+            "variant_contig" not in root
+            or "variant_position" not in root
+            or "variant_length" not in root
+        ):
             raise ValueError(
-                "Cannot
-
-
-
-
-
-
-
-        schema=schema,
-        dimension_separator=dimension_separator,
-        max_variant_chunks=max_variant_chunks,
-    )
-
-
-def encode_partition(zarr_path, partition):
-    writer = VcfZarrWriter(zarr_path)
-    writer.encode_partition(partition)
-
-
-def encode_finalise(zarr_path, show_progress=False):
-    writer = VcfZarrWriter(zarr_path)
-    writer.finalise(show_progress=show_progress)
-
-
-def convert(
-    vcfs,
-    out_path,
-    *,
-    variants_chunk_size=None,
-    samples_chunk_size=None,
-    worker_processes=1,
-    local_alleles=None,
-    show_progress=False,
-    icf_path=None,
-):
-    if icf_path is None:
-        cm = temp_icf_path(prefix="vcf2zarr")
-    else:
-        cm = contextlib.nullcontext(icf_path)
-
-    with cm as icf_path:
-        icf.explode(
-            icf_path,
-            vcfs,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-        )
-        encode(
-            icf_path,
-            out_path,
-            variants_chunk_size=variants_chunk_size,
-            samples_chunk_size=samples_chunk_size,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-            local_alleles=local_alleles,
-        )
+                "Cannot create index: variant_contig, "
+                "variant_position and variant_length arrays are required"
+            )
+
+        contig = root["variant_contig"]
+        pos = root["variant_position"]
+        length = root["variant_length"]
+
+        assert contig.cdata_shape == pos.cdata_shape
 
+        index = []
+
+        logger.info("Creating region index")
+        for v_chunk in range(pos.cdata_shape[0]):
+            c = contig.blocks[v_chunk]
+            p = pos.blocks[v_chunk]
+            e = p + length.blocks[v_chunk] - 1
 
-
-
-
-
+            # create a row for each contig in the chunk
+            d = np.diff(c, append=-1)
+            c_start_idx = 0
+            for c_end_idx in np.nonzero(d)[0]:
+                assert c[c_start_idx] == c[c_end_idx]
+                index.append(
+                    (
+                        v_chunk,  # chunk index
+                        c[c_start_idx],  # contig ID
+                        p[c_start_idx],  # start
+                        p[c_end_idx],  # end
+                        np.max(e[c_start_idx : c_end_idx + 1]),  # max end
+                        c_end_idx - c_start_idx + 1,  # num records
+                    )
+                )
+                c_start_idx = c_end_idx + 1
+
+        index = np.array(index, dtype=pos.dtype)
+        kwargs = {}
+        if not zarr_utils.zarr_v3():
+            kwargs["dimension_separator"] = "/"
+        zarr_utils.create_group_array(
+            root,
+            "region_index",
+            data=index,
+            shape=index.shape,
+            chunks=index.shape,
+            dtype=index.dtype,
+            compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
+            fill_value=None,
+            dimension_names=[
+                "region_index_values",
+                "region_index_fields",
+            ],
+            **kwargs,
+        )
+
+        logger.info("Consolidating Zarr metadata")
+        zarr.consolidate_metadata(self.path)
```