bio2zarr 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

This version of bio2zarr has been flagged as potentially problematic.

@@ -1,41 +1,29 @@
- import contextlib
+ import abc
  import dataclasses
  import json
  import logging
  import os
- import os.path
  import pathlib
  import shutil
- import tempfile

- import humanfriendly
  import numcodecs
  import numpy as np
  import zarr

- from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3
-
- from .. import constants, core, provenance
- from . import icf
+ from bio2zarr import constants, core, provenance, zarr_utils

  logger = logging.getLogger(__name__)

-
- def inspect(path):
-     path = pathlib.Path(path)
-     if not path.exists():
-         raise ValueError(f"Path not found: {path}")
-     if (path / "metadata.json").exists():
-         obj = icf.IntermediateColumnarFormat(path)
-     # NOTE: this is too strict, we should support more general Zarrs, see #276
-     elif (path / ".zmetadata").exists():
-         obj = VcfZarr(path)
-     else:
-         raise ValueError(f"{path} not in ICF or VCF Zarr format")
-     return obj.summary_table()
-
-
+ ZARR_SCHEMA_FORMAT_VERSION = "0.6"
+ DEFAULT_VARIANT_CHUNK_SIZE = 1000
+ DEFAULT_SAMPLE_CHUNK_SIZE = 10_000
  DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+ DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs.Blosc(
+     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+ )
+ DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs.Blosc(
+     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+ )

  _fixed_field_descriptions = {
      "variant_contig": "An identifier from the reference genome or an angle-bracketed ID"
@@ -49,128 +37,251 @@ _fixed_field_descriptions = {
  }


+ @dataclasses.dataclass
+ class VariantData:
+     """Represents variant data returned by iter_alleles_and_genotypes."""
+
+     variant_length: int
+     alleles: np.ndarray
+     genotypes: np.ndarray
+     phased: np.ndarray
+
+
+ class Source(abc.ABC):
+     @property
+     @abc.abstractmethod
+     def path(self):
+         pass
+
+     @property
+     @abc.abstractmethod
+     def num_records(self):
+         pass
+
+     @property
+     @abc.abstractmethod
+     def num_samples(self):
+         pass
+
+     @property
+     @abc.abstractmethod
+     def samples(self):
+         pass
+
+     @property
+     def contigs(self):
+         return None
+
+     @property
+     def filters(self):
+         return None
+
+     @property
+     def root_attrs(self):
+         return {}
+
+     @abc.abstractmethod
+     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+         pass
+
+     def iter_id(self, start, stop):
+         return
+
+     def iter_contig(self, start, stop):
+         return
+
+     @abc.abstractmethod
+     def iter_field(self, field_name, shape, start, stop):
+         pass
+
+     @abc.abstractmethod
+     def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
+         pass
+
+
+ @dataclasses.dataclass
+ class VcfZarrDimension:
+     size: int
+     chunk_size: int
+
+     def asdict(self):
+         return dataclasses.asdict(self)
+
+     @classmethod
+     def fromdict(cls, d):
+         return cls(**d)
+
+     @classmethod
+     def unchunked(cls, size):
+         return cls(size, max(size, 1))
+
+
+ def standard_dimensions(
+     *,
+     variants_size,
+     samples_size,
+     variants_chunk_size=None,
+     samples_chunk_size=None,
+     alleles_size=None,
+     filters_size=None,
+     ploidy_size=None,
+     genotypes_size=None,
+ ):
+     """
+     Returns a dictionary mapping dimension names to definition for the standard
+     fields in a VCF.
+     """
+     if variants_chunk_size is None:
+         variants_chunk_size = max(1, min(variants_size, DEFAULT_VARIANT_CHUNK_SIZE))
+     if samples_chunk_size is None:
+         samples_chunk_size = max(1, min(samples_size, DEFAULT_SAMPLE_CHUNK_SIZE))
+
+     dimensions = {
+         "variants": VcfZarrDimension(variants_size, variants_chunk_size),
+         "samples": VcfZarrDimension(samples_size, samples_chunk_size),
+     }
+
+     if alleles_size is not None:
+         dimensions["alleles"] = VcfZarrDimension.unchunked(alleles_size)
+         if alleles_size > 1:
+             dimensions["alt_alleles"] = VcfZarrDimension.unchunked(alleles_size - 1)
+
+     if filters_size is not None:
+         dimensions["filters"] = VcfZarrDimension.unchunked(filters_size)
+
+     if ploidy_size is not None:
+         dimensions["ploidy"] = VcfZarrDimension.unchunked(ploidy_size)
+
+     if genotypes_size is not None:
+         dimensions["genotypes"] = VcfZarrDimension.unchunked(genotypes_size)
+
+     return dimensions
+
+
  @dataclasses.dataclass
  class ZarrArraySpec:
      name: str
      dtype: str
-     shape: tuple
-     chunks: tuple
      dimensions: tuple
      description: str
-     vcf_field: str
-     compressor: dict
-     filters: list
+     compressor: dict = None
+     filters: list = None
+     source: str = None

      def __post_init__(self):
          if self.name in _fixed_field_descriptions:
              self.description = self.description or _fixed_field_descriptions[self.name]

-         # Ensure these are tuples for ease of comparison and consistency
-         self.shape = tuple(self.shape)
-         self.chunks = tuple(self.chunks)
          self.dimensions = tuple(self.dimensions)
-         self.filters = tuple(self.filters)
+         self.filters = tuple(self.filters) if self.filters is not None else None

-     @staticmethod
-     def new(**kwargs):
-         spec = ZarrArraySpec(
-             **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
-         )
-         spec._choose_compressor_settings()
-         return spec
+     def get_shape(self, schema):
+         return schema.get_shape(self.dimensions)
+
+     def get_chunks(self, schema):
+         return schema.get_chunks(self.dimensions)
+
+     def get_chunk_nbytes(self, schema):
+         element_size = np.dtype(self.dtype).itemsize
+         chunks = self.get_chunks(schema)
+         shape = self.get_shape(schema)
+
+         # Calculate actual chunk size accounting for dimension limits
+         items = 1
+         for i, chunk_size in enumerate(chunks):
+             items *= min(chunk_size, shape[i])
+
+         # Include sizes for extra dimensions (if any)
+         if len(shape) > len(chunks):
+             for size in shape[len(chunks) :]:
+                 items *= size
+
+         return element_size * items

      @staticmethod
      def from_field(
          vcf_field,
+         schema,
          *,
-         num_variants,
-         num_samples,
-         variants_chunk_size,
-         samples_chunk_size,
          array_name=None,
+         compressor=None,
+         filters=None,
      ):
-         shape = [num_variants]
          prefix = "variant_"
          dimensions = ["variants"]
-         chunks = [variants_chunk_size]
          if vcf_field.category == "FORMAT":
              prefix = "call_"
-             shape.append(num_samples)
-             chunks.append(samples_chunk_size)
              dimensions.append("samples")
          if array_name is None:
              array_name = prefix + vcf_field.name
-         # TODO make an option to add in the empty extra dimension
-         if vcf_field.summary.max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
-             shape.append(vcf_field.summary.max_number)
-             chunks.append(vcf_field.summary.max_number)
-         # TODO we should really be checking this to see if the named dimensions
-         # are actually correct.
-         if vcf_field.vcf_number == "R":
+
+         max_number = vcf_field.max_number
+         if vcf_field.vcf_number == "R":
+             max_alleles = schema.dimensions["alleles"].size
+             if max_number > max_alleles:
+                 raise ValueError(
+                     f"Max number of values {max_number} exceeds max alleles "
+                     f"{max_alleles} for {vcf_field.full_name}"
+                 )
+             if max_alleles > 0:
                  dimensions.append("alleles")
-         elif vcf_field.vcf_number == "A":
+         elif vcf_field.vcf_number == "A":
+             max_alt_alleles = schema.dimensions["alt_alleles"].size
+             if max_number > max_alt_alleles:
+                 raise ValueError(
+                     f"Max number of values {max_number} exceeds max alt alleles "
+                     f"{max_alt_alleles} for {vcf_field.full_name}"
+                 )
+             if max_alt_alleles > 0:
                  dimensions.append("alt_alleles")
-         elif vcf_field.vcf_number == "G":
+         elif vcf_field.vcf_number == "G":
+             max_genotypes = schema.dimensions["genotypes"].size
+             if max_number > max_genotypes:
+                 raise ValueError(
+                     f"Max number of values {max_number} exceeds max genotypes "
+                     f"{max_genotypes} for {vcf_field.full_name}"
+                 )
+             if max_genotypes > 0:
                  dimensions.append("genotypes")
-         else:
-             dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
-         return ZarrArraySpec.new(
-             vcf_field=vcf_field.full_name,
+         elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
+             dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
+             if dimensions[-1] not in schema.dimensions:
+                 schema.dimensions[dimensions[-1]] = VcfZarrDimension.unchunked(
+                     vcf_field.max_number
+                 )
+
+         return ZarrArraySpec(
+             source=vcf_field.full_name,
              name=array_name,
              dtype=vcf_field.smallest_dtype(),
-             shape=shape,
-             chunks=chunks,
              dimensions=dimensions,
              description=vcf_field.description,
+             compressor=compressor,
+             filters=filters,
          )

-     def _choose_compressor_settings(self):
-         """
-         Choose compressor and filter settings based on the size and
-         type of the array, plus some hueristics from observed properties
-         of VCFs.
-
-         See https://github.com/pystatgen/bio2zarr/discussions/74
-         """
-         # Default is to not shuffle, because autoshuffle isn't recognised
-         # by many Zarr implementations, and shuffling can lead to worse
-         # performance in some cases anyway. Turning on shuffle should be a
-         # deliberate choice.
-         shuffle = numcodecs.Blosc.NOSHUFFLE
-         if self.name == "call_genotype" and self.dtype == "i1":
-             # call_genotype gets BITSHUFFLE by default as it gets
-             # significantly better compression (at a cost of slower
-             # decoding)
-             shuffle = numcodecs.Blosc.BITSHUFFLE
-         elif self.dtype == "bool":
-             shuffle = numcodecs.Blosc.BITSHUFFLE
-
-         self.compressor["shuffle"] = shuffle
-
-     @property
-     def chunk_nbytes(self):
+     def chunk_nbytes(self, schema):
          """
          Returns the nbytes for a single chunk in this array.
          """
          items = 1
          dim = 0
-         for chunk_size in self.chunks:
-             size = min(chunk_size, self.shape[dim])
+         for chunk_size in self.get_chunks(schema):
+             size = min(chunk_size, self.get_shape(schema)[dim])
              items *= size
              dim += 1
          # Include sizes for extra dimensions.
-         for size in self.shape[dim:]:
+         for size in self.get_shape(schema)[dim:]:
              items *= size
          dt = np.dtype(self.dtype)
          return items * dt.itemsize

-     @property
-     def variant_chunk_nbytes(self):
+     def variant_chunk_nbytes(self, schema):
          """
          Returns the nbytes for a single variant chunk of this array.
          """
-         chunk_items = self.chunks[0]
-         for size in self.shape[1:]:
+         chunk_items = self.get_chunks(schema)[0]
+         for size in self.get_shape(schema)[1:]:
              chunk_items *= size
          dt = np.dtype(self.dtype)
          if dt.kind == "O" and "samples" in self.dimensions:
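The `Source` ABC added in this hunk is the new seam between the writer and its inputs: any object that can report samples and iterate alleles, genotypes, and per-field values can drive `VcfZarrWriter`. A minimal in-memory subclass might look like the sketch below (hypothetical, built only from the interface shown above; `VariantData` is from this hunk and `Sample` is defined later in the diff):

class InMemorySource(Source):
    """Hypothetical Source over pre-loaded arrays, for illustration only."""

    def __init__(self, path, alleles, genotypes, phased, sample_ids):
        self._path = path
        self._alleles = alleles      # (num_records, max_alleles) object array
        self._genotypes = genotypes  # (num_records, num_samples, ploidy) int array
        self._phased = phased        # (num_records, num_samples) bool array
        self._samples = [Sample(id=s) for s in sample_ids]

    @property
    def path(self):
        return self._path

    @property
    def num_records(self):
        return self._genotypes.shape[0]

    @property
    def num_samples(self):
        return self._genotypes.shape[1]

    @property
    def samples(self):
        return self._samples

    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
        # shape is the per-variant genotype buffer shape; unused here because
        # the arrays are already materialised at the right dimensions.
        for j in range(start, stop):
            yield VariantData(
                variant_length=len(self._alleles[j][0]),  # REF span (assumption)
                alleles=self._alleles[j][:num_alleles],
                genotypes=self._genotypes[j],
                phased=self._phased[j],
            )

    def iter_field(self, field_name, shape, start, stop):
        return iter(())  # no INFO/FORMAT fields in this sketch

    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
        raise NotImplementedError("schema construction is sketched separately")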
@@ -181,87 +292,71 @@ class ZarrArraySpec:
              return chunk_items * dt.itemsize


- ZARR_SCHEMA_FORMAT_VERSION = "0.4"
+ @dataclasses.dataclass
+ class Contig:
+     id: str
+     length: int = None


- def convert_local_allele_field_types(fields):
-     """
-     Update the specified list of fields to include the LAA field, and to convert
-     any supported localisable fields to the L* counterpart.
-
-     Note that we currently support only two ALT alleles per sample, and so the
-     dimensions of these fields are fixed by that requirement. Later versions may
-     use summry data storted in the ICF to make different choices, if information
-     about subsequent alleles (not in the actual genotype calls) should also be
-     stored.
-     """
-     fields_by_name = {field.name: field for field in fields}
-     gt = fields_by_name["call_genotype"]
-     if gt.shape[-1] != 2:
-         raise ValueError("Local alleles only supported on diploid data")
-
-     # TODO check if LA is already in here
-
-     shape = gt.shape[:-1]
-     chunks = gt.chunks[:-1]
-     dimensions = gt.dimensions[:-1]
-
-     la = ZarrArraySpec.new(
-         vcf_field=None,
-         name="call_LA",
-         dtype="i1",
-         shape=gt.shape,
-         chunks=gt.chunks,
-         dimensions=(*dimensions, "local_alleles"),
-         description=(
-             "0-based indices into REF+ALT, indicating which alleles"
-             " are relevant (local) for the current sample"
-         ),
-     )
-     ad = fields_by_name.get("call_AD", None)
-     if ad is not None:
-         # TODO check if call_LAD is in the list already
-         ad.name = "call_LAD"
-         ad.vcf_field = None
-         ad.shape = (*shape, 2)
-         ad.chunks = (*chunks, 2)
-         ad.dimensions = (*dimensions, "local_alleles")
-         ad.description += " (local-alleles)"
-
-     pl = fields_by_name.get("call_PL", None)
-     if pl is not None:
-         # TODO check if call_LPL is in the list already
-         pl.name = "call_LPL"
-         pl.vcf_field = None
-         pl.shape = (*shape, 3)
-         pl.chunks = (*chunks, 3)
-         pl.description += " (local-alleles)"
-         pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
-     return [*fields, la]
+ @dataclasses.dataclass
+ class Sample:
+     id: str
+
+
+ @dataclasses.dataclass
+ class Filter:
+     id: str
+     description: str = ""


  @dataclasses.dataclass
  class VcfZarrSchema(core.JsonDataclass):
      format_version: str
-     samples_chunk_size: int
-     variants_chunk_size: int
-     samples: list
-     contigs: list
-     filters: list
+     dimensions: dict
      fields: list
+     defaults: dict
+
+     def __init__(
+         self,
+         format_version: str,
+         fields: list,
+         dimensions: dict,
+         defaults: dict = None,
+     ):
+         self.format_version = format_version
+         self.fields = fields
+         defaults = defaults.copy() if defaults is not None else {}
+         if defaults.get("compressor", None) is None:
+             defaults["compressor"] = DEFAULT_ZARR_COMPRESSOR.get_config()
+         if defaults.get("filters", None) is None:
+             defaults["filters"] = []
+         self.defaults = defaults
+         self.dimensions = dimensions
+
+     def get_shape(self, dimensions):
+         return [self.dimensions[dim].size for dim in dimensions]
+
+     def get_chunks(self, dimensions):
+         return [self.dimensions[dim].chunk_size for dim in dimensions]

      def validate(self):
          """
          Checks that the schema is well-formed and within required limits.
          """
          for field in self.fields:
+             for dim in field.dimensions:
+                 if dim not in self.dimensions:
+                     raise ValueError(
+                         f"Dimension '{dim}' used in field '{field.name}' is "
+                         "not defined in the schema"
+                     )
+
+             chunk_nbytes = field.get_chunk_nbytes(self)
              # This is the Blosc max buffer size
-             if field.chunk_nbytes > 2147483647:
-                 # TODO add some links to documentation here advising how to
-                 # deal with PL values.
+             if chunk_nbytes > 2147483647:
                  raise ValueError(
                      f"Field {field.name} chunks are too large "
-                     f"({field.chunk_nbytes} > 2**31 - 1 bytes). "
+                     f"({chunk_nbytes} > 2**31 - 1 bytes). "
                      "Either generate a schema and drop this field (if you don't "
                      "need it) or reduce the variant or sample chunk sizes."
                  )
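Taken together with `standard_dimensions` from the previous hunk, the schema now resolves shapes and chunks by dimension name. A small usage sketch (assumed API, invented sizes) for 100 variants and 3 samples:

dims = standard_dimensions(
    variants_size=100,
    samples_size=3,
    alleles_size=4,
    ploidy_size=2,
)
schema = VcfZarrSchema(
    format_version=ZARR_SCHEMA_FORMAT_VERSION, fields=[], dimensions=dims
)
spec = ZarrArraySpec(
    name="call_genotype",
    dtype="i1",
    dimensions=("variants", "samples", "ploidy"),
    description="",
)
schema.fields.append(spec)
print(spec.get_shape(schema))         # [100, 3, 2]
print(spec.get_chunks(schema))        # [100, 3, 2]
print(spec.get_chunk_nbytes(schema))  # 600 (int8 items in one full chunk)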
@@ -278,253 +373,30 @@ class VcfZarrSchema(core.JsonDataclass):
                  "Zarr schema format version mismatch: "
                  f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
              )
+
          ret = VcfZarrSchema(**d)
-         ret.samples = [icf.Sample(**sd) for sd in d["samples"]]
-         ret.contigs = [icf.Contig(**sd) for sd in d["contigs"]]
-         ret.filters = [icf.Filter(**sd) for sd in d["filters"]]
          ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
+         ret.dimensions = {
+             k: VcfZarrDimension.fromdict(v) for k, v in d["dimensions"].items()
+         }
+
          return ret

      @staticmethod
      def fromjson(s):
          return VcfZarrSchema.fromdict(json.loads(s))

-     @staticmethod
-     def generate(
-         icf, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
-     ):
-         m = icf.num_records
-         n = icf.num_samples
-         if samples_chunk_size is None:
-             samples_chunk_size = 10_000
-         if variants_chunk_size is None:
-             variants_chunk_size = 1000
-         if local_alleles is None:
-             local_alleles = False
-         logger.info(
-             f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
-         )
-
-         def spec_from_field(field, array_name=None):
-             return ZarrArraySpec.from_field(
-                 field,
-                 num_samples=n,
-                 num_variants=m,
-                 samples_chunk_size=samples_chunk_size,
-                 variants_chunk_size=variants_chunk_size,
-                 array_name=array_name,
-             )
-
-         def fixed_field_spec(
-             name,
-             dtype,
-             vcf_field=None,
-             shape=(m,),
-             dimensions=("variants",),
-             chunks=None,
-         ):
-             return ZarrArraySpec.new(
-                 vcf_field=vcf_field,
-                 name=name,
-                 dtype=dtype,
-                 shape=shape,
-                 description="",
-                 dimensions=dimensions,
-                 chunks=chunks or [variants_chunk_size],
-             )
-
-         alt_field = icf.fields["ALT"]
-         max_alleles = alt_field.vcf_field.summary.max_number + 1
-
-         array_specs = [
-             fixed_field_spec(
-                 name="variant_contig",
-                 dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
-             ),
-             fixed_field_spec(
-                 name="variant_filter",
-                 dtype="bool",
-                 shape=(m, icf.metadata.num_filters),
-                 dimensions=["variants", "filters"],
-                 chunks=(variants_chunk_size, icf.metadata.num_filters),
-             ),
-             fixed_field_spec(
-                 name="variant_allele",
-                 dtype="O",
-                 shape=(m, max_alleles),
-                 dimensions=["variants", "alleles"],
-                 chunks=(variants_chunk_size, max_alleles),
-             ),
-             fixed_field_spec(
-                 name="variant_id",
-                 dtype="O",
-             ),
-             fixed_field_spec(
-                 name="variant_id_mask",
-                 dtype="bool",
-             ),
-         ]
-         name_map = {field.full_name: field for field in icf.metadata.fields}
-
-         # Only three of the fixed fields have a direct one-to-one mapping.
-         array_specs.extend(
-             [
-                 spec_from_field(name_map["QUAL"], array_name="variant_quality"),
-                 spec_from_field(name_map["POS"], array_name="variant_position"),
-                 spec_from_field(name_map["rlen"], array_name="variant_length"),
-             ]
-         )
-         array_specs.extend(
-             [spec_from_field(field) for field in icf.metadata.info_fields]
-         )
-
-         gt_field = None
-         for field in icf.metadata.format_fields:
-             if field.name == "GT":
-                 gt_field = field
-                 continue
-             array_specs.append(spec_from_field(field))
-
-         if gt_field is not None and n > 0:
-             ploidy = max(gt_field.summary.max_number - 1, 1)
-             shape = [m, n]
-             chunks = [variants_chunk_size, samples_chunk_size]
-             dimensions = ["variants", "samples"]
-             array_specs.append(
-                 ZarrArraySpec.new(
-                     vcf_field=None,
-                     name="call_genotype_phased",
-                     dtype="bool",
-                     shape=list(shape),
-                     chunks=list(chunks),
-                     dimensions=list(dimensions),
-                     description="",
-                 )
-             )
-             shape += [ploidy]
-             chunks += [ploidy]
-             dimensions += ["ploidy"]
-             array_specs.append(
-                 ZarrArraySpec.new(
-                     vcf_field=None,
-                     name="call_genotype",
-                     dtype=gt_field.smallest_dtype(),
-                     shape=list(shape),
-                     chunks=list(chunks),
-                     dimensions=list(dimensions),
-                     description="",
-                 )
-             )
-             array_specs.append(
-                 ZarrArraySpec.new(
-                     vcf_field=None,
-                     name="call_genotype_mask",
-                     dtype="bool",
-                     shape=list(shape),
-                     chunks=list(chunks),
-                     dimensions=list(dimensions),
-                     description="",
-                 )
-             )
-
-         if local_alleles:
-             array_specs = convert_local_allele_field_types(array_specs)
-
-         return VcfZarrSchema(
-             format_version=ZARR_SCHEMA_FORMAT_VERSION,
-             samples_chunk_size=samples_chunk_size,
-             variants_chunk_size=variants_chunk_size,
-             fields=array_specs,
-             samples=icf.metadata.samples,
-             contigs=icf.metadata.contigs,
-             filters=icf.metadata.filters,
-         )
-
-
- class VcfZarr:
-     def __init__(self, path):
-         if not (path / ".zmetadata").exists():
-             raise ValueError("Not in VcfZarr format")  # NEEDS TEST
-         self.path = path
-         self.root = zarr.open(path, mode="r")
-
-     def summary_table(self):
-         data = []
-         arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
-         arrays.sort(key=lambda x: x[0])
-         for stored, array in reversed(arrays):
-             d = {
-                 "name": array.name,
-                 "dtype": str(array.dtype),
-                 "stored": core.display_size(stored),
-                 "size": core.display_size(array.nbytes),
-                 "ratio": core.display_number(array.nbytes / stored),
-                 "nchunks": str(array.nchunks),
-                 "chunk_size": core.display_size(array.nbytes / array.nchunks),
-                 "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
-                 "shape": str(array.shape),
-                 "chunk_shape": str(array.chunks),
-                 "compressor": str(array.compressor),
-                 "filters": str(array.filters),
-             }
-             data.append(d)
-         return data
-
-
- def parse_max_memory(max_memory):
-     if max_memory is None:
-         # Effectively unbounded
-         return 2**63
-     if isinstance(max_memory, str):
-         max_memory = humanfriendly.parse_size(max_memory)
-     logger.info(f"Set memory budget to {core.display_size(max_memory)}")
-     return max_memory
-
-
- @dataclasses.dataclass
- class VcfZarrPartition:
-     start: int
-     stop: int
-
-     @staticmethod
-     def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
-         num_chunks = int(np.ceil(num_records / chunk_size))
-         if max_chunks is not None:
-             num_chunks = min(num_chunks, max_chunks)
-         partitions = []
-         splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
-         for chunk_slice in splits:
-             start_chunk = int(chunk_slice[0])
-             stop_chunk = int(chunk_slice[-1]) + 1
-             start_index = start_chunk * chunk_size
-             stop_index = min(stop_chunk * chunk_size, num_records)
-             partitions.append(VcfZarrPartition(start_index, stop_index))
-         return partitions

-
- VZW_METADATA_FORMAT_VERSION = "0.1"
-
-
- @dataclasses.dataclass
- class VcfZarrWriterMetadata(core.JsonDataclass):
-     format_version: str
-     icf_path: str
-     schema: VcfZarrSchema
-     dimension_separator: str
-     partitions: list
-     provenance: dict
-
-     @staticmethod
-     def fromdict(d):
-         if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
-             raise ValueError(
-                 "VcfZarrWriter format version mismatch: "
-                 f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
-             )
-         ret = VcfZarrWriterMetadata(**d)
-         ret.schema = VcfZarrSchema.fromdict(ret.schema)
-         ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
-         return ret
+ def sanitise_int_array(value, ndmin, dtype):
+     if isinstance(value, tuple):
+         value = [
+             constants.VCF_INT_MISSING if x is None else x for x in value
+         ]  # NEEDS TEST
+     value = np.array(value, ndmin=ndmin, copy=True)
+     value[value == constants.VCF_INT_MISSING] = -1
+     value[value == constants.VCF_INT_FILL] = -2
+     # TODO watch out for clipping here!
+     return value.astype(dtype)


  def compute_la_field(genotypes):
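`sanitise_int_array` moves here from the ICF module so the localisable-field converters below no longer import it. A worked example of the sentinel mapping (sentinel constants as defined in `bio2zarr.constants`): missing becomes -1, fill padding becomes -2, then the dtype cast applies:

raw = np.array([constants.VCF_INT_MISSING, 7, constants.VCF_INT_FILL])
print(sanitise_int_array(raw, ndmin=1, dtype="i2"))
# [-1  7 -2]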
@@ -597,14 +469,60 @@ class LocalisableFieldDescriptor:

  localisable_fields = [
      LocalisableFieldDescriptor(
-         "call_LAD", "FORMAT/AD", icf.sanitise_int_array, compute_lad_field
+         "call_LAD", "FORMAT/AD", sanitise_int_array, compute_lad_field
      ),
      LocalisableFieldDescriptor(
-         "call_LPL", "FORMAT/PL", icf.sanitise_int_array, compute_lpl_field
+         "call_LPL", "FORMAT/PL", sanitise_int_array, compute_lpl_field
      ),
  ]


+ @dataclasses.dataclass
+ class VcfZarrPartition:
+     start: int
+     stop: int
+
+     @staticmethod
+     def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
+         num_chunks = int(np.ceil(num_records / chunk_size))
+         if max_chunks is not None:
+             num_chunks = min(num_chunks, max_chunks)
+         partitions = []
+         splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
+         for chunk_slice in splits:
+             start_chunk = int(chunk_slice[0])
+             stop_chunk = int(chunk_slice[-1]) + 1
+             start_index = start_chunk * chunk_size
+             stop_index = min(stop_chunk * chunk_size, num_records)
+             partitions.append(VcfZarrPartition(start_index, stop_index))
+         return partitions
+
+
+ VZW_METADATA_FORMAT_VERSION = "0.1"
+
+
+ @dataclasses.dataclass
+ class VcfZarrWriterMetadata(core.JsonDataclass):
+     format_version: str
+     source_path: str
+     schema: VcfZarrSchema
+     dimension_separator: str
+     partitions: list
+     provenance: dict
+
+     @staticmethod
+     def fromdict(d):
+         if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
+             raise ValueError(
+                 "VcfZarrWriter format version mismatch: "
+                 f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
+             )
+         ret = VcfZarrWriterMetadata(**d)
+         ret.schema = VcfZarrSchema.fromdict(ret.schema)
+         ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
+         return ret
+
+
  @dataclasses.dataclass
  class VcfZarrWriteSummary(core.JsonDataclass):
      num_partitions: int
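`VcfZarrPartition` is unchanged apart from its position in the module; partitions still align to whole variant chunks. A worked example: 25 records at chunk size 10 give 3 chunks, and 2 requested partitions split the chunks [0, 1] and [2]:

parts = VcfZarrPartition.generate_partitions(
    num_records=25, chunk_size=10, num_partitions=2
)
print([(p.start, p.stop) for p in parts])
# [(0, 20), (20, 25)]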
@@ -615,13 +533,14 @@ class VcfZarrWriteSummary(core.JsonDataclass):


  class VcfZarrWriter:
-     def __init__(self, path):
+     def __init__(self, source_type, path):
+         self.source_type = source_type
          self.path = pathlib.Path(path)
          self.wip_path = self.path / "wip"
          self.arrays_path = self.wip_path / "arrays"
          self.partitions_path = self.wip_path / "partitions"
          self.metadata = None
-         self.icf = None
+         self.source = None

      @property
      def schema(self):
@@ -639,7 +558,7 @@ class VcfZarrWriter:

      def has_local_alleles(self):
          for field in self.schema.fields:
-             if field.name == "call_LA" and field.vcf_field is None:
+             if field.name == "call_LA" and field.source is None:
                  return True
          return False

@@ -649,20 +568,20 @@ class VcfZarrWriter:

      def init(
          self,
-         icf,
+         source,
          *,
          target_num_partitions,
          schema,
          dimension_separator=None,
          max_variant_chunks=None,
      ):
-         self.icf = icf
+         self.source = source
          if self.path.exists():
              raise ValueError("Zarr path already exists")  # NEEDS TEST
          schema.validate()
          partitions = VcfZarrPartition.generate_partitions(
-             self.icf.num_records,
-             schema.variants_chunk_size,
+             self.source.num_records,
+             schema.get_chunks(["variants"])[0],
              target_num_partitions,
              max_chunks=max_variant_chunks,
          )
@@ -673,7 +592,7 @@ class VcfZarrWriter:
          )
          self.metadata = VcfZarrWriterMetadata(
              format_version=VZW_METADATA_FORMAT_VERSION,
-             icf_path=str(self.icf.path),
+             source_path=str(self.source.path),
              schema=schema,
              dimension_separator=dimension_separator,
              partitions=partitions,
@@ -682,27 +601,32 @@ class VcfZarrWriter:
          )

          self.path.mkdir()
-         root = zarr.open(store=self.path, mode="a", **ZARR_FORMAT_KWARGS)
+         root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS)
          root.attrs.update(
              {
-                 "vcf_zarr_version": "0.2",
-                 "vcf_header": self.icf.vcf_header,
+                 "vcf_zarr_version": "0.4",
                  "source": f"bio2zarr-{provenance.__version__}",
              }
          )
-         # Doing this syncronously - this is fine surely
+         root.attrs.update(self.source.root_attrs)
+
+         # Doing this synchronously - this is fine surely
          self.encode_samples(root)
-         self.encode_filter_id(root)
-         self.encode_contig_id(root)
+         if self.source.filters is not None:
+             self.encode_filters(root)
+         if self.source.contigs is not None:
+             self.encode_contigs(root)

          self.wip_path.mkdir()
          self.arrays_path.mkdir()
          self.partitions_path.mkdir()
-         root = zarr.open(store=self.arrays_path, mode="a", **ZARR_FORMAT_KWARGS)
+         root = zarr.open(
+             store=self.arrays_path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS
+         )

          total_chunks = 0
          for field in self.schema.fields:
-             a = self.init_array(root, field, partitions[-1].stop)
+             a = self.init_array(root, self.metadata.schema, field, partitions[-1].stop)
              total_chunks += a.nchunks

          logger.info("Writing WIP metadata")
@@ -710,79 +634,97 @@ class VcfZarrWriter:
              json.dump(self.metadata.asdict(), f, indent=4)

          return VcfZarrWriteSummary(
-             num_variants=self.icf.num_records,
-             num_samples=self.icf.num_samples,
+             num_variants=self.source.num_records,
+             num_samples=self.source.num_samples,
              num_partitions=self.num_partitions,
              num_chunks=total_chunks,
              max_encoding_memory=core.display_size(self.get_max_encoding_memory()),
          )

      def encode_samples(self, root):
-         if self.schema.samples != self.icf.metadata.samples:
-             raise ValueError("Subsetting or reordering samples not supported currently")
+         samples = self.source.samples
          array = root.array(
              "sample_id",
-             data=[sample.id for sample in self.schema.samples],
-             shape=len(self.schema.samples),
+             data=[sample.id for sample in samples],
+             shape=len(samples),
              dtype="str",
              compressor=DEFAULT_ZARR_COMPRESSOR,
-             chunks=(self.schema.samples_chunk_size,),
+             chunks=(self.schema.get_chunks(["samples"])[0],),
          )
          array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
          logger.debug("Samples done")

-     def encode_contig_id(self, root):
+     def encode_contigs(self, root):
+         contigs = self.source.contigs
          array = root.array(
              "contig_id",
-             data=[contig.id for contig in self.schema.contigs],
-             shape=len(self.schema.contigs),
+             data=[contig.id for contig in contigs],
+             shape=len(contigs),
              dtype="str",
              compressor=DEFAULT_ZARR_COMPRESSOR,
          )
          array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-         if all(contig.length is not None for contig in self.schema.contigs):
+         if all(contig.length is not None for contig in contigs):
              array = root.array(
                  "contig_length",
-                 data=[contig.length for contig in self.schema.contigs],
-                 shape=len(self.schema.contigs),
+                 data=[contig.length for contig in contigs],
+                 shape=len(contigs),
                  dtype=np.int64,
                  compressor=DEFAULT_ZARR_COMPRESSOR,
              )
              array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

-     def encode_filter_id(self, root):
-         # TODO need a way to store description also
-         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
+     def encode_filters(self, root):
+         filters = self.source.filters
          array = root.array(
              "filter_id",
-             data=[filt.id for filt in self.schema.filters],
-             shape=len(self.schema.filters),
+             data=[filt.id for filt in filters],
+             shape=len(filters),
+             dtype="str",
+             compressor=DEFAULT_ZARR_COMPRESSOR,
+         )
+         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
+         array = root.array(
+             "filter_description",
+             data=[filt.description for filt in filters],
+             shape=len(filters),
              dtype="str",
              compressor=DEFAULT_ZARR_COMPRESSOR,
          )
          array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]

-     def init_array(self, root, array_spec, variants_dim_size):
-         kwargs = dict(ZARR_FORMAT_KWARGS)
-         filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
+     def init_array(self, root, schema, array_spec, variants_dim_size):
+         kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
+         filters = (
+             array_spec.filters
+             if array_spec.filters is not None
+             else schema.defaults["filters"]
+         )
+         filters = [numcodecs.get_codec(filt) for filt in filters]
+         compressor = (
+             array_spec.compressor
+             if array_spec.compressor is not None
+             else schema.defaults["compressor"]
+         )
+         compressor = numcodecs.get_codec(compressor)
          if array_spec.dtype == "O":
-             if zarr_v3():
+             if zarr_utils.zarr_v3():
                  filters = [*list(filters), numcodecs.VLenUTF8()]
              else:
                  kwargs["object_codec"] = numcodecs.VLenUTF8()

-         if not zarr_v3():
+         if not zarr_utils.zarr_v3():
              kwargs["dimension_separator"] = self.metadata.dimension_separator

-         shape = list(array_spec.shape)
-         # Truncate the variants dimension is max_variant_chunks was specified
+         shape = schema.get_shape(array_spec.dimensions)
+         # Truncate the variants dimension if max_variant_chunks was specified
          shape[0] = variants_dim_size
          a = root.empty(
              name=array_spec.name,
              shape=shape,
-             chunks=array_spec.chunks,
+             chunks=schema.get_chunks(array_spec.dimensions),
              dtype=array_spec.dtype,
-             compressor=numcodecs.get_codec(array_spec.compressor),
+             compressor=compressor,
              filters=filters,
              **kwargs,
          )
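The `init_array` change above lets a field override the schema-wide codec while everything else inherits the defaults. A condensed sketch of just that resolution (invented values, mirroring the conditionals above):

defaults = {"compressor": DEFAULT_ZARR_COMPRESSOR.get_config(), "filters": []}
spec = ZarrArraySpec(
    name="variant_position", dtype="i4", dimensions=("variants",), description=""
)
# Field-level settings win; None falls back to the schema defaults.
compressor = spec.compressor if spec.compressor is not None else defaults["compressor"]
filters = spec.filters if spec.filters is not None else defaults["filters"]
print(compressor["cname"], compressor["clevel"])  # zstd 7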
@@ -804,7 +746,7 @@ class VcfZarrWriter:
          if self.metadata is None:
              with open(self.wip_path / "metadata.json") as f:
                  self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
-             self.icf = icf.IntermediateColumnarFormat(self.metadata.icf_path)
+             self.source = self.source_type(self.metadata.source_path)

      def partition_path(self, partition_index):
          return self.partitions_path / f"p{partition_index}"
@@ -826,15 +768,18 @@ class VcfZarrWriter:
          partition_path.mkdir(exist_ok=True)
          logger.info(f"Encoding partition {partition_index} to {partition_path}")

-         self.encode_id_partition(partition_index)
-         self.encode_filters_partition(partition_index)
-         self.encode_contig_partition(partition_index)
-         self.encode_alleles_partition(partition_index)
+         all_field_names = [field.name for field in self.schema.fields]
+         if "variant_id" in all_field_names:
+             self.encode_id_partition(partition_index)
+         if "variant_filter" in all_field_names:
+             self.encode_filters_partition(partition_index)
+         if "variant_contig" in all_field_names:
+             self.encode_contig_partition(partition_index)
+         self.encode_alleles_and_genotypes_partition(partition_index)
          for array_spec in self.schema.fields:
-             if array_spec.vcf_field is not None:
+             if array_spec.source is not None:
                  self.encode_array_partition(array_spec, partition_index)
          if self.has_genotypes():
-             self.encode_genotypes_partition(partition_index)
              self.encode_genotype_mask_partition(partition_index)
          if self.has_local_alleles():
              self.encode_local_alleles_partition(partition_index)
@@ -874,34 +819,48 @@ class VcfZarrWriter:
      def encode_array_partition(self, array_spec, partition_index):
          partition = self.metadata.partitions[partition_index]
          ba = self.init_partition_array(partition_index, array_spec.name)
-         source_field = self.icf.fields[array_spec.vcf_field]
-         sanitiser = source_field.sanitiser_factory(ba.buff.shape)
-
-         for value in source_field.iter_values(partition.start, partition.stop):
-             # We write directly into the buffer in the sanitiser function
-             # to make it easier to reason about dimension padding
+         for value in self.source.iter_field(
+             array_spec.source,
+             ba.buff.shape[1:],
+             partition.start,
+             partition.stop,
+         ):
              j = ba.next_buffer_row()
-             sanitiser(ba.buff, j, value)
+             ba.buff[j] = value
+
          self.finalise_partition_array(partition_index, ba)

-     def encode_genotypes_partition(self, partition_index):
+     def encode_alleles_and_genotypes_partition(self, partition_index):
          partition = self.metadata.partitions[partition_index]
-         gt = self.init_partition_array(partition_index, "call_genotype")
-         gt_phased = self.init_partition_array(partition_index, "call_genotype_phased")
-
-         source_field = self.icf.fields["FORMAT/GT"]
-         for value in source_field.iter_values(partition.start, partition.stop):
-             j = gt.next_buffer_row()
-             icf.sanitise_value_int_2d(
-                 gt.buff, j, value[:, :-1] if value is not None else None
-             )
-             j = gt_phased.next_buffer_row()
-             icf.sanitise_value_int_1d(
-                 gt_phased.buff, j, value[:, -1] if value is not None else None
+         alleles = self.init_partition_array(partition_index, "variant_allele")
+         variant_lengths = self.init_partition_array(partition_index, "variant_length")
+         has_gt = self.has_genotypes()
+         shape = None
+         if has_gt:
+             gt = self.init_partition_array(partition_index, "call_genotype")
+             gt_phased = self.init_partition_array(
+                 partition_index, "call_genotype_phased"
              )
+             shape = gt.buff.shape[1:]
+
+         for variant_data in self.source.iter_alleles_and_genotypes(
+             partition.start, partition.stop, shape, alleles.array.shape[1]
+         ):
+             j_alleles = alleles.next_buffer_row()
+             alleles.buff[j_alleles] = variant_data.alleles
+             j_variant_length = variant_lengths.next_buffer_row()
+             variant_lengths.buff[j_variant_length] = variant_data.variant_length
+             if has_gt:
+                 j = gt.next_buffer_row()
+                 gt.buff[j] = variant_data.genotypes
+                 j_phased = gt_phased.next_buffer_row()
+                 gt_phased.buff[j_phased] = variant_data.phased

-         self.finalise_partition_array(partition_index, gt)
-         self.finalise_partition_array(partition_index, gt_phased)
+         self.finalise_partition_array(partition_index, alleles)
+         self.finalise_partition_array(partition_index, variant_lengths)
+         if has_gt:
+             self.finalise_partition_array(partition_index, gt)
+             self.finalise_partition_array(partition_index, gt_phased)

      def encode_genotype_mask_partition(self, partition_index):
          partition = self.metadata.partitions[partition_index]
@@ -948,10 +907,10 @@ class VcfZarrWriter:
          for descriptor in localisable_fields:
              if descriptor.array_name not in field_map:
                  continue
-             assert field_map[descriptor.array_name].vcf_field is None
+             assert field_map[descriptor.array_name].source is None

              buff = self.init_partition_array(partition_index, descriptor.array_name)
-             source = self.icf.fields[descriptor.vcf_field].iter_values(
+             source = self.source.fields[descriptor.vcf_field].iter_values(
                  partition.start, partition.stop
              )
              for la in core.first_dim_slice_iter(
@@ -963,34 +922,17 @@ class VcfZarrWriter:
                  buff.buff[j] = descriptor.convert(value, la)
          self.finalise_partition_array(partition_index, buff)

-     def encode_alleles_partition(self, partition_index):
-         alleles = self.init_partition_array(partition_index, "variant_allele")
-         partition = self.metadata.partitions[partition_index]
-         ref_field = self.icf.fields["REF"]
-         alt_field = self.icf.fields["ALT"]
-
-         for ref, alt in zip(
-             ref_field.iter_values(partition.start, partition.stop),
-             alt_field.iter_values(partition.start, partition.stop),
-         ):
-             j = alleles.next_buffer_row()
-             alleles.buff[j, :] = constants.STR_FILL
-             alleles.buff[j, 0] = ref[0]
-             alleles.buff[j, 1 : 1 + len(alt)] = alt
-         self.finalise_partition_array(partition_index, alleles)
-
      def encode_id_partition(self, partition_index):
          vid = self.init_partition_array(partition_index, "variant_id")
          vid_mask = self.init_partition_array(partition_index, "variant_id_mask")
          partition = self.metadata.partitions[partition_index]
-         field = self.icf.fields["ID"]

-         for value in field.iter_values(partition.start, partition.stop):
+         for value in self.source.iter_id(partition.start, partition.stop):
              j = vid.next_buffer_row()
              k = vid_mask.next_buffer_row()
              assert j == k
              if value is not None:
-                 vid.buff[j] = value[0]
+                 vid.buff[j] = value
                  vid_mask.buff[j] = False
              else:
                  vid.buff[j] = constants.STR_MISSING
@@ -1000,37 +942,22 @@ class VcfZarrWriter:
          self.finalise_partition_array(partition_index, vid_mask)

      def encode_filters_partition(self, partition_index):
-         lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
          var_filter = self.init_partition_array(partition_index, "variant_filter")
          partition = self.metadata.partitions[partition_index]

-         field = self.icf.fields["FILTERS"]
-         for value in field.iter_values(partition.start, partition.stop):
+         for filter_values in self.source.iter_filters(partition.start, partition.stop):
              j = var_filter.next_buffer_row()
-             var_filter.buff[j] = False
-             for f in value:
-                 try:
-                     var_filter.buff[j, lookup[f]] = True
-                 except KeyError:
-                     raise ValueError(
-                         f"Filter '{f}' was not defined in the header."
-                     ) from None
+             var_filter.buff[j] = filter_values

          self.finalise_partition_array(partition_index, var_filter)

      def encode_contig_partition(self, partition_index):
-         lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
          contig = self.init_partition_array(partition_index, "variant_contig")
          partition = self.metadata.partitions[partition_index]
-         field = self.icf.fields["CHROM"]

-         for value in field.iter_values(partition.start, partition.stop):
+         for contig_index in self.source.iter_contig(partition.start, partition.stop):
              j = contig.next_buffer_row()
-             # Note: because we are using the indexes to define the lookups
-             # and we always have an index, it seems that we the contig lookup
-             # will always succeed. However, if anyone ever does hit a KeyError
-             # here, please do open an issue with a reproducible example!
-             contig.buff[j] = lookup[value[0]]
+             contig.buff[j] = contig_index

          self.finalise_partition_array(partition_index, contig)

@@ -1109,60 +1036,8 @@ class VcfZarrWriter:
      def create_index(self):
          """Create an index to support efficient region queries."""

-         root = zarr.open_group(store=self.path, mode="r+")
-
-         contig = root["variant_contig"]
-         pos = root["variant_position"]
-         length = root["variant_length"]
-
-         assert contig.cdata_shape == pos.cdata_shape
-
-         index = []
-
-         logger.info("Creating region index")
-         for v_chunk in range(pos.cdata_shape[0]):
-             c = contig.blocks[v_chunk]
-             p = pos.blocks[v_chunk]
-             e = p + length.blocks[v_chunk] - 1
-
-             # create a row for each contig in the chunk
-             d = np.diff(c, append=-1)
-             c_start_idx = 0
-             for c_end_idx in np.nonzero(d)[0]:
-                 assert c[c_start_idx] == c[c_end_idx]
-                 index.append(
-                     (
-                         v_chunk,  # chunk index
-                         c[c_start_idx],  # contig ID
-                         p[c_start_idx],  # start
-                         p[c_end_idx],  # end
-                         np.max(e[c_start_idx : c_end_idx + 1]),  # max end
-                         c_end_idx - c_start_idx + 1,  # num records
-                     )
-                 )
-                 c_start_idx = c_end_idx + 1
-
-         index = np.array(index, dtype=pos.dtype)
-         kwargs = {}
-         if not zarr_v3():
-             kwargs["dimension_separator"] = self.metadata.dimension_separator
-         array = root.array(
-             "region_index",
-             data=index,
-             shape=index.shape,
-             chunks=index.shape,
-             dtype=index.dtype,
-             compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
-             fill_value=None,
-             **kwargs,
-         )
-         array.attrs["_ARRAY_DIMENSIONS"] = [
-             "region_index_values",
-             "region_index_fields",
-         ]
-
-         logger.info("Consolidating Zarr metadata")
-         zarr.consolidate_metadata(self.path)
+         indexer = VcfZarrIndexer(self.path)
+         indexer.create_index()

      ######################
      # encode_all_partitions
@@ -1174,11 +1049,13 @@ class VcfZarrWriter:
          """
          max_encoding_mem = 0
          for array_spec in self.schema.fields:
-             max_encoding_mem = max(max_encoding_mem, array_spec.variant_chunk_nbytes)
+             max_encoding_mem = max(
+                 max_encoding_mem, array_spec.variant_chunk_nbytes(self.schema)
+             )
          gt_mem = 0
          if self.has_genotypes:
              gt_mem = sum(
-                 field.variant_chunk_nbytes
+                 field.variant_chunk_nbytes(self.schema)
                  for field in self.schema.fields
                  if field.name.startswith("call_genotype")
              )
@@ -1187,7 +1064,7 @@ class VcfZarrWriter:
      def encode_all_partitions(
          self, *, worker_processes=1, show_progress=False, max_memory=None
      ):
-         max_memory = parse_max_memory(max_memory)
+         max_memory = core.parse_max_memory(max_memory)
          self.load_metadata()
          num_partitions = self.num_partitions
          per_worker_memory = self.get_max_encoding_memory()
@@ -1229,147 +1106,106 @@ class VcfZarrWriter:
                  pwm.submit(self.encode_partition, partition_index)


- def mkschema(
-     if_path,
-     out,
-     *,
-     variants_chunk_size=None,
-     samples_chunk_size=None,
-     local_alleles=None,
- ):
-     store = icf.IntermediateColumnarFormat(if_path)
-     spec = VcfZarrSchema.generate(
-         store,
-         variants_chunk_size=variants_chunk_size,
-         samples_chunk_size=samples_chunk_size,
-         local_alleles=local_alleles,
-     )
-     out.write(spec.asjson())
-
-
- def encode(
-     if_path,
-     zarr_path,
-     schema_path=None,
-     variants_chunk_size=None,
-     samples_chunk_size=None,
-     max_variant_chunks=None,
-     dimension_separator=None,
-     max_memory=None,
-     local_alleles=None,
-     worker_processes=1,
-     show_progress=False,
- ):
-     # Rough heuristic to split work up enough to keep utilisation high
-     target_num_partitions = max(1, worker_processes * 4)
-     encode_init(
-         if_path,
-         zarr_path,
-         target_num_partitions,
-         schema_path=schema_path,
-         variants_chunk_size=variants_chunk_size,
-         samples_chunk_size=samples_chunk_size,
-         local_alleles=local_alleles,
-         max_variant_chunks=max_variant_chunks,
-         dimension_separator=dimension_separator,
-     )
-     vzw = VcfZarrWriter(zarr_path)
-     vzw.encode_all_partitions(
-         worker_processes=worker_processes,
-         show_progress=show_progress,
-         max_memory=max_memory,
-     )
-     vzw.finalise(show_progress)
-     vzw.create_index()
-
-
- def encode_init(
-     icf_path,
-     zarr_path,
-     target_num_partitions,
-     *,
-     schema_path=None,
-     variants_chunk_size=None,
-     samples_chunk_size=None,
-     local_alleles=None,
-     max_variant_chunks=None,
-     dimension_separator=None,
-     max_memory=None,
-     worker_processes=1,
-     show_progress=False,
- ):
-     icf_store = icf.IntermediateColumnarFormat(icf_path)
-     if schema_path is None:
-         schema = VcfZarrSchema.generate(
-             icf_store,
-             variants_chunk_size=variants_chunk_size,
-             samples_chunk_size=samples_chunk_size,
-             local_alleles=local_alleles,
-         )
-     else:
-         logger.info(f"Reading schema from {schema_path}")
-         if variants_chunk_size is not None or samples_chunk_size is not None:
+ class VcfZarr:
+     def __init__(self, path):
+         if not (path / ".zmetadata").exists():
+             raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+         self.path = path
+         self.root = zarr.open(path, mode="r")
+
+     def summary_table(self):
+         data = []
+         arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
+         arrays.sort(key=lambda x: x[0])
+         for stored, array in reversed(arrays):
+             d = {
+                 "name": array.name,
+                 "dtype": str(array.dtype),
+                 "stored": core.display_size(stored),
+                 "size": core.display_size(array.nbytes),
+                 "ratio": core.display_number(array.nbytes / stored),
+                 "nchunks": str(array.nchunks),
+                 "chunk_size": core.display_size(array.nbytes / array.nchunks),
+                 "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
+                 "shape": str(array.shape),
+                 "chunk_shape": str(array.chunks),
+                 "compressor": str(array.compressor),
+                 "filters": str(array.filters),
+             }
+             data.append(d)
+         return data
+
+
+ class VcfZarrIndexer:
+     """
+     Creates an index for efficient region queries in a VCF Zarr dataset.
+     """
+
+     def __init__(self, path):
+         self.path = pathlib.Path(path)
+
+     def create_index(self):
+         """Create an index to support efficient region queries."""
+         root = zarr.open_group(store=self.path, mode="r+")
+         if (
+             "variant_contig" not in root
+             or "variant_position" not in root
+             or "variant_length" not in root
+         ):
              raise ValueError(
-                 "Cannot specify schema along with chunk sizes"
-             )  # NEEDS TEST
-         with open(schema_path) as f:
-             schema = VcfZarrSchema.fromjson(f.read())
-     zarr_path = pathlib.Path(zarr_path)
-     vzw = VcfZarrWriter(zarr_path)
-     return vzw.init(
-         icf_store,
-         target_num_partitions=target_num_partitions,
-         schema=schema,
-         dimension_separator=dimension_separator,
-         max_variant_chunks=max_variant_chunks,
-     )
-
-
- def encode_partition(zarr_path, partition):
-     writer = VcfZarrWriter(zarr_path)
-     writer.encode_partition(partition)
-
-
- def encode_finalise(zarr_path, show_progress=False):
-     writer = VcfZarrWriter(zarr_path)
-     writer.finalise(show_progress=show_progress)
-
-
- def convert(
-     vcfs,
-     out_path,
-     *,
-     variants_chunk_size=None,
-     samples_chunk_size=None,
-     worker_processes=1,
-     local_alleles=None,
-     show_progress=False,
-     icf_path=None,
- ):
-     if icf_path is None:
-         cm = temp_icf_path(prefix="vcf2zarr")
-     else:
-         cm = contextlib.nullcontext(icf_path)
-
-     with cm as icf_path:
-         icf.explode(
-             icf_path,
-             vcfs,
-             worker_processes=worker_processes,
-             show_progress=show_progress,
-         )
-         encode(
-             icf_path,
-             out_path,
-             variants_chunk_size=variants_chunk_size,
-             samples_chunk_size=samples_chunk_size,
-             worker_processes=worker_processes,
-             show_progress=show_progress,
-             local_alleles=local_alleles,
-         )
+                 "Cannot create index: variant_contig, "
+                 "variant_position and variant_length arrays are required"
+             )
+
+         contig = root["variant_contig"]
+         pos = root["variant_position"]
+         length = root["variant_length"]
+
+         assert contig.cdata_shape == pos.cdata_shape

+         index = []

- @contextlib.contextmanager
- def temp_icf_path(prefix=None):
-     with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
-         yield pathlib.Path(tmp) / "icf"
+         logger.info("Creating region index")
+         for v_chunk in range(pos.cdata_shape[0]):
+             c = contig.blocks[v_chunk]
+             p = pos.blocks[v_chunk]
+             e = p + length.blocks[v_chunk] - 1
+
+             # create a row for each contig in the chunk
+             d = np.diff(c, append=-1)
+             c_start_idx = 0
+             for c_end_idx in np.nonzero(d)[0]:
+                 assert c[c_start_idx] == c[c_end_idx]
+                 index.append(
+                     (
+                         v_chunk,  # chunk index
+                         c[c_start_idx],  # contig ID
+                         p[c_start_idx],  # start
+                         p[c_end_idx],  # end
+                         np.max(e[c_start_idx : c_end_idx + 1]),  # max end
+                         c_end_idx - c_start_idx + 1,  # num records
+                     )
+                 )
+                 c_start_idx = c_end_idx + 1
+
+         index = np.array(index, dtype=pos.dtype)
+         kwargs = {}
+         if not zarr_utils.zarr_v3():
+             kwargs["dimension_separator"] = "/"
+         array = root.array(
+             "region_index",
+             data=index,
+             shape=index.shape,
+             chunks=index.shape,
+             dtype=index.dtype,
+             compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
+             fill_value=None,
+             **kwargs,
+         )
+         array.attrs["_ARRAY_DIMENSIONS"] = [
+             "region_index_values",
+             "region_index_fields",
+         ]
+
+         logger.info("Consolidating Zarr metadata")
+         zarr.consolidate_metadata(self.path)
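
Usage sketch (assumed workflow, hypothetical store path): because indexing now lives on its own class, the six-column region index (chunk index, contig, start, end, max end, record count) can be rebuilt on an existing store without re-encoding:

indexer = VcfZarrIndexer("example.vcz")  # hypothetical store path
indexer.create_index()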