bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,41 +1,29 @@
-import contextlib
+import abc
 import dataclasses
 import json
 import logging
 import os
-import os.path
 import pathlib
 import shutil
-import tempfile
 
-import humanfriendly
 import numcodecs
 import numpy as np
 import zarr
 
-from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3
-
-from .. import constants, core, provenance
-from . import icf
+from bio2zarr import constants, core, provenance, zarr_utils
 
 logger = logging.getLogger(__name__)
 
-
-def inspect(path):
-    path = pathlib.Path(path)
-    if not path.exists():
-        raise ValueError(f"Path not found: {path}")
-    if (path / "metadata.json").exists():
-        obj = icf.IntermediateColumnarFormat(path)
-    # NOTE: this is too strict, we should support more general Zarrs, see #276
-    elif (path / ".zmetadata").exists():
-        obj = VcfZarr(path)
-    else:
-        raise ValueError(f"{path} not in ICF or VCF Zarr format")
-    return obj.summary_table()
-
-
+ZARR_SCHEMA_FORMAT_VERSION = "0.6"
+DEFAULT_VARIANT_CHUNK_SIZE = 1000
+DEFAULT_SAMPLE_CHUNK_SIZE = 10_000
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+)
+DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+)
 
 _fixed_field_descriptions = {
     "variant_contig": "An identifier from the reference genome or an angle-bracketed ID"
@@ -49,131 +37,254 @@ _fixed_field_descriptions = {
 }
 
 
+@dataclasses.dataclass
+class VariantData:
+    """Represents variant data returned by iter_alleles_and_genotypes."""
+
+    variant_length: int
+    alleles: np.ndarray
+    genotypes: np.ndarray
+    phased: np.ndarray
+
+
+class Source(abc.ABC):
+    @property
+    @abc.abstractmethod
+    def path(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def num_records(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def num_samples(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def samples(self):
+        pass
+
+    @property
+    def contigs(self):
+        return None
+
+    @property
+    def filters(self):
+        return None
+
+    @property
+    def root_attrs(self):
+        return {}
+
+    @abc.abstractmethod
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        pass
+
+    def iter_id(self, start, stop):
+        return
+
+    def iter_contig(self, start, stop):
+        return
+
+    @abc.abstractmethod
+    def iter_field(self, field_name, shape, start, stop):
+        pass
+
+    @abc.abstractmethod
+    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
+        pass
+
+
+@dataclasses.dataclass
+class VcfZarrDimension:
+    size: int
+    chunk_size: int
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @classmethod
+    def fromdict(cls, d):
+        return cls(**d)
+
+    @classmethod
+    def unchunked(cls, size):
+        return cls(size, max(size, 1))
+
+
+def standard_dimensions(
+    *,
+    variants_size,
+    samples_size,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    alleles_size=None,
+    filters_size=None,
+    ploidy_size=None,
+    genotypes_size=None,
+):
+    """
+    Returns a dictionary mapping dimension names to definition for the standard
+    fields in a VCF.
+    """
+    if variants_chunk_size is None:
+        variants_chunk_size = max(1, min(variants_size, DEFAULT_VARIANT_CHUNK_SIZE))
+    if samples_chunk_size is None:
+        samples_chunk_size = max(1, min(samples_size, DEFAULT_SAMPLE_CHUNK_SIZE))
+
+    dimensions = {
+        "variants": VcfZarrDimension(variants_size, variants_chunk_size),
+        "samples": VcfZarrDimension(samples_size, samples_chunk_size),
+    }
+
+    if alleles_size is not None:
+        dimensions["alleles"] = VcfZarrDimension.unchunked(alleles_size)
+        if alleles_size > 1:
+            dimensions["alt_alleles"] = VcfZarrDimension.unchunked(alleles_size - 1)
+
+    if filters_size is not None:
+        dimensions["filters"] = VcfZarrDimension.unchunked(filters_size)
+
+    if ploidy_size is not None:
+        dimensions["ploidy"] = VcfZarrDimension.unchunked(ploidy_size)
+
+    if genotypes_size is not None:
+        dimensions["genotypes"] = VcfZarrDimension.unchunked(genotypes_size)
+
+    return dimensions
+
+
 @dataclasses.dataclass
 class ZarrArraySpec:
     name: str
     dtype: str
-    shape: tuple
-    chunks: tuple
     dimensions: tuple
     description: str
-    vcf_field: str
-    compressor: dict
-    filters: list
+    compressor: dict = None
+    filters: list = None
+    source: str = None
 
     def __post_init__(self):
         if self.name in _fixed_field_descriptions:
             self.description = self.description or _fixed_field_descriptions[self.name]
 
-        # Ensure these are tuples for ease of comparison and consistency
-        self.shape = tuple(self.shape)
-        self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
-        self.filters = tuple(self.filters)
+        self.filters = tuple(self.filters) if self.filters is not None else None
 
-    @staticmethod
-    def new(**kwargs):
-        spec = ZarrArraySpec(
-            **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
-        )
-        spec._choose_compressor_settings()
-        return spec
+    def get_shape(self, schema):
+        return schema.get_shape(self.dimensions)
+
+    def get_chunks(self, schema):
+        return schema.get_chunks(self.dimensions)
+
+    def get_chunk_nbytes(self, schema):
+        element_size = np.dtype(self.dtype).itemsize
+        chunks = self.get_chunks(schema)
+        shape = self.get_shape(schema)
+
+        # Calculate actual chunk size accounting for dimension limits
+        items = 1
+        for i, chunk_size in enumerate(chunks):
+            items *= min(chunk_size, shape[i])
+
+        # Include sizes for extra dimensions (if any)
+        if len(shape) > len(chunks):
+            for size in shape[len(chunks) :]:
+                items *= size
+
+        return element_size * items
 
     @staticmethod
     def from_field(
         vcf_field,
+        schema,
         *,
-        num_variants,
-        num_samples,
-        variants_chunk_size,
-        samples_chunk_size,
         array_name=None,
+        compressor=None,
+        filters=None,
     ):
-        shape = [num_variants]
         prefix = "variant_"
         dimensions = ["variants"]
-        chunks = [variants_chunk_size]
        if vcf_field.category == "FORMAT":
             prefix = "call_"
-            shape.append(num_samples)
-            chunks.append(samples_chunk_size)
             dimensions.append("samples")
         if array_name is None:
             array_name = prefix + vcf_field.name
-        # TODO make an option to add in the empty extra dimension
-        if vcf_field.summary.max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
-            shape.append(vcf_field.summary.max_number)
-            chunks.append(vcf_field.summary.max_number)
-            # TODO we should really be checking this to see if the named dimensions
-            # are actually correct.
-            if vcf_field.vcf_number == "R":
+
+        max_number = vcf_field.max_number
+        if vcf_field.vcf_number == "R":
+            max_alleles = schema.dimensions["alleles"].size
+            if max_number > max_alleles:
+                raise ValueError(
+                    f"Max number of values {max_number} exceeds max alleles "
+                    f"{max_alleles} for {vcf_field.full_name}"
+                )
+            if max_alleles > 0:
                 dimensions.append("alleles")
-            elif vcf_field.vcf_number == "A":
+        elif vcf_field.vcf_number == "A":
+            max_alt_alleles = schema.dimensions["alt_alleles"].size
+            if max_number > max_alt_alleles:
+                raise ValueError(
+                    f"Max number of values {max_number} exceeds max alt alleles "
+                    f"{max_alt_alleles} for {vcf_field.full_name}"
+                )
+            if max_alt_alleles > 0:
                 dimensions.append("alt_alleles")
-            elif vcf_field.vcf_number == "G":
+        elif vcf_field.vcf_number == "G":
+            max_genotypes = schema.dimensions["genotypes"].size
+            if max_number > max_genotypes:
+                raise ValueError(
+                    f"Max number of values {max_number} exceeds max genotypes "
+                    f"{max_genotypes} for {vcf_field.full_name}"
+                )
+            if max_genotypes > 0:
                 dimensions.append("genotypes")
-            else:
-                dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
-        return ZarrArraySpec.new(
-            vcf_field=vcf_field.full_name,
+        elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
+            dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
+            if dimensions[-1] not in schema.dimensions:
+                schema.dimensions[dimensions[-1]] = VcfZarrDimension.unchunked(
+                    vcf_field.max_number
+                )
+
+        return ZarrArraySpec(
+            source=vcf_field.full_name,
             name=array_name,
             dtype=vcf_field.smallest_dtype(),
-            shape=shape,
-            chunks=chunks,
             dimensions=dimensions,
             description=vcf_field.description,
+            compressor=compressor,
+            filters=filters,
         )
 
-    def _choose_compressor_settings(self):
-        """
-        Choose compressor and filter settings based on the size and
-        type of the array, plus some hueristics from observed properties
-        of VCFs.
-
-        See https://github.com/pystatgen/bio2zarr/discussions/74
-        """
-        # Default is to not shuffle, because autoshuffle isn't recognised
-        # by many Zarr implementations, and shuffling can lead to worse
-        # performance in some cases anyway. Turning on shuffle should be a
-        # deliberate choice.
-        shuffle = numcodecs.Blosc.NOSHUFFLE
-        if self.name == "call_genotype" and self.dtype == "i1":
-            # call_genotype gets BITSHUFFLE by default as it gets
-            # significantly better compression (at a cost of slower
-            # decoding)
-            shuffle = numcodecs.Blosc.BITSHUFFLE
-        elif self.dtype == "bool":
-            shuffle = numcodecs.Blosc.BITSHUFFLE
-
-        self.compressor["shuffle"] = shuffle
-
-    @property
-    def chunk_nbytes(self):
+    def chunk_nbytes(self, schema):
         """
         Returns the nbytes for a single chunk in this array.
         """
         items = 1
         dim = 0
-        for chunk_size in self.chunks:
-            size = min(chunk_size, self.shape[dim])
+        for chunk_size in self.get_chunks(schema):
+            size = min(chunk_size, self.get_shape(schema)[dim])
             items *= size
             dim += 1
         # Include sizes for extra dimensions.
-        for size in self.shape[dim:]:
+        for size in self.get_shape(schema)[dim:]:
             items *= size
         dt = np.dtype(self.dtype)
         return items * dt.itemsize
 
-    @property
-    def variant_chunk_nbytes(self):
+    def variant_chunk_nbytes(self, schema):
         """
         Returns the nbytes for a single variant chunk of this array.
         """
-        chunk_items = self.chunks[0]
-        for size in self.shape[1:]:
+        chunk_items = self.get_chunks(schema)[0]
+        for size in self.get_shape(schema)[1:]:
             chunk_items *= size
         dt = np.dtype(self.dtype)
-        if dt.kind == "O" and "samples" in self.dimensions:
+        if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions:
             logger.warning(
                 f"Field {self.name} is a string; max memory usage may "
                 "be a significant underestimate"
@@ -181,87 +292,71 @@ class ZarrArraySpec:
         return chunk_items * dt.itemsize
 
 
-ZARR_SCHEMA_FORMAT_VERSION = "0.4"
+@dataclasses.dataclass
+class Contig:
+    id: str
+    length: int = None
 
 
-def convert_local_allele_field_types(fields):
-    """
-    Update the specified list of fields to include the LAA field, and to convert
-    any supported localisable fields to the L* counterpart.
-
-    Note that we currently support only two ALT alleles per sample, and so the
-    dimensions of these fields are fixed by that requirement. Later versions may
-    use summry data storted in the ICF to make different choices, if information
-    about subsequent alleles (not in the actual genotype calls) should also be
-    stored.
-    """
-    fields_by_name = {field.name: field for field in fields}
-    gt = fields_by_name["call_genotype"]
-    if gt.shape[-1] != 2:
-        raise ValueError("Local alleles only supported on diploid data")
-
-    # TODO check if LA is already in here
-
-    shape = gt.shape[:-1]
-    chunks = gt.chunks[:-1]
-    dimensions = gt.dimensions[:-1]
-
-    la = ZarrArraySpec.new(
-        vcf_field=None,
-        name="call_LA",
-        dtype="i1",
-        shape=gt.shape,
-        chunks=gt.chunks,
-        dimensions=(*dimensions, "local_alleles"),
-        description=(
-            "0-based indices into REF+ALT, indicating which alleles"
-            " are relevant (local) for the current sample"
-        ),
-    )
-    ad = fields_by_name.get("call_AD", None)
-    if ad is not None:
-        # TODO check if call_LAD is in the list already
-        ad.name = "call_LAD"
-        ad.vcf_field = None
-        ad.shape = (*shape, 2)
-        ad.chunks = (*chunks, 2)
-        ad.dimensions = (*dimensions, "local_alleles")
-        ad.description += " (local-alleles)"
-
-    pl = fields_by_name.get("call_PL", None)
-    if pl is not None:
-        # TODO check if call_LPL is in the list already
-        pl.name = "call_LPL"
-        pl.vcf_field = None
-        pl.shape = (*shape, 3)
-        pl.chunks = (*chunks, 3)
-        pl.description += " (local-alleles)"
-        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
-    return [*fields, la]
+@dataclasses.dataclass
+class Sample:
+    id: str
+
+
+@dataclasses.dataclass
+class Filter:
+    id: str
+    description: str = ""
 
 
 @dataclasses.dataclass
 class VcfZarrSchema(core.JsonDataclass):
     format_version: str
-    samples_chunk_size: int
-    variants_chunk_size: int
-    samples: list
-    contigs: list
-    filters: list
+    dimensions: dict
     fields: list
+    defaults: dict
+
+    def __init__(
+        self,
+        format_version: str,
+        fields: list,
+        dimensions: dict,
+        defaults: dict = None,
+    ):
+        self.format_version = format_version
+        self.fields = fields
+        defaults = defaults.copy() if defaults is not None else {}
+        if defaults.get("compressor", None) is None:
+            defaults["compressor"] = DEFAULT_ZARR_COMPRESSOR.get_config()
+        if defaults.get("filters", None) is None:
+            defaults["filters"] = []
+        self.defaults = defaults
+        self.dimensions = dimensions
+
+    def get_shape(self, dimensions):
+        return [self.dimensions[dim].size for dim in dimensions]
+
+    def get_chunks(self, dimensions):
+        return [self.dimensions[dim].chunk_size for dim in dimensions]
 
     def validate(self):
         """
         Checks that the schema is well-formed and within required limits.
         """
         for field in self.fields:
+            for dim in field.dimensions:
+                if dim not in self.dimensions:
+                    raise ValueError(
+                        f"Dimension '{dim}' used in field '{field.name}' is "
+                        "not defined in the schema"
+                    )
+
+            chunk_nbytes = field.get_chunk_nbytes(self)
             # This is the Blosc max buffer size
-            if field.chunk_nbytes > 2147483647:
-                # TODO add some links to documentation here advising how to
-                # deal with PL values.
+            if chunk_nbytes > 2147483647:
                 raise ValueError(
                     f"Field {field.name} chunks are too large "
-                    f"({field.chunk_nbytes} > 2**31 - 1 bytes). "
+                    f"({chunk_nbytes} > 2**31 - 1 bytes). "
                     "Either generate a schema and drop this field (if you don't "
                     "need it) or reduce the variant or sample chunk sizes."
                 )
@@ -278,253 +373,30 @@ class VcfZarrSchema(core.JsonDataclass):
                 "Zarr schema format version mismatch: "
                 f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
             )
+
         ret = VcfZarrSchema(**d)
-        ret.samples = [icf.Sample(**sd) for sd in d["samples"]]
-        ret.contigs = [icf.Contig(**sd) for sd in d["contigs"]]
-        ret.filters = [icf.Filter(**sd) for sd in d["filters"]]
         ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
+        ret.dimensions = {
+            k: VcfZarrDimension.fromdict(v) for k, v in d["dimensions"].items()
+        }
+
         return ret
 
     @staticmethod
     def fromjson(s):
         return VcfZarrSchema.fromdict(json.loads(s))
 
-    @staticmethod
-    def generate(
-        icf, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
-    ):
-        m = icf.num_records
-        n = icf.num_samples
-        if samples_chunk_size is None:
-            samples_chunk_size = 10_000
-        if variants_chunk_size is None:
-            variants_chunk_size = 1000
-        if local_alleles is None:
-            local_alleles = False
-        logger.info(
-            f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
-        )
-
-        def spec_from_field(field, array_name=None):
-            return ZarrArraySpec.from_field(
-                field,
-                num_samples=n,
-                num_variants=m,
-                samples_chunk_size=samples_chunk_size,
-                variants_chunk_size=variants_chunk_size,
-                array_name=array_name,
-            )
 
-        def fixed_field_spec(
-            name,
-            dtype,
-            vcf_field=None,
-            shape=(m,),
-            dimensions=("variants",),
-            chunks=None,
-        ):
-            return ZarrArraySpec.new(
-                vcf_field=vcf_field,
-                name=name,
-                dtype=dtype,
-                shape=shape,
-                description="",
-                dimensions=dimensions,
-                chunks=chunks or [variants_chunk_size],
-            )
-
-        alt_field = icf.fields["ALT"]
-        max_alleles = alt_field.vcf_field.summary.max_number + 1
-
-        array_specs = [
-            fixed_field_spec(
-                name="variant_contig",
-                dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
-            ),
-            fixed_field_spec(
-                name="variant_filter",
-                dtype="bool",
-                shape=(m, icf.metadata.num_filters),
-                dimensions=["variants", "filters"],
-                chunks=(variants_chunk_size, icf.metadata.num_filters),
-            ),
-            fixed_field_spec(
-                name="variant_allele",
-                dtype="O",
-                shape=(m, max_alleles),
-                dimensions=["variants", "alleles"],
-                chunks=(variants_chunk_size, max_alleles),
-            ),
-            fixed_field_spec(
-                name="variant_id",
-                dtype="O",
-            ),
-            fixed_field_spec(
-                name="variant_id_mask",
-                dtype="bool",
-            ),
-        ]
-        name_map = {field.full_name: field for field in icf.metadata.fields}
-
-        # Only three of the fixed fields have a direct one-to-one mapping.
-        array_specs.extend(
-            [
-                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
-                spec_from_field(name_map["POS"], array_name="variant_position"),
-                spec_from_field(name_map["rlen"], array_name="variant_length"),
-            ]
-        )
-        array_specs.extend(
-            [spec_from_field(field) for field in icf.metadata.info_fields]
-        )
-
-        gt_field = None
-        for field in icf.metadata.format_fields:
-            if field.name == "GT":
-                gt_field = field
-                continue
-            array_specs.append(spec_from_field(field))
-
-        if gt_field is not None and n > 0:
-            ploidy = max(gt_field.summary.max_number - 1, 1)
-            shape = [m, n]
-            chunks = [variants_chunk_size, samples_chunk_size]
-            dimensions = ["variants", "samples"]
-            array_specs.append(
-                ZarrArraySpec.new(
-                    vcf_field=None,
-                    name="call_genotype_phased",
-                    dtype="bool",
-                    shape=list(shape),
-                    chunks=list(chunks),
-                    dimensions=list(dimensions),
-                    description="",
-                )
-            )
-            shape += [ploidy]
-            chunks += [ploidy]
-            dimensions += ["ploidy"]
-            array_specs.append(
-                ZarrArraySpec.new(
-                    vcf_field=None,
-                    name="call_genotype",
-                    dtype=gt_field.smallest_dtype(),
-                    shape=list(shape),
-                    chunks=list(chunks),
-                    dimensions=list(dimensions),
-                    description="",
-                )
-            )
-            array_specs.append(
-                ZarrArraySpec.new(
-                    vcf_field=None,
-                    name="call_genotype_mask",
-                    dtype="bool",
-                    shape=list(shape),
-                    chunks=list(chunks),
-                    dimensions=list(dimensions),
-                    description="",
-                )
-            )
-
-        if local_alleles:
-            array_specs = convert_local_allele_field_types(array_specs)
-
-        return VcfZarrSchema(
-            format_version=ZARR_SCHEMA_FORMAT_VERSION,
-            samples_chunk_size=samples_chunk_size,
-            variants_chunk_size=variants_chunk_size,
-            fields=array_specs,
-            samples=icf.metadata.samples,
-            contigs=icf.metadata.contigs,
-            filters=icf.metadata.filters,
-        )
-
-
-class VcfZarr:
-    def __init__(self, path):
-        if not (path / ".zmetadata").exists():
-            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
-        self.path = path
-        self.root = zarr.open(path, mode="r")
-
-    def summary_table(self):
-        data = []
-        arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
-        arrays.sort(key=lambda x: x[0])
-        for stored, array in reversed(arrays):
-            d = {
-                "name": array.name,
-                "dtype": str(array.dtype),
-                "stored": core.display_size(stored),
-                "size": core.display_size(array.nbytes),
-                "ratio": core.display_number(array.nbytes / stored),
-                "nchunks": str(array.nchunks),
-                "chunk_size": core.display_size(array.nbytes / array.nchunks),
-                "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
-                "shape": str(array.shape),
-                "chunk_shape": str(array.chunks),
-                "compressor": str(array.compressor),
-                "filters": str(array.filters),
-            }
-            data.append(d)
-        return data
-
-
-def parse_max_memory(max_memory):
-    if max_memory is None:
-        # Effectively unbounded
-        return 2**63
-    if isinstance(max_memory, str):
-        max_memory = humanfriendly.parse_size(max_memory)
-    logger.info(f"Set memory budget to {core.display_size(max_memory)}")
-    return max_memory
-
-
-@dataclasses.dataclass
-class VcfZarrPartition:
-    start: int
-    stop: int
-
-    @staticmethod
-    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
-        num_chunks = int(np.ceil(num_records / chunk_size))
-        if max_chunks is not None:
-            num_chunks = min(num_chunks, max_chunks)
-        partitions = []
-        splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
-        for chunk_slice in splits:
-            start_chunk = int(chunk_slice[0])
-            stop_chunk = int(chunk_slice[-1]) + 1
-            start_index = start_chunk * chunk_size
-            stop_index = min(stop_chunk * chunk_size, num_records)
-            partitions.append(VcfZarrPartition(start_index, stop_index))
-        return partitions
-
-
-VZW_METADATA_FORMAT_VERSION = "0.1"
-
-
-@dataclasses.dataclass
-class VcfZarrWriterMetadata(core.JsonDataclass):
-    format_version: str
-    icf_path: str
-    schema: VcfZarrSchema
-    dimension_separator: str
-    partitions: list
-    provenance: dict
-
-    @staticmethod
-    def fromdict(d):
-        if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
-            raise ValueError(
-                "VcfZarrWriter format version mismatch: "
-                f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
-            )
-        ret = VcfZarrWriterMetadata(**d)
-        ret.schema = VcfZarrSchema.fromdict(ret.schema)
-        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
-        return ret
+def sanitise_int_array(value, ndmin, dtype):
+    if isinstance(value, tuple):
+        value = [
+            constants.VCF_INT_MISSING if x is None else x for x in value
+        ]  # NEEDS TEST
+    value = np.array(value, ndmin=ndmin, copy=True)
+    value[value == constants.VCF_INT_MISSING] = -1
+    value[value == constants.VCF_INT_FILL] = -2
+    # TODO watch out for clipping here!
+    return value.astype(dtype)
 
 
 def compute_la_field(genotypes):
@@ -597,14 +469,60 @@ class LocalisableFieldDescriptor:
 
 localisable_fields = [
     LocalisableFieldDescriptor(
-        "call_LAD", "FORMAT/AD", icf.sanitise_int_array, compute_lad_field
+        "call_LAD", "FORMAT/AD", sanitise_int_array, compute_lad_field
     ),
     LocalisableFieldDescriptor(
-        "call_LPL", "FORMAT/PL", icf.sanitise_int_array, compute_lpl_field
+        "call_LPL", "FORMAT/PL", sanitise_int_array, compute_lpl_field
     ),
 ]
 
 
+@dataclasses.dataclass
+class VcfZarrPartition:
+    start: int
+    stop: int
+
+    @staticmethod
+    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
+        num_chunks = int(np.ceil(num_records / chunk_size))
+        if max_chunks is not None:
+            num_chunks = min(num_chunks, max_chunks)
+        partitions = []
+        splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
+        for chunk_slice in splits:
+            start_chunk = int(chunk_slice[0])
+            stop_chunk = int(chunk_slice[-1]) + 1
+            start_index = start_chunk * chunk_size
+            stop_index = min(stop_chunk * chunk_size, num_records)
+            partitions.append(VcfZarrPartition(start_index, stop_index))
+        return partitions
+
+
+VZW_METADATA_FORMAT_VERSION = "0.1"
+
+
+@dataclasses.dataclass
+class VcfZarrWriterMetadata(core.JsonDataclass):
+    format_version: str
+    source_path: str
+    schema: VcfZarrSchema
+    dimension_separator: str
+    partitions: list
+    provenance: dict
+
+    @staticmethod
+    def fromdict(d):
+        if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
+            raise ValueError(
+                "VcfZarrWriter format version mismatch: "
+                f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
+            )
+        ret = VcfZarrWriterMetadata(**d)
+        ret.schema = VcfZarrSchema.fromdict(ret.schema)
+        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
+        return ret
+
+
 @dataclasses.dataclass
 class VcfZarrWriteSummary(core.JsonDataclass):
     num_partitions: int
@@ -615,13 +533,14 @@ class VcfZarrWriteSummary(core.JsonDataclass):
 
 
 class VcfZarrWriter:
-    def __init__(self, path):
+    def __init__(self, source_type, path):
+        self.source_type = source_type
         self.path = pathlib.Path(path)
         self.wip_path = self.path / "wip"
         self.arrays_path = self.wip_path / "arrays"
         self.partitions_path = self.wip_path / "partitions"
         self.metadata = None
-        self.icf = None
+        self.source = None
 
     @property
     def schema(self):
@@ -639,7 +558,7 @@ class VcfZarrWriter:
 
     def has_local_alleles(self):
         for field in self.schema.fields:
-            if field.name == "call_LA" and field.vcf_field is None:
+            if field.name == "call_LA" and field.source is None:
                 return True
         return False
 
@@ -649,20 +568,20 @@ class VcfZarrWriter:
 
     def init(
         self,
-        icf,
+        source,
         *,
         target_num_partitions,
         schema,
         dimension_separator=None,
         max_variant_chunks=None,
     ):
-        self.icf = icf
+        self.source = source
         if self.path.exists():
             raise ValueError("Zarr path already exists")  # NEEDS TEST
         schema.validate()
         partitions = VcfZarrPartition.generate_partitions(
-            self.icf.num_records,
-            schema.variants_chunk_size,
+            self.source.num_records,
+            schema.get_chunks(["variants"])[0],
             target_num_partitions,
             max_chunks=max_variant_chunks,
         )
@@ -673,7 +592,7 @@ class VcfZarrWriter:
         )
         self.metadata = VcfZarrWriterMetadata(
             format_version=VZW_METADATA_FORMAT_VERSION,
-            icf_path=str(self.icf.path),
+            source_path=str(self.source.path),
             schema=schema,
             dimension_separator=dimension_separator,
             partitions=partitions,
@@ -682,27 +601,32 @@ class VcfZarrWriter:
         )
 
         self.path.mkdir()
-        root = zarr.open(store=self.path, mode="a", **ZARR_FORMAT_KWARGS)
+        root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS)
         root.attrs.update(
             {
-                "vcf_zarr_version": "0.2",
-                "vcf_header": self.icf.vcf_header,
+                "vcf_zarr_version": "0.4",
                 "source": f"bio2zarr-{provenance.__version__}",
             }
         )
-        # Doing this syncronously - this is fine surely
+        root.attrs.update(self.source.root_attrs)
+
+        # Doing this synchronously - this is fine surely
         self.encode_samples(root)
-        self.encode_filter_id(root)
-        self.encode_contig_id(root)
+        if self.source.filters is not None:
+            self.encode_filters(root)
+        if self.source.contigs is not None:
+            self.encode_contigs(root)
 
         self.wip_path.mkdir()
         self.arrays_path.mkdir()
         self.partitions_path.mkdir()
-        root = zarr.open(store=self.arrays_path, mode="a", **ZARR_FORMAT_KWARGS)
+        root = zarr.open(
+            store=self.arrays_path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS
+        )
 
         total_chunks = 0
         for field in self.schema.fields:
-            a = self.init_array(root, field, partitions[-1].stop)
+            a = self.init_array(root, self.metadata.schema, field, partitions[-1].stop)
            total_chunks += a.nchunks
 
         logger.info("Writing WIP metadata")
@@ -710,89 +634,111 @@ class VcfZarrWriter:
             json.dump(self.metadata.asdict(), f, indent=4)
 
         return VcfZarrWriteSummary(
-            num_variants=self.icf.num_records,
-            num_samples=self.icf.num_samples,
+            num_variants=self.source.num_records,
+            num_samples=self.source.num_samples,
             num_partitions=self.num_partitions,
             num_chunks=total_chunks,
             max_encoding_memory=core.display_size(self.get_max_encoding_memory()),
         )
 
     def encode_samples(self, root):
-        if self.schema.samples != self.icf.metadata.samples:
-            raise ValueError("Subsetting or reordering samples not supported currently")
-        array = root.array(
+        samples = self.source.samples
+        zarr_utils.create_group_array(
+            root,
             "sample_id",
-            data=[sample.id for sample in self.schema.samples],
-            shape=len(self.schema.samples),
+            data=[sample.id for sample in samples],
+            shape=len(samples),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
-            chunks=(self.schema.samples_chunk_size,),
+            chunks=(self.schema.get_chunks(["samples"])[0],),
+            dimension_names=["samples"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")
 
-    def encode_contig_id(self, root):
-        array = root.array(
+    def encode_contigs(self, root):
+        contigs = self.source.contigs
+        zarr_utils.create_group_array(
+            root,
             "contig_id",
-            data=[contig.id for contig in self.schema.contigs],
-            shape=len(self.schema.contigs),
+            data=[contig.id for contig in contigs],
+            shape=len(contigs),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["contigs"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if all(contig.length is not None for contig in self.schema.contigs):
-            array = root.array(
+        if all(contig.length is not None for contig in contigs):
+            zarr_utils.create_group_array(
+                root,
                 "contig_length",
-                data=[contig.length for contig in self.schema.contigs],
-                shape=len(self.schema.contigs),
+                data=[contig.length for contig in contigs],
+                shape=len(contigs),
                 dtype=np.int64,
                 compressor=DEFAULT_ZARR_COMPRESSOR,
+                dimension_names=["contigs"],
             )
-            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
 
-    def encode_filter_id(self, root):
-        # TODO need a way to store description also
-        # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
-        array = root.array(
+    def encode_filters(self, root):
+        filters = self.source.filters
+        zarr_utils.create_group_array(
+            root,
             "filter_id",
-            data=[filt.id for filt in self.schema.filters],
-            shape=len(self.schema.filters),
+            data=[filt.id for filt in filters],
+            shape=len(filters),
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["filters"],
+        )
+        zarr_utils.create_group_array(
+            root,
+            "filter_description",
+            data=[filt.description for filt in filters],
+            shape=len(filters),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["filters"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
 
-    def init_array(self, root, array_spec, variants_dim_size):
-        kwargs = dict(ZARR_FORMAT_KWARGS)
-        filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
-        if array_spec.dtype == "O":
-            if zarr_v3():
+    def init_array(self, root, schema, array_spec, variants_dim_size):
+        kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
+        filters = (
+            array_spec.filters
+            if array_spec.filters is not None
+            else schema.defaults["filters"]
+        )
+        filters = [numcodecs.get_codec(filt) for filt in filters]
+        compressor = (
+            array_spec.compressor
+            if array_spec.compressor is not None
+            else schema.defaults["compressor"]
+        )
+        compressor = numcodecs.get_codec(compressor)
+        if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME:
+            if zarr_utils.zarr_v3():
                 filters = [*list(filters), numcodecs.VLenUTF8()]
             else:
                 kwargs["object_codec"] = numcodecs.VLenUTF8()
 
-        if not zarr_v3():
+        if zarr_utils.zarr_v3():
+            # see https://github.com/zarr-developers/zarr-python/issues/3197
+            kwargs["fill_value"] = None
+        else:
             kwargs["dimension_separator"] = self.metadata.dimension_separator
 
-        shape = list(array_spec.shape)
-        # Truncate the variants dimension is max_variant_chunks was specified
+        shape = schema.get_shape(array_spec.dimensions)
+        # Truncate the variants dimension if max_variant_chunks was specified
         shape[0] = variants_dim_size
-        a = root.empty(
+        a = zarr_utils.create_empty_group_array(
+            root,
             name=array_spec.name,
             shape=shape,
-            chunks=array_spec.chunks,
+            chunks=schema.get_chunks(array_spec.dimensions),
             dtype=array_spec.dtype,
-            compressor=numcodecs.get_codec(array_spec.compressor),
+            compressor=compressor,
             filters=filters,
+            dimension_names=array_spec.dimensions,
             **kwargs,
         )
-        a.attrs.update(
-            {
-                "description": array_spec.description,
-                # Dimension names are part of the spec in Zarr v3
-                "_ARRAY_DIMENSIONS": array_spec.dimensions,
-            }
-        )
+        a.attrs.update({"description": array_spec.description})
         logger.debug(f"Initialised {a}")
         return a
 
@@ -804,7 +750,7 @@ class VcfZarrWriter:
         if self.metadata is None:
             with open(self.wip_path / "metadata.json") as f:
                 self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
-            self.icf = icf.IntermediateColumnarFormat(self.metadata.icf_path)
+            self.source = self.source_type(self.metadata.source_path)
 
     def partition_path(self, partition_index):
         return self.partitions_path / f"p{partition_index}"
@@ -826,15 +772,18 @@ class VcfZarrWriter:
         partition_path.mkdir(exist_ok=True)
         logger.info(f"Encoding partition {partition_index} to {partition_path}")
 
-        self.encode_id_partition(partition_index)
-        self.encode_filters_partition(partition_index)
-        self.encode_contig_partition(partition_index)
-        self.encode_alleles_partition(partition_index)
+        all_field_names = [field.name for field in self.schema.fields]
+        if "variant_id" in all_field_names:
+            self.encode_id_partition(partition_index)
+        if "variant_filter" in all_field_names:
+            self.encode_filters_partition(partition_index)
+        if "variant_contig" in all_field_names:
+            self.encode_contig_partition(partition_index)
+        self.encode_alleles_and_genotypes_partition(partition_index)
         for array_spec in self.schema.fields:
-            if array_spec.vcf_field is not None:
+            if array_spec.source is not None:
                 self.encode_array_partition(array_spec, partition_index)
         if self.has_genotypes():
-            self.encode_genotypes_partition(partition_index)
             self.encode_genotype_mask_partition(partition_index)
         if self.has_local_alleles():
             self.encode_local_alleles_partition(partition_index)
@@ -874,34 +823,48 @@ class VcfZarrWriter:
     def encode_array_partition(self, array_spec, partition_index):
         partition = self.metadata.partitions[partition_index]
         ba = self.init_partition_array(partition_index, array_spec.name)
-        source_field = self.icf.fields[array_spec.vcf_field]
-        sanitiser = source_field.sanitiser_factory(ba.buff.shape)
-
-        for value in source_field.iter_values(partition.start, partition.stop):
-            # We write directly into the buffer in the sanitiser function
-            # to make it easier to reason about dimension padding
+        for value in self.source.iter_field(
+            array_spec.source,
+            ba.buff.shape[1:],
+            partition.start,
+            partition.stop,
+        ):
             j = ba.next_buffer_row()
-            sanitiser(ba.buff, j, value)
+            ba.buff[j] = value
+
         self.finalise_partition_array(partition_index, ba)
 
-    def encode_genotypes_partition(self, partition_index):
+    def encode_alleles_and_genotypes_partition(self, partition_index):
         partition = self.metadata.partitions[partition_index]
-        gt = self.init_partition_array(partition_index, "call_genotype")
-        gt_phased = self.init_partition_array(partition_index, "call_genotype_phased")
-
-        source_field = self.icf.fields["FORMAT/GT"]
-        for value in source_field.iter_values(partition.start, partition.stop):
-            j = gt.next_buffer_row()
-            icf.sanitise_value_int_2d(
-                gt.buff, j, value[:, :-1] if value is not None else None
-            )
-            j = gt_phased.next_buffer_row()
-            icf.sanitise_value_int_1d(
-                gt_phased.buff, j, value[:, -1] if value is not None else None
+        alleles = self.init_partition_array(partition_index, "variant_allele")
+        variant_lengths = self.init_partition_array(partition_index, "variant_length")
+        has_gt = self.has_genotypes()
+        shape = None
+        if has_gt:
+            gt = self.init_partition_array(partition_index, "call_genotype")
+            gt_phased = self.init_partition_array(
+                partition_index, "call_genotype_phased"
             )
+            shape = gt.buff.shape[1:]
 
-        self.finalise_partition_array(partition_index, gt)
-        self.finalise_partition_array(partition_index, gt_phased)
+        for variant_data in self.source.iter_alleles_and_genotypes(
+            partition.start, partition.stop, shape, alleles.array.shape[1]
+        ):
+            j_alleles = alleles.next_buffer_row()
+            alleles.buff[j_alleles] = variant_data.alleles
+            j_variant_length = variant_lengths.next_buffer_row()
+            variant_lengths.buff[j_variant_length] = variant_data.variant_length
+            if has_gt:
+                j = gt.next_buffer_row()
+                gt.buff[j] = variant_data.genotypes
+                j_phased = gt_phased.next_buffer_row()
+                gt_phased.buff[j_phased] = variant_data.phased
+
+        self.finalise_partition_array(partition_index, alleles)
+        self.finalise_partition_array(partition_index, variant_lengths)
+        if has_gt:
+            self.finalise_partition_array(partition_index, gt)
+            self.finalise_partition_array(partition_index, gt_phased)
 
     def encode_genotype_mask_partition(self, partition_index):
         partition = self.metadata.partitions[partition_index]
@@ -948,10 +911,10 @@ class VcfZarrWriter:
         for descriptor in localisable_fields:
             if descriptor.array_name not in field_map:
                 continue
-            assert field_map[descriptor.array_name].vcf_field is None
+            assert field_map[descriptor.array_name].source is None
 
             buff = self.init_partition_array(partition_index, descriptor.array_name)
-            source = self.icf.fields[descriptor.vcf_field].iter_values(
+            source = self.source.fields[descriptor.vcf_field].iter_values(
                 partition.start, partition.stop
             )
             for la in core.first_dim_slice_iter(
@@ -963,34 +926,17 @@ class VcfZarrWriter:
                 buff.buff[j] = descriptor.convert(value, la)
         self.finalise_partition_array(partition_index, buff)
 
-    def encode_alleles_partition(self, partition_index):
-        alleles = self.init_partition_array(partition_index, "variant_allele")
-        partition = self.metadata.partitions[partition_index]
-        ref_field = self.icf.fields["REF"]
-        alt_field = self.icf.fields["ALT"]
-
-        for ref, alt in zip(
-            ref_field.iter_values(partition.start, partition.stop),
-            alt_field.iter_values(partition.start, partition.stop),
-        ):
-            j = alleles.next_buffer_row()
-            alleles.buff[j, :] = constants.STR_FILL
-            alleles.buff[j, 0] = ref[0]
-            alleles.buff[j, 1 : 1 + len(alt)] = alt
-        self.finalise_partition_array(partition_index, alleles)
-
     def encode_id_partition(self, partition_index):
         vid = self.init_partition_array(partition_index, "variant_id")
         vid_mask = self.init_partition_array(partition_index, "variant_id_mask")
         partition = self.metadata.partitions[partition_index]
-        field = self.icf.fields["ID"]
 
-        for value in field.iter_values(partition.start, partition.stop):
+        for value in self.source.iter_id(partition.start, partition.stop):
             j = vid.next_buffer_row()
             k = vid_mask.next_buffer_row()
             assert j == k
             if value is not None:
-                vid.buff[j] = value[0]
+                vid.buff[j] = value
                 vid_mask.buff[j] = False
             else:
                 vid.buff[j] = constants.STR_MISSING
@@ -1000,37 +946,22 @@ class VcfZarrWriter:
         self.finalise_partition_array(partition_index, vid_mask)
 
     def encode_filters_partition(self, partition_index):
-        lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
         var_filter = self.init_partition_array(partition_index, "variant_filter")
         partition = self.metadata.partitions[partition_index]
 
-        field = self.icf.fields["FILTERS"]
-        for value in field.iter_values(partition.start, partition.stop):
+        for filter_values in self.source.iter_filters(partition.start, partition.stop):
             j = var_filter.next_buffer_row()
-            var_filter.buff[j] = False
-            for f in value:
-                try:
-                    var_filter.buff[j, lookup[f]] = True
-                except KeyError:
-                    raise ValueError(
-                        f"Filter '{f}' was not defined in the header."
-                    ) from None
+            var_filter.buff[j] = filter_values
 
         self.finalise_partition_array(partition_index, var_filter)
 
     def encode_contig_partition(self, partition_index):
-        lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
         contig = self.init_partition_array(partition_index, "variant_contig")
         partition = self.metadata.partitions[partition_index]
-        field = self.icf.fields["CHROM"]
 
-        for value in field.iter_values(partition.start, partition.stop):
+        for contig_index in self.source.iter_contig(partition.start, partition.stop):
             j = contig.next_buffer_row()
-            # Note: because we are using the indexes to define the lookups
-            # and we always have an index, it seems that we the contig lookup
-            # will always succeed. However, if anyone ever does hit a KeyError
-            # here, please do open an issue with a reproducible example!
-            contig.buff[j] = lookup[value[0]]
+            contig.buff[j] = contig_index
 
         self.finalise_partition_array(partition_index, contig)
 
@@ -1050,19 +981,7 @@ class VcfZarrWriter:
             if not src.exists():
                 # Needs test
                 raise ValueError(f"Partition {partition} of {name} does not exist")
-            dest = self.arrays_path / name
-            # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
-            chunk_files = [
-                path for path in src.iterdir() if not path.name.startswith(".")
-            ]
-            # TODO check for a count of then number of files. If we require a
-            # dimension_separator of "/" then we could make stronger assertions
-            # here, as we'd always have num_variant_chunks
-            logger.debug(
-                f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
-            )
-            for chunk_file in chunk_files:
-                os.rename(chunk_file, dest / chunk_file.name)
+            zarr_utils.move_chunks(src, self.arrays_path, partition, name)
         # Finally, once all the chunks have moved into the arrays dir,
         # we move it out of wip
         os.rename(self.arrays_path / name, self.path / name)
@@ -1109,60 +1028,8 @@ class VcfZarrWriter:
     def create_index(self):
         """Create an index to support efficient region queries."""
 
-        root = zarr.open_group(store=self.path, mode="r+")
-
-        contig = root["variant_contig"]
-        pos = root["variant_position"]
-        length = root["variant_length"]
-
-        assert contig.cdata_shape == pos.cdata_shape
-
-        index = []
-
-        logger.info("Creating region index")
-        for v_chunk in range(pos.cdata_shape[0]):
-            c = contig.blocks[v_chunk]
-            p = pos.blocks[v_chunk]
-            e = p + length.blocks[v_chunk] - 1
-
-            # create a row for each contig in the chunk
-            d = np.diff(c, append=-1)
-            c_start_idx = 0
-            for c_end_idx in np.nonzero(d)[0]:
-                assert c[c_start_idx] == c[c_end_idx]
-                index.append(
-                    (
-                        v_chunk,  # chunk index
-                        c[c_start_idx],  # contig ID
-                        p[c_start_idx],  # start
-                        p[c_end_idx],  # end
-                        np.max(e[c_start_idx : c_end_idx + 1]),  # max end
-                        c_end_idx - c_start_idx + 1,  # num records
-                    )
-                )
-                c_start_idx = c_end_idx + 1
-
-        index = np.array(index, dtype=pos.dtype)
-        kwargs = {}
-        if not zarr_v3():
-            kwargs["dimension_separator"] = self.metadata.dimension_separator
-        array = root.array(
-            "region_index",
-            data=index,
-            shape=index.shape,
-            chunks=index.shape,
-            dtype=index.dtype,
-            compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
-            fill_value=None,
-            **kwargs,
-        )
-        array.attrs["_ARRAY_DIMENSIONS"] = [
-            "region_index_values",
-            "region_index_fields",
-        ]
-
-        logger.info("Consolidating Zarr metadata")
-        zarr.consolidate_metadata(self.path)
+        indexer = VcfZarrIndexer(self.path)
+        indexer.create_index()
 
     ######################
     # encode_all_partitions
@@ -1174,11 +1041,13 @@ class VcfZarrWriter:
         """
         max_encoding_mem = 0
         for array_spec in self.schema.fields:
-            max_encoding_mem = max(max_encoding_mem, array_spec.variant_chunk_nbytes)
+            max_encoding_mem = max(
+                max_encoding_mem, array_spec.variant_chunk_nbytes(self.schema)
+            )
         gt_mem = 0
         if self.has_genotypes:
             gt_mem = sum(
-                field.variant_chunk_nbytes
+                field.variant_chunk_nbytes(self.schema)
                 for field in self.schema.fields
                 if field.name.startswith("call_genotype")
             )
@@ -1187,7 +1056,7 @@ class VcfZarrWriter:
     def encode_all_partitions(
         self, *, worker_processes=1, show_progress=False, max_memory=None
    ):
-        max_memory = parse_max_memory(max_memory)
+        max_memory = core.parse_max_memory(max_memory)
         self.load_metadata()
         num_partitions = self.num_partitions
         per_worker_memory = self.get_max_encoding_memory()
@@ -1229,147 +1098,107 @@ class VcfZarrWriter:
                 pwm.submit(self.encode_partition, partition_index)
 
 
-def mkschema(
-    if_path,
-    out,
-    *,
-    variants_chunk_size=None,
-    samples_chunk_size=None,
-    local_alleles=None,
-):
-    store = icf.IntermediateColumnarFormat(if_path)
-    spec = VcfZarrSchema.generate(
-        store,
-        variants_chunk_size=variants_chunk_size,
-        samples_chunk_size=samples_chunk_size,
-        local_alleles=local_alleles,
-    )
-    out.write(spec.asjson())
-
-
-def encode(
-    if_path,
-    zarr_path,
-    schema_path=None,
-    variants_chunk_size=None,
-    samples_chunk_size=None,
-    max_variant_chunks=None,
-    dimension_separator=None,
-    max_memory=None,
-    local_alleles=None,
-    worker_processes=1,
-    show_progress=False,
-):
-    # Rough heuristic to split work up enough to keep utilisation high
-    target_num_partitions = max(1, worker_processes * 4)
-    encode_init(
-        if_path,
-        zarr_path,
-        target_num_partitions,
-        schema_path=schema_path,
-        variants_chunk_size=variants_chunk_size,
-        samples_chunk_size=samples_chunk_size,
-        local_alleles=local_alleles,
-        max_variant_chunks=max_variant_chunks,
-        dimension_separator=dimension_separator,
-    )
-    vzw = VcfZarrWriter(zarr_path)
-    vzw.encode_all_partitions(
-        worker_processes=worker_processes,
-        show_progress=show_progress,
-        max_memory=max_memory,
-    )
-    vzw.finalise(show_progress)
-    vzw.create_index()
-
-
-def encode_init(
-    icf_path,
-    zarr_path,
-    target_num_partitions,
-    *,
-    schema_path=None,
-    variants_chunk_size=None,
-    samples_chunk_size=None,
-    local_alleles=None,
-    max_variant_chunks=None,
-    dimension_separator=None,
-    max_memory=None,
-    worker_processes=1,
-    show_progress=False,
-):
-    icf_store = icf.IntermediateColumnarFormat(icf_path)
-    if schema_path is None:
-        schema = VcfZarrSchema.generate(
-            icf_store,
-            variants_chunk_size=variants_chunk_size,
-            samples_chunk_size=samples_chunk_size,
-            local_alleles=local_alleles,
-        )
-    else:
-        logger.info(f"Reading schema from {schema_path}")
-        if variants_chunk_size is not None or samples_chunk_size is not None:
+class VcfZarr:
+    def __init__(self, path):
+        if not zarr_utils.zarr_exists(path):
+            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+        self.path = path
+        self.root = zarr.open(path, mode="r")
+
+    def summary_table(self):
+        data = []
+        arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
+        arrays.sort(key=lambda x: x[0])
+        for stored, array in reversed(arrays):
+            d = {
+                "name": array.name,
+                "dtype": str(array.dtype),
+                "stored": core.display_size(stored),
+                "size": core.display_size(array.nbytes),
+                "ratio": core.display_number(array.nbytes / stored),
+                "nchunks": str(array.nchunks),
+                "chunk_size": core.display_size(array.nbytes / array.nchunks),
+                "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
+                "shape": str(array.shape),
+                "chunk_shape": str(array.chunks),
+                "compressor": str(zarr_utils.get_compressor(array)),
+                "filters": str(array.filters),
+            }
+            data.append(d)
+        return data
+
+
+class VcfZarrIndexer:
+    """
+    Creates an index for efficient region queries in a VCF Zarr dataset.
+    """
+
+    def __init__(self, path):
+        self.path = pathlib.Path(path)
+
+    def create_index(self):
+        """Create an index to support efficient region queries."""
+        root = zarr.open_group(store=self.path, mode="r+")
+        if (
+            "variant_contig" not in root
+            or "variant_position" not in root
+            or "variant_length" not in root
+        ):
             raise ValueError(
-                "Cannot specify schema along with chunk sizes"
-            )  # NEEDS TEST
-        with open(schema_path) as f:
-            schema = VcfZarrSchema.fromjson(f.read())
-    zarr_path = pathlib.Path(zarr_path)
-    vzw = VcfZarrWriter(zarr_path)
-    return vzw.init(
-        icf_store,
-        target_num_partitions=target_num_partitions,
-        schema=schema,
-        dimension_separator=dimension_separator,
-        max_variant_chunks=max_variant_chunks,
-    )
-
-
-def encode_partition(zarr_path, partition):
-    writer = VcfZarrWriter(zarr_path)
-    writer.encode_partition(partition)
-
-
-def encode_finalise(zarr_path, show_progress=False):
-    writer = VcfZarrWriter(zarr_path)
-    writer.finalise(show_progress=show_progress)
-
-
-def convert(
-    vcfs,
-    out_path,
-    *,
-    variants_chunk_size=None,
-    samples_chunk_size=None,
-    worker_processes=1,
-    local_alleles=None,
-    show_progress=False,
-    icf_path=None,
-):
-    if icf_path is None:
-        cm = temp_icf_path(prefix="vcf2zarr")
-    else:
-        cm = contextlib.nullcontext(icf_path)
-
-    with cm as icf_path:
-        icf.explode(
-            icf_path,
-            vcfs,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-        )
-        encode(
-            icf_path,
-            out_path,
-            variants_chunk_size=variants_chunk_size,
-            samples_chunk_size=samples_chunk_size,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-            local_alleles=local_alleles,
-        )
+                "Cannot create index: variant_contig, "
+                "variant_position and variant_length arrays are required"
+            )
+
+        contig = root["variant_contig"]
+        pos = root["variant_position"]
+        length = root["variant_length"]
+
+        assert contig.cdata_shape == pos.cdata_shape
 
+        index = []
+
+        logger.info("Creating region index")
+        for v_chunk in range(pos.cdata_shape[0]):
+            c = contig.blocks[v_chunk]
+            p = pos.blocks[v_chunk]
+            e = p + length.blocks[v_chunk] - 1
 
-@contextlib.contextmanager
-def temp_icf_path(prefix=None):
-    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
-        yield pathlib.Path(tmp) / "icf"
+            # create a row for each contig in the chunk
+            d = np.diff(c, append=-1)
+            c_start_idx = 0
+            for c_end_idx in np.nonzero(d)[0]:
+                assert c[c_start_idx] == c[c_end_idx]
+                index.append(
+                    (
+                        v_chunk,  # chunk index
+                        c[c_start_idx],  # contig ID
+                        p[c_start_idx],  # start
+                        p[c_end_idx],  # end
+                        np.max(e[c_start_idx : c_end_idx + 1]),  # max end
+                        c_end_idx - c_start_idx + 1,  # num records
+                    )
+                )
+                c_start_idx = c_end_idx + 1
+
+        index = np.array(index, dtype=pos.dtype)
+        kwargs = {}
+        if not zarr_utils.zarr_v3():
+            kwargs["dimension_separator"] = "/"
+        zarr_utils.create_group_array(
+            root,
+            "region_index",
+            data=index,
+            shape=index.shape,
+            chunks=index.shape,
+            dtype=index.dtype,
+            compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
+            fill_value=None,
+            dimension_names=[
+                "region_index_values",
+                "region_index_fields",
+            ],
+            **kwargs,
+        )
+
+        logger.info("Consolidating Zarr metadata")
+        zarr.consolidate_metadata(self.path)