bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff shows the changes between these publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.


@@ -0,0 +1,1053 @@
1
+ import dataclasses
2
+ import json
3
+ import logging
4
+ import os
5
+ import os.path
6
+ import pathlib
7
+ import shutil
8
+ import tempfile
9
+
10
+ import humanfriendly
11
+ import numcodecs
12
+ import numpy as np
13
+ import zarr
14
+
15
+ from .. import constants, core, provenance
16
+ from . import icf
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def inspect(path):
22
+ path = pathlib.Path(path)
23
+ # TODO add support for the Zarr format also
24
+ if (path / "metadata.json").exists():
25
+ obj = icf.IntermediateColumnarFormat(path)
26
+ elif (path / ".zmetadata").exists():
27
+ obj = VcfZarr(path)
28
+ else:
29
+ raise ValueError("Format not recognised") # NEEDS TEST
30
+ return obj.summary_table()
31
+
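As a usage sketch: inspect() returns a list of dicts, one per stored array, suitable for printing as a table. The path below is made up, and the keys shown are the ones built by VcfZarr.summary_table() later in this file.

for row in inspect("sample.vcz"):
    print(row["name"], row["dtype"], row["stored"], row["ratio"])
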
32
+
33
+ DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
34
+
35
+
36
+ @dataclasses.dataclass
37
+ class ZarrArraySpec:
38
+ name: str
39
+ dtype: str
40
+ shape: tuple
41
+ chunks: tuple
42
+ dimensions: tuple
43
+ description: str
44
+ vcf_field: str
45
+ compressor: dict
46
+ filters: list
47
+
48
+ def __post_init__(self):
49
+ # Ensure these are tuples for ease of comparison and consistency
50
+ self.shape = tuple(self.shape)
51
+ self.chunks = tuple(self.chunks)
52
+ self.dimensions = tuple(self.dimensions)
53
+ self.filters = tuple(self.filters)
54
+
55
+ @staticmethod
56
+ def new(**kwargs):
57
+ spec = ZarrArraySpec(
58
+ **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
59
+ )
60
+ spec._choose_compressor_settings()
61
+ return spec
62
+
63
+ @staticmethod
64
+ def from_field(
65
+ vcf_field,
66
+ *,
67
+ num_variants,
68
+ num_samples,
69
+ variants_chunk_size,
70
+ samples_chunk_size,
71
+ variable_name=None,
72
+ ):
73
+ shape = [num_variants]
74
+ prefix = "variant_"
75
+ dimensions = ["variants"]
76
+ chunks = [variants_chunk_size]
77
+ if vcf_field.category == "FORMAT":
78
+ prefix = "call_"
79
+ shape.append(num_samples)
80
+ chunks.append(samples_chunk_size)
81
+ dimensions.append("samples")
82
+ if variable_name is None:
83
+ variable_name = prefix + vcf_field.name
84
+ # TODO make an option to add in the empty extra dimension
85
+ if vcf_field.summary.max_number > 1:
86
+ shape.append(vcf_field.summary.max_number)
87
+ # TODO we should really be checking this to see if the named dimensions
88
+ # are actually correct.
89
+ if vcf_field.vcf_number == "R":
90
+ dimensions.append("alleles")
91
+ elif vcf_field.vcf_number == "A":
92
+ dimensions.append("alt_alleles")
93
+ elif vcf_field.vcf_number == "G":
94
+ dimensions.append("genotypes")
95
+ else:
96
+ dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
97
+ return ZarrArraySpec.new(
98
+ vcf_field=vcf_field.full_name,
99
+ name=variable_name,
100
+ dtype=vcf_field.smallest_dtype(),
101
+ shape=shape,
102
+ chunks=chunks,
103
+ dimensions=dimensions,
104
+ description=vcf_field.description,
105
+ )
106
+
107
+ def _choose_compressor_settings(self):
108
+ """
109
+ Choose compressor and filter settings based on the size and
110
+ type of the array, plus some heuristics from observed properties
111
+ of VCFs.
112
+
113
+ See https://github.com/pystatgen/bio2zarr/discussions/74
114
+ """
115
+ # Default is to not shuffle, because autoshuffle isn't recognised
116
+ # by many Zarr implementations, and shuffling can lead to worse
117
+ # performance in some cases anyway. Turning on shuffle should be a
118
+ # deliberate choice.
119
+ shuffle = numcodecs.Blosc.NOSHUFFLE
120
+ if self.name == "call_genotype" and self.dtype == "i1":
121
+ # call_genotype gets BITSHUFFLE by default as it gets
122
+ # significantly better compression (at a cost of slower
123
+ # decoding)
124
+ shuffle = numcodecs.Blosc.BITSHUFFLE
125
+ elif self.dtype == "bool":
126
+ shuffle = numcodecs.Blosc.BITSHUFFLE
127
+
128
+ self.compressor["shuffle"] = shuffle
129
+
130
+ @property
131
+ def chunk_nbytes(self):
132
+ """
133
+ Returns the nbytes for a single chunk in this array.
134
+ """
135
+ items = 1
136
+ dim = 0
137
+ for chunk_size in self.chunks:
138
+ size = min(chunk_size, self.shape[dim])
139
+ items *= size
140
+ dim += 1
141
+ # Include sizes for extra dimensions.
142
+ for size in self.shape[dim:]:
143
+ items *= size
144
+ dt = np.dtype(self.dtype)
145
+ return items * dt.itemsize
146
+
147
+ @property
148
+ def variant_chunk_nbytes(self):
149
+ """
150
+ Returns the nbytes for a single variant chunk of this array.
151
+ """
152
+ chunk_items = self.chunks[0]
153
+ for size in self.shape[1:]:
154
+ chunk_items *= size
155
+ dt = np.dtype(self.dtype)
156
+ if dt.kind == "O" and "samples" in self.dimensions:
157
+ logger.warning(
158
+ f"Field {self.name} is a string; max memory usage may "
159
+ "be a significant underestimate"
160
+ )
161
+ return chunk_items * dt.itemsize
162
+
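To make the chunk-size accounting concrete, a small illustrative example using the names defined above (the field name, shapes and dtype are made up):

spec = ZarrArraySpec.new(
    vcf_field="FORMAT/DP",                 # hypothetical FORMAT field
    name="call_DP",
    dtype="i2",
    shape=(100_000, 1_000),                # variants x samples
    chunks=(10_000, 1_000),
    dimensions=("variants", "samples"),
    description="",
)
spec.chunk_nbytes          # 10_000 * 1_000 * 2 bytes = 20_000_000
spec.variant_chunk_nbytes  # also 20_000_000: one variants-chunk across all samples
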
163
+
164
+ ZARR_SCHEMA_FORMAT_VERSION = "0.4"
165
+
166
+
167
+ @dataclasses.dataclass
168
+ class VcfZarrSchema(core.JsonDataclass):
169
+ format_version: str
170
+ samples_chunk_size: int
171
+ variants_chunk_size: int
172
+ samples: list
173
+ contigs: list
174
+ filters: list
175
+ fields: list
176
+
177
+ def validate(self):
178
+ """
179
+ Checks that the schema is well-formed and within required limits.
180
+ """
181
+ for field in self.fields:
182
+ # This is the Blosc max buffer size
183
+ if field.chunk_nbytes > 2147483647:
184
+ # TODO add some links to documentation here advising how to
185
+ # deal with PL values.
186
+ raise ValueError(
187
+ f"Field {field.name} chunks are too large "
188
+ f"({field.chunk_nbytes} > 2**31 - 1 bytes). "
189
+ "Either generate a schema and drop this field (if you don't "
190
+ "need it) or reduce the variant or sample chunk sizes."
191
+ )
192
+ # TODO other checks? There must be lots of ways people could mess
193
+ # up the schema leading to cryptic errors.
194
+
195
+ def field_map(self):
196
+ return {field.name: field for field in self.fields}
197
+
198
+ @staticmethod
199
+ def fromdict(d):
200
+ if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION:
201
+ raise ValueError(
202
+ "Zarr schema format version mismatch: "
203
+ f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
204
+ )
205
+ ret = VcfZarrSchema(**d)
206
+ ret.samples = [icf.Sample(**sd) for sd in d["samples"]]
207
+ ret.contigs = [icf.Contig(**sd) for sd in d["contigs"]]
208
+ ret.filters = [icf.Filter(**sd) for sd in d["filters"]]
209
+ ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
210
+ return ret
211
+
212
+ @staticmethod
213
+ def fromjson(s):
214
+ return VcfZarrSchema.fromdict(json.loads(s))
215
+
216
+ @staticmethod
217
+ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
218
+ m = icf.num_records
219
+ n = icf.num_samples
220
+ # FIXME
221
+ if samples_chunk_size is None:
222
+ samples_chunk_size = 1000
223
+ if variants_chunk_size is None:
224
+ variants_chunk_size = 10_000
225
+ logger.info(
226
+ f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
227
+ )
228
+
229
+ def spec_from_field(field, variable_name=None):
230
+ return ZarrArraySpec.from_field(
231
+ field,
232
+ num_samples=n,
233
+ num_variants=m,
234
+ samples_chunk_size=samples_chunk_size,
235
+ variants_chunk_size=variants_chunk_size,
236
+ variable_name=variable_name,
237
+ )
238
+
239
+ def fixed_field_spec(
240
+ name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
241
+ ):
242
+ return ZarrArraySpec.new(
243
+ vcf_field=vcf_field,
244
+ name=name,
245
+ dtype=dtype,
246
+ shape=shape,
247
+ description="",
248
+ dimensions=dimensions,
249
+ chunks=[variants_chunk_size],
250
+ )
251
+
252
+ alt_col = icf.fields["ALT"]
253
+ max_alleles = alt_col.vcf_field.summary.max_number + 1
254
+
255
+ colspecs = [
256
+ fixed_field_spec(
257
+ name="variant_contig",
258
+ dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
259
+ ),
260
+ fixed_field_spec(
261
+ name="variant_filter",
262
+ dtype="bool",
263
+ shape=(m, icf.metadata.num_filters),
264
+ dimensions=["variants", "filters"],
265
+ ),
266
+ fixed_field_spec(
267
+ name="variant_allele",
268
+ dtype="O",
269
+ shape=(m, max_alleles),
270
+ dimensions=["variants", "alleles"],
271
+ ),
272
+ fixed_field_spec(
273
+ name="variant_id",
274
+ dtype="O",
275
+ ),
276
+ fixed_field_spec(
277
+ name="variant_id_mask",
278
+ dtype="bool",
279
+ ),
280
+ ]
281
+ name_map = {field.full_name: field for field in icf.metadata.fields}
282
+
283
+ # Only two of the fixed fields have a direct one-to-one mapping.
284
+ colspecs.extend(
285
+ [
286
+ spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
287
+ spec_from_field(name_map["POS"], variable_name="variant_position"),
288
+ ]
289
+ )
290
+ colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])
291
+
292
+ gt_field = None
293
+ for field in icf.metadata.format_fields:
294
+ if field.name == "GT":
295
+ gt_field = field
296
+ continue
297
+ colspecs.append(spec_from_field(field))
298
+
299
+ if gt_field is not None:
300
+ ploidy = gt_field.summary.max_number - 1
301
+ shape = [m, n]
302
+ chunks = [variants_chunk_size, samples_chunk_size]
303
+ dimensions = ["variants", "samples"]
304
+ colspecs.append(
305
+ ZarrArraySpec.new(
306
+ vcf_field=None,
307
+ name="call_genotype_phased",
308
+ dtype="bool",
309
+ shape=list(shape),
310
+ chunks=list(chunks),
311
+ dimensions=list(dimensions),
312
+ description="",
313
+ )
314
+ )
315
+ shape += [ploidy]
316
+ dimensions += ["ploidy"]
317
+ colspecs.append(
318
+ ZarrArraySpec.new(
319
+ vcf_field=None,
320
+ name="call_genotype",
321
+ dtype=gt_field.smallest_dtype(),
322
+ shape=list(shape),
323
+ chunks=list(chunks),
324
+ dimensions=list(dimensions),
325
+ description="",
326
+ )
327
+ )
328
+ colspecs.append(
329
+ ZarrArraySpec.new(
330
+ vcf_field=None,
331
+ name="call_genotype_mask",
332
+ dtype="bool",
333
+ shape=list(shape),
334
+ chunks=list(chunks),
335
+ dimensions=list(dimensions),
336
+ description="",
337
+ )
338
+ )
339
+
340
+ return VcfZarrSchema(
341
+ format_version=ZARR_SCHEMA_FORMAT_VERSION,
342
+ samples_chunk_size=samples_chunk_size,
343
+ variants_chunk_size=variants_chunk_size,
344
+ fields=colspecs,
345
+ samples=icf.metadata.samples,
346
+ contigs=icf.metadata.contigs,
347
+ filters=icf.metadata.filters,
348
+ )
349
+
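The schema is intended to be generated, edited, and passed back in. A hedged sketch of that round trip, assuming an ICF at a hypothetical path and that the data has a FORMAT/DP field:

icf_store = icf.IntermediateColumnarFormat("sample.icf")
schema = VcfZarrSchema.generate(icf_store, variants_chunk_size=5_000)
schema.field_map()["call_DP"].compressor["cname"] = "lz4"  # assumes DP exists
roundtripped = VcfZarrSchema.fromjson(schema.asjson())
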
350
+
351
+ class VcfZarr:
352
+ def __init__(self, path):
353
+ if not (path / ".zmetadata").exists():
354
+ raise ValueError("Not in VcfZarr format") # NEEDS TEST
355
+ self.path = path
356
+ self.root = zarr.open(path, mode="r")
357
+
358
+ def summary_table(self):
359
+ data = []
360
+ arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
361
+ arrays.sort(key=lambda x: x[0])
362
+ for stored, array in reversed(arrays):
363
+ d = {
364
+ "name": array.name,
365
+ "dtype": str(array.dtype),
366
+ "stored": core.display_size(stored),
367
+ "size": core.display_size(array.nbytes),
368
+ "ratio": core.display_number(array.nbytes / stored),
369
+ "nchunks": str(array.nchunks),
370
+ "chunk_size": core.display_size(array.nbytes / array.nchunks),
371
+ "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
372
+ "shape": str(array.shape),
373
+ "chunk_shape": str(array.chunks),
374
+ "compressor": str(array.compressor),
375
+ "filters": str(array.filters),
376
+ }
377
+ data.append(d)
378
+ return data
379
+
380
+
381
+ def parse_max_memory(max_memory):
382
+ if max_memory is None:
383
+ # Effectively unbounded
384
+ return 2**63
385
+ if isinstance(max_memory, str):
386
+ max_memory = humanfriendly.parse_size(max_memory)
387
+ logger.info(f"Set memory budget to {core.display_size(max_memory)}")
388
+ return max_memory
389
+
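A quick behaviour sketch (string sizes follow humanfriendly's parsing):

parse_max_memory(None)        # 2**63, effectively unbounded
parse_max_memory("4GiB")      # 4294967296
parse_max_memory(8_000_000)   # ints pass through unchanged
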
390
+
391
+ @dataclasses.dataclass
392
+ class VcfZarrPartition:
393
+ start: int
394
+ stop: int
395
+
396
+ @staticmethod
397
+ def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
398
+ num_chunks = int(np.ceil(num_records / chunk_size))
399
+ if max_chunks is not None:
400
+ num_chunks = min(num_chunks, max_chunks)
401
+ partitions = []
402
+ splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
403
+ for chunk_slice in splits:
404
+ start_chunk = int(chunk_slice[0])
405
+ stop_chunk = int(chunk_slice[-1]) + 1
406
+ start_index = start_chunk * chunk_size
407
+ stop_index = min(stop_chunk * chunk_size, num_records)
408
+ partitions.append(VcfZarrPartition(start_index, stop_index))
409
+ return partitions
410
+
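A worked example of the partitioning, which always falls on variant-chunk boundaries:

VcfZarrPartition.generate_partitions(num_records=250, chunk_size=100, num_partitions=2)
# [VcfZarrPartition(start=0, stop=200), VcfZarrPartition(start=200, stop=250)]
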
411
+
412
+ VZW_METADATA_FORMAT_VERSION = "0.1"
413
+
414
+
415
+ @dataclasses.dataclass
416
+ class VcfZarrWriterMetadata(core.JsonDataclass):
417
+ format_version: str
418
+ icf_path: str
419
+ schema: VcfZarrSchema
420
+ dimension_separator: str
421
+ partitions: list
422
+ provenance: dict
423
+
424
+ @staticmethod
425
+ def fromdict(d):
426
+ if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
427
+ raise ValueError(
428
+ "VcfZarrWriter format version mismatch: "
429
+ f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
430
+ )
431
+ ret = VcfZarrWriterMetadata(**d)
432
+ ret.schema = VcfZarrSchema.fromdict(ret.schema)
433
+ ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
434
+ return ret
435
+
436
+
437
+ @dataclasses.dataclass
438
+ class VcfZarrWriteSummary(core.JsonDataclass):
439
+ num_partitions: int
440
+ num_samples: int
441
+ num_variants: int
442
+ num_chunks: int
443
+ max_encoding_memory: str
444
+
445
+
446
+ class VcfZarrWriter:
447
+ def __init__(self, path):
448
+ self.path = pathlib.Path(path)
449
+ self.wip_path = self.path / "wip"
450
+ self.arrays_path = self.wip_path / "arrays"
451
+ self.partitions_path = self.wip_path / "partitions"
452
+ self.metadata = None
453
+ self.icf = None
454
+
455
+ @property
456
+ def schema(self):
457
+ return self.metadata.schema
458
+
459
+ @property
460
+ def num_partitions(self):
461
+ return len(self.metadata.partitions)
462
+
463
+ def has_genotypes(self):
464
+ for field in self.schema.fields:
465
+ if field.name == "call_genotype":
466
+ return True
467
+ return False
468
+
469
+ #######################
470
+ # init
471
+ #######################
472
+
473
+ def init(
474
+ self,
475
+ icf,
476
+ *,
477
+ target_num_partitions,
478
+ schema,
479
+ dimension_separator=None,
480
+ max_variant_chunks=None,
481
+ ):
482
+ self.icf = icf
483
+ if self.path.exists():
484
+ raise ValueError("Zarr path already exists") # NEEDS TEST
485
+ schema.validate()
486
+ partitions = VcfZarrPartition.generate_partitions(
487
+ self.icf.num_records,
488
+ schema.variants_chunk_size,
489
+ target_num_partitions,
490
+ max_chunks=max_variant_chunks,
491
+ )
492
+ # Default to using nested directories following the Zarr v3 default.
493
+ # This seems to require version 2.17+ to work properly
494
+ dimension_separator = (
495
+ "/" if dimension_separator is None else dimension_separator
496
+ )
497
+ self.metadata = VcfZarrWriterMetadata(
498
+ format_version=VZW_METADATA_FORMAT_VERSION,
499
+ icf_path=str(self.icf.path),
500
+ schema=schema,
501
+ dimension_separator=dimension_separator,
502
+ partitions=partitions,
503
+ # Bare minimum here for provenance - see comments above
504
+ provenance={"source": f"bio2zarr-{provenance.__version__}"},
505
+ )
506
+
507
+ self.path.mkdir()
508
+ store = zarr.DirectoryStore(self.path)
509
+ root = zarr.group(store=store)
510
+ root.attrs.update(
511
+ {
512
+ "vcf_zarr_version": "0.2",
513
+ "vcf_header": self.icf.vcf_header,
514
+ "source": f"bio2zarr-{provenance.__version__}",
515
+ }
516
+ )
517
+ # Doing this synchronously - this is fine surely
518
+ self.encode_samples(root)
519
+ self.encode_filter_id(root)
520
+ self.encode_contig_id(root)
521
+
522
+ self.wip_path.mkdir()
523
+ self.arrays_path.mkdir()
524
+ self.partitions_path.mkdir()
525
+ store = zarr.DirectoryStore(self.arrays_path)
526
+ root = zarr.group(store=store)
527
+
528
+ total_chunks = 0
529
+ for field in self.schema.fields:
530
+ a = self.init_array(root, field, partitions[-1].stop)
531
+ total_chunks += a.nchunks
532
+
533
+ logger.info("Writing WIP metadata")
534
+ with open(self.wip_path / "metadata.json", "w") as f:
535
+ json.dump(self.metadata.asdict(), f, indent=4)
536
+
537
+ return VcfZarrWriteSummary(
538
+ num_variants=self.icf.num_records,
539
+ num_samples=self.icf.num_samples,
540
+ num_partitions=self.num_partitions,
541
+ num_chunks=total_chunks,
542
+ max_encoding_memory=core.display_size(self.get_max_encoding_memory()),
543
+ )
544
+
545
+ def encode_samples(self, root):
546
+ if self.schema.samples != self.icf.metadata.samples:
547
+ raise ValueError("Subsetting or reordering samples not supported currently")
548
+ array = root.array(
549
+ "sample_id",
550
+ [sample.id for sample in self.schema.samples],
551
+ dtype="str",
552
+ compressor=DEFAULT_ZARR_COMPRESSOR,
553
+ chunks=(self.schema.samples_chunk_size,),
554
+ )
555
+ array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
556
+ logger.debug("Samples done")
557
+
558
+ def encode_contig_id(self, root):
559
+ array = root.array(
560
+ "contig_id",
561
+ [contig.id for contig in self.schema.contigs],
562
+ dtype="str",
563
+ compressor=DEFAULT_ZARR_COMPRESSOR,
564
+ )
565
+ array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
566
+ if all(contig.length is not None for contig in self.schema.contigs):
567
+ array = root.array(
568
+ "contig_length",
569
+ [contig.length for contig in self.schema.contigs],
570
+ dtype=np.int64,
571
+ compressor=DEFAULT_ZARR_COMPRESSOR,
572
+ )
573
+ array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
574
+
575
+ def encode_filter_id(self, root):
576
+ # TODO need a way to store description also
577
+ # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
578
+ array = root.array(
579
+ "filter_id",
580
+ [filt.id for filt in self.schema.filters],
581
+ dtype="str",
582
+ compressor=DEFAULT_ZARR_COMPRESSOR,
583
+ )
584
+ array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
585
+
586
+ def init_array(self, root, variable, variants_dim_size):
587
+ object_codec = None
588
+ if variable.dtype == "O":
589
+ object_codec = numcodecs.VLenUTF8()
590
+ shape = list(variable.shape)
591
+ # Truncate the variants dimension if max_variant_chunks was specified
592
+ shape[0] = variants_dim_size
593
+ a = root.empty(
594
+ variable.name,
595
+ shape=shape,
596
+ chunks=variable.chunks,
597
+ dtype=variable.dtype,
598
+ compressor=numcodecs.get_codec(variable.compressor),
599
+ filters=[numcodecs.get_codec(filt) for filt in variable.filters],
600
+ object_codec=object_codec,
601
+ dimension_separator=self.metadata.dimension_separator,
602
+ )
603
+ a.attrs.update(
604
+ {
605
+ "description": variable.description,
606
+ # Dimension names are part of the spec in Zarr v3
607
+ "_ARRAY_DIMENSIONS": variable.dimensions,
608
+ }
609
+ )
610
+ logger.debug(f"Initialised {a}")
611
+ return a
612
+
613
+ #######################
614
+ # encode_partition
615
+ #######################
616
+
617
+ def load_metadata(self):
618
+ if self.metadata is None:
619
+ with open(self.wip_path / "metadata.json") as f:
620
+ self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
621
+ self.icf = icf.IntermediateColumnarFormat(self.metadata.icf_path)
622
+
623
+ def partition_path(self, partition_index):
624
+ return self.partitions_path / f"p{partition_index}"
625
+
626
+ def wip_partition_path(self, partition_index):
627
+ return self.partitions_path / f"wip_p{partition_index}"
628
+
629
+ def wip_partition_array_path(self, partition_index, name):
630
+ return self.wip_partition_path(partition_index) / name
631
+
632
+ def partition_array_path(self, partition_index, name):
633
+ return self.partition_path(partition_index) / name
634
+
635
+ def encode_partition(self, partition_index):
636
+ self.load_metadata()
637
+ if partition_index < 0 or partition_index >= self.num_partitions:
638
+ raise ValueError("Partition index not in the valid range")
639
+ partition_path = self.wip_partition_path(partition_index)
640
+ partition_path.mkdir(exist_ok=True)
641
+ logger.info(f"Encoding partition {partition_index} to {partition_path}")
642
+
643
+ self.encode_id_partition(partition_index)
644
+ self.encode_filters_partition(partition_index)
645
+ self.encode_contig_partition(partition_index)
646
+ self.encode_alleles_partition(partition_index)
647
+ for col in self.schema.fields:
648
+ if col.vcf_field is not None:
649
+ self.encode_array_partition(col, partition_index)
650
+ if self.has_genotypes():
651
+ self.encode_genotypes_partition(partition_index)
652
+
653
+ final_path = self.partition_path(partition_index)
654
+ logger.info(f"Finalising {partition_index} at {final_path}")
655
+ if final_path.exists():
656
+ logger.warning(f"Removing existing partition at {final_path}")
657
+ shutil.rmtree(final_path)
658
+ os.rename(partition_path, final_path)
659
+
660
+ def init_partition_array(self, partition_index, name):
661
+ # Create an empty array like the definition
662
+ src = self.arrays_path / name
663
+ # Overwrite any existing WIP files
664
+ wip_path = self.wip_partition_array_path(partition_index, name)
665
+ shutil.copytree(src, wip_path, dirs_exist_ok=True)
666
+ store = zarr.DirectoryStore(self.wip_partition_path(partition_index))
667
+ wip_root = zarr.group(store=store)
668
+ array = wip_root[name]
669
+ logger.debug(f"Opened empty array {array.name} <{array.dtype}> @ {wip_path}")
670
+ return array
671
+
672
+ def finalise_partition_array(self, partition_index, name):
673
+ logger.debug(f"Encoded {name} partition {partition_index}")
674
+
675
+ def encode_array_partition(self, column, partition_index):
676
+ array = self.init_partition_array(partition_index, column.name)
677
+
678
+ partition = self.metadata.partitions[partition_index]
679
+ ba = core.BufferedArray(array, partition.start)
680
+ source_col = self.icf.fields[column.vcf_field]
681
+ sanitiser = source_col.sanitiser_factory(ba.buff.shape)
682
+
683
+ for value in source_col.iter_values(partition.start, partition.stop):
684
+ # We write directly into the buffer in the sanitiser function
685
+ # to make it easier to reason about dimension padding
686
+ j = ba.next_buffer_row()
687
+ sanitiser(ba.buff, j, value)
688
+ ba.flush()
689
+ self.finalise_partition_array(partition_index, column.name)
690
+
691
+ def encode_genotypes_partition(self, partition_index):
692
+ gt_array = self.init_partition_array(partition_index, "call_genotype")
693
+ gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
694
+ gt_phased_array = self.init_partition_array(
695
+ partition_index, "call_genotype_phased"
696
+ )
697
+
698
+ partition = self.metadata.partitions[partition_index]
699
+ gt = core.BufferedArray(gt_array, partition.start)
700
+ gt_mask = core.BufferedArray(gt_mask_array, partition.start)
701
+ gt_phased = core.BufferedArray(gt_phased_array, partition.start)
702
+
703
+ source_col = self.icf.fields["FORMAT/GT"]
704
+ for value in source_col.iter_values(partition.start, partition.stop):
705
+ j = gt.next_buffer_row()
706
+ icf.sanitise_value_int_2d(gt.buff, j, value[:, :-1])
707
+ j = gt_phased.next_buffer_row()
708
+ icf.sanitise_value_int_1d(gt_phased.buff, j, value[:, -1])
709
+ # TODO check whether this is the correct semantics when we are padding
710
+ # with mixed ploidies?
711
+ j = gt_mask.next_buffer_row()
712
+ gt_mask.buff[j] = gt.buff[j] < 0
713
+ gt.flush()
714
+ gt_phased.flush()
715
+ gt_mask.flush()
716
+
717
+ self.finalise_partition_array(partition_index, "call_genotype")
718
+ self.finalise_partition_array(partition_index, "call_genotype_mask")
719
+ self.finalise_partition_array(partition_index, "call_genotype_phased")
720
+
721
+ def encode_alleles_partition(self, partition_index):
722
+ array_name = "variant_allele"
723
+ alleles_array = self.init_partition_array(partition_index, array_name)
724
+ partition = self.metadata.partitions[partition_index]
725
+ alleles = core.BufferedArray(alleles_array, partition.start)
726
+ ref_col = self.icf.fields["REF"]
727
+ alt_col = self.icf.fields["ALT"]
728
+
729
+ for ref, alt in zip(
730
+ ref_col.iter_values(partition.start, partition.stop),
731
+ alt_col.iter_values(partition.start, partition.stop),
732
+ ):
733
+ j = alleles.next_buffer_row()
734
+ alleles.buff[j, :] = constants.STR_FILL
735
+ alleles.buff[j, 0] = ref[0]
736
+ alleles.buff[j, 1 : 1 + len(alt)] = alt
737
+ alleles.flush()
738
+
739
+ self.finalise_partition_array(partition_index, array_name)
740
+
741
+ def encode_id_partition(self, partition_index):
742
+ vid_array = self.init_partition_array(partition_index, "variant_id")
743
+ vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
744
+ partition = self.metadata.partitions[partition_index]
745
+ vid = core.BufferedArray(vid_array, partition.start)
746
+ vid_mask = core.BufferedArray(vid_mask_array, partition.start)
747
+ col = self.icf.fields["ID"]
748
+
749
+ for value in col.iter_values(partition.start, partition.stop):
750
+ j = vid.next_buffer_row()
751
+ k = vid_mask.next_buffer_row()
752
+ assert j == k
753
+ if value is not None:
754
+ vid.buff[j] = value[0]
755
+ vid_mask.buff[j] = False
756
+ else:
757
+ vid.buff[j] = constants.STR_MISSING
758
+ vid_mask.buff[j] = True
759
+ vid.flush()
760
+ vid_mask.flush()
761
+
762
+ self.finalise_partition_array(partition_index, "variant_id")
763
+ self.finalise_partition_array(partition_index, "variant_id_mask")
764
+
765
+ def encode_filters_partition(self, partition_index):
766
+ lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
767
+ array_name = "variant_filter"
768
+ array = self.init_partition_array(partition_index, array_name)
769
+ partition = self.metadata.partitions[partition_index]
770
+ var_filter = core.BufferedArray(array, partition.start)
771
+
772
+ col = self.icf.fields["FILTERS"]
773
+ for value in col.iter_values(partition.start, partition.stop):
774
+ j = var_filter.next_buffer_row()
775
+ var_filter.buff[j] = False
776
+ for f in value:
777
+ try:
778
+ var_filter.buff[j, lookup[f]] = True
779
+ except KeyError:
780
+ raise ValueError(
781
+ f"Filter '{f}' was not defined in the header."
782
+ ) from None
783
+ var_filter.flush()
784
+
785
+ self.finalise_partition_array(partition_index, array_name)
786
+
787
+ def encode_contig_partition(self, partition_index):
788
+ lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
789
+ array_name = "variant_contig"
790
+ array = self.init_partition_array(partition_index, array_name)
791
+ partition = self.metadata.partitions[partition_index]
792
+ contig = core.BufferedArray(array, partition.start)
793
+ col = self.icf.fields["CHROM"]
794
+
795
+ for value in col.iter_values(partition.start, partition.stop):
796
+ j = contig.next_buffer_row()
797
+ # Note: because we are using the indexes to define the lookups
798
+ # and we always have an index, it seems that the contig lookup
799
+ # will always succeed. However, if anyone ever does hit a KeyError
800
+ # here, please do open an issue with a reproducible example!
801
+ contig.buff[j] = lookup[value[0]]
802
+ contig.flush()
803
+
804
+ self.finalise_partition_array(partition_index, array_name)
805
+
806
+ #######################
807
+ # finalise
808
+ #######################
809
+
810
+ def finalise_array(self, name):
811
+ logger.info(f"Finalising {name}")
812
+ final_path = self.path / name
813
+ if final_path.exists():
814
+ # NEEDS TEST
815
+ raise ValueError(f"Array {name} already exists")
816
+ for partition in range(self.num_partitions):
817
+ # Move all the files in partition dir to dest dir
818
+ src = self.partition_array_path(partition, name)
819
+ if not src.exists():
820
+ # Needs test
821
+ raise ValueError(f"Partition {partition} of {name} does not exist")
822
+ dest = self.arrays_path / name
823
+ # This is Zarr v2 specific. Chunks in v3 will start with a "c" prefix.
824
+ chunk_files = [
825
+ path for path in src.iterdir() if not path.name.startswith(".")
826
+ ]
827
+ # TODO check for a count of the number of files. If we require a
828
+ # dimension_separator of "/" then we could make stronger assertions
829
+ # here, as we'd always have num_variant_chunks
830
+ logger.debug(
831
+ f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
832
+ )
833
+ for chunk_file in chunk_files:
834
+ os.rename(chunk_file, dest / chunk_file.name)
835
+ # Finally, once all the chunks have moved into the arrays dir,
836
+ # we move it out of wip
837
+ os.rename(self.arrays_path / name, self.path / name)
838
+ core.update_progress(1)
839
+
840
+ def finalise(self, show_progress=False):
841
+ self.load_metadata()
842
+
843
+ logger.info(f"Scanning {self.num_partitions} partitions")
844
+ missing = []
845
+ # TODO may need a progress bar here
846
+ for partition_id in range(self.num_partitions):
847
+ if not self.partition_path(partition_id).exists():
848
+ missing.append(partition_id)
849
+ if len(missing) > 0:
850
+ raise FileNotFoundError(f"Partitions not encoded: {missing}")
851
+
852
+ progress_config = core.ProgressConfig(
853
+ total=len(self.schema.fields),
854
+ title="Finalise",
855
+ units="array",
856
+ show=show_progress,
857
+ )
858
+ # NOTE: it's not clear that adding more workers will make this quicker,
859
+ # as it's just going to be causing contention on the file system.
860
+ # Something to check empirically in some deployments.
861
+ # FIXME we're just using worker_processes=0 here to hook into the
862
+ # SynchronousExecutor which is intended for testing purposes so
863
+ # that we get test coverage. Should fix this either by allowing
864
+ # for multiple workers, or making a standard wrapper for tqdm
865
+ # that allows us to have a consistent look and feel.
866
+ with core.ParallelWorkManager(0, progress_config) as pwm:
867
+ for field in self.schema.fields:
868
+ pwm.submit(self.finalise_array, field.name)
869
+ logger.debug(f"Removing {self.wip_path}")
870
+ shutil.rmtree(self.wip_path)
871
+ logger.info("Consolidating Zarr metadata")
872
+ zarr.consolidate_metadata(self.path)
873
+
874
+ ######################
875
+ # encode_all_partitions
876
+ ######################
877
+
878
+ def get_max_encoding_memory(self):
879
+ """
880
+ Return the approximate maximum memory used to encode a variant chunk.
881
+ """
882
+ max_encoding_mem = 0
883
+ for col in self.schema.fields:
884
+ max_encoding_mem = max(max_encoding_mem, col.variant_chunk_nbytes)
885
+ gt_mem = 0
886
+ if self.has_genotypes():
887
+ gt_mem = sum(
888
+ field.variant_chunk_nbytes
889
+ for field in self.schema.fields
890
+ if field.name.startswith("call_genotype")
891
+ )
892
+ return max(max_encoding_mem, gt_mem)
893
+
894
+ def encode_all_partitions(
895
+ self, *, worker_processes=1, show_progress=False, max_memory=None
896
+ ):
897
+ max_memory = parse_max_memory(max_memory)
898
+ self.load_metadata()
899
+ num_partitions = self.num_partitions
900
+ per_worker_memory = self.get_max_encoding_memory()
901
+ logger.info(
902
+ f"Encoding Zarr over {num_partitions} partitions with "
903
+ f"{worker_processes} workers and {core.display_size(per_worker_memory)} "
904
+ "per worker"
905
+ )
906
+ # Each partition requires per_worker_memory bytes, so to prevent more than
907
+ # max_memory being used, we clamp the number of workers
908
+ max_num_workers = max_memory // per_worker_memory
909
+ if max_num_workers < worker_processes:
910
+ logger.warning(
911
+ f"Limiting number of workers to {max_num_workers} to "
912
+ "keep within specified memory budget of "
913
+ f"{core.display_size(max_memory)}"
914
+ )
915
+ if max_num_workers <= 0:
916
+ raise ValueError(
917
+ f"Insufficient memory to encode a partition:"
918
+ f"{core.display_size(per_worker_memory)} > "
919
+ f"{core.display_size(max_memory)}"
920
+ )
921
+ num_workers = min(max_num_workers, worker_processes)
922
+
923
+ total_bytes = 0
924
+ for col in self.schema.fields:
925
+ # Open the array definition to get the total size
926
+ total_bytes += zarr.open(self.arrays_path / col.name).nbytes
927
+
928
+ progress_config = core.ProgressConfig(
929
+ total=total_bytes,
930
+ title="Encode",
931
+ units="B",
932
+ show=show_progress,
933
+ )
934
+ with core.ParallelWorkManager(num_workers, progress_config) as pwm:
935
+ for partition_index in range(num_partitions):
936
+ pwm.submit(self.encode_partition, partition_index)
937
+
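To make the memory clamp above concrete, a worked example with made-up numbers:

per_worker_memory = 1_610_612_736                    # 1.5 GiB largest variant chunk
max_memory = parse_max_memory("4GiB")                # 4_294_967_296
max_num_workers = max_memory // per_worker_memory    # 2
num_workers = min(max_num_workers, 8)                # 8 workers requested, 2 used
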
938
+
939
+ def mkschema(if_path, out):
940
+ store = icf.IntermediateColumnarFormat(if_path)
941
+ spec = VcfZarrSchema.generate(store)
942
+ out.write(spec.asjson())
943
+
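A sketch of capturing a schema for hand-editing before encoding (paths are hypothetical):

with open("schema.json", "w") as out:
    mkschema("sample.icf", out)
# edit schema.json, then:
# encode("sample.icf", "sample.vcz", schema_path="schema.json")
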
944
+
945
+ def encode(
946
+ if_path,
947
+ zarr_path,
948
+ schema_path=None,
949
+ variants_chunk_size=None,
950
+ samples_chunk_size=None,
951
+ max_variant_chunks=None,
952
+ dimension_separator=None,
953
+ max_memory=None,
954
+ worker_processes=1,
955
+ show_progress=False,
956
+ ):
957
+ # Rough heuristic to split work up enough to keep utilisation high
958
+ target_num_partitions = max(1, worker_processes * 4)
959
+ encode_init(
960
+ if_path,
961
+ zarr_path,
962
+ target_num_partitions,
963
+ schema_path=schema_path,
964
+ variants_chunk_size=variants_chunk_size,
965
+ samples_chunk_size=samples_chunk_size,
966
+ max_variant_chunks=max_variant_chunks,
967
+ dimension_separator=dimension_separator,
968
+ )
969
+ vzw = VcfZarrWriter(zarr_path)
970
+ vzw.encode_all_partitions(
971
+ worker_processes=worker_processes,
972
+ show_progress=show_progress,
973
+ max_memory=max_memory,
974
+ )
975
+ vzw.finalise(show_progress)
976
+
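The single-call path, sketched with hypothetical paths and chunk sizes:

encode(
    "sample.icf",
    "sample.vcz",
    variants_chunk_size=10_000,
    samples_chunk_size=1_000,
    worker_processes=4,
    show_progress=True,
)
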
977
+
978
+ def encode_init(
979
+ icf_path,
980
+ zarr_path,
981
+ target_num_partitions,
982
+ *,
983
+ schema_path=None,
984
+ variants_chunk_size=None,
985
+ samples_chunk_size=None,
986
+ max_variant_chunks=None,
987
+ dimension_separator=None,
988
+ max_memory=None,
989
+ worker_processes=1,
990
+ show_progress=False,
991
+ ):
992
+ icf_store = icf.IntermediateColumnarFormat(icf_path)
993
+ if schema_path is None:
994
+ schema = VcfZarrSchema.generate(
995
+ icf_store,
996
+ variants_chunk_size=variants_chunk_size,
997
+ samples_chunk_size=samples_chunk_size,
998
+ )
999
+ else:
1000
+ logger.info(f"Reading schema from {schema_path}")
1001
+ if variants_chunk_size is not None or samples_chunk_size is not None:
1002
+ raise ValueError(
1003
+ "Cannot specify schema along with chunk sizes"
1004
+ ) # NEEDS TEST
1005
+ with open(schema_path) as f:
1006
+ schema = VcfZarrSchema.fromjson(f.read())
1007
+ zarr_path = pathlib.Path(zarr_path)
1008
+ vzw = VcfZarrWriter(zarr_path)
1009
+ return vzw.init(
1010
+ icf_store,
1011
+ target_num_partitions=target_num_partitions,
1012
+ schema=schema,
1013
+ dimension_separator=dimension_separator,
1014
+ max_variant_chunks=max_variant_chunks,
1015
+ )
1016
+
1017
+
1018
+ def encode_partition(zarr_path, partition):
1019
+ writer = VcfZarrWriter(zarr_path)
1020
+ writer.encode_partition(partition)
1021
+
1022
+
1023
+ def encode_finalise(zarr_path, show_progress=False):
1024
+ writer = VcfZarrWriter(zarr_path)
1025
+ writer.finalise(show_progress=show_progress)
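Together, encode_init / encode_partition / encode_finalise let the encode step be split across independent jobs. A hedged sketch (paths and the partition count are made up):

summary = encode_init("sample.icf", "sample.vcz", target_num_partitions=16)
for index in range(summary.num_partitions):
    encode_partition("sample.vcz", index)   # each call can run as a separate job
encode_finalise("sample.vcz", show_progress=True)
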
1026
+
1027
+
1028
+ def convert(
1029
+ vcfs,
1030
+ out_path,
1031
+ *,
1032
+ variants_chunk_size=None,
1033
+ samples_chunk_size=None,
1034
+ worker_processes=1,
1035
+ show_progress=False,
1036
+ # TODO add arguments to control location of tmpdir
1037
+ ):
1038
+ with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
1039
+ if_dir = pathlib.Path(tmp) / "icf"
1040
+ icf.explode(
1041
+ if_dir,
1042
+ vcfs,
1043
+ worker_processes=worker_processes,
1044
+ show_progress=show_progress,
1045
+ )
1046
+ encode(
1047
+ if_dir,
1048
+ out_path,
1049
+ variants_chunk_size=variants_chunk_size,
1050
+ samples_chunk_size=samples_chunk_size,
1051
+ worker_processes=worker_processes,
1052
+ show_progress=show_progress,
1053
+ )
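
Finally, convert() wraps the explode and encode steps behind a temporary ICF directory. A minimal end-to-end sketch with made-up file names:

convert(
    ["chr20.vcf.gz", "chr21.vcf.gz"],
    "combined.vcz",
    worker_processes=4,
    show_progress=True,
)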