bio2zarr 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

@@ -0,0 +1,1017 @@
1
+ import dataclasses
2
+ import json
3
+ import logging
4
+ import os
5
+ import os.path
6
+ import pathlib
7
+ import shutil
8
+ import tempfile
9
+
10
+ import humanfriendly
11
+ import numcodecs
12
+ import numpy as np
13
+ import zarr
14
+
15
+ from .. import constants, core, provenance
16
+ from . import icf
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def inspect(path):
    """
    Return the summary table for the dataset stored at ``path``.

    The on-disk format is detected from marker files, checked in this
    order: a ``metadata.json`` file selects the intermediate columnar
    format, a ``.zmetadata`` file selects VCF Zarr.

    Raises ValueError if neither marker file exists under ``path``.
    """
    path = pathlib.Path(path)
    if (path / "metadata.json").exists():
        return icf.IntermediateColumnarFormat(path).summary_table()
    if (path / ".zmetadata").exists():
        return VcfZarr(path).summary_table()
    raise ValueError("Format not recognised")  # NEEDS TEST
31
+
32
+
33
# Compressor used for every array unless a schema overrides it; its config
# dict is also the starting point for each ZarrColumnSpec's compressor.
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(
    cname="zstd",
    clevel=7,
)
34
+
35
+
36
@dataclasses.dataclass
class ZarrColumnSpec:
    """
    Description of one Zarr array to be created by the encoder: its name,
    dtype, shape, chunking, named dimensions and compression settings.
    """

    name: str
    dtype: str
    shape: tuple
    chunks: tuple
    dimensions: tuple
    description: str
    # Full name of the source VCF field, or None for arrays that are not
    # copied one-to-one from a single field.
    vcf_field: str
    # Codec configuration dict (as accepted by numcodecs.get_codec).
    compressor: dict
    filters: list

    def __post_init__(self):
        # Store the sequence-valued attributes as tuples so that specs
        # compare consistently whether they were built from lists or tuples.
        for attr in ("shape", "chunks", "dimensions", "filters"):
            setattr(self, attr, tuple(getattr(self, attr)))

    @staticmethod
    def new(**kwargs):
        """
        Build a spec from ``kwargs`` using the default compressor and no
        filters, then apply the per-array compressor heuristics.
        """
        default_config = DEFAULT_ZARR_COMPRESSOR.get_config()
        spec = ZarrColumnSpec(**kwargs, compressor=default_config, filters=[])
        spec._choose_compressor_settings()
        return spec

    @staticmethod
    def from_field(
        vcf_field,
        *,
        num_variants,
        num_samples,
        variants_chunk_size,
        samples_chunk_size,
        variable_name=None,
    ):
        """
        Derive the array spec for a VCF field.

        FORMAT fields become ``call_*`` arrays with a samples dimension;
        everything else becomes a ``variant_*`` array. A trailing dimension
        is added when the field's observed ``max_number`` exceeds one.
        ``variable_name`` overrides the derived array name.
        """
        if vcf_field.category == "FORMAT":
            prefix = "call_"
            shape = [num_variants, num_samples]
            chunks = [variants_chunk_size, samples_chunk_size]
            dimensions = ["variants", "samples"]
        else:
            prefix = "variant_"
            shape = [num_variants]
            chunks = [variants_chunk_size]
            dimensions = ["variants"]

        if variable_name is None:
            variable_name = prefix + vcf_field.name

        # TODO make an option to add in the empty extra dimension
        if vcf_field.summary.max_number > 1:
            shape.append(vcf_field.summary.max_number)
            # TODO we should really be checking this to see if the named
            # dimensions are actually correct.
            well_known = {"R": "alleles", "A": "alt_alleles", "G": "genotypes"}
            fallback = f"{vcf_field.category}_{vcf_field.name}_dim"
            dimensions.append(well_known.get(vcf_field.vcf_number, fallback))

        return ZarrColumnSpec.new(
            vcf_field=vcf_field.full_name,
            name=variable_name,
            dtype=vcf_field.smallest_dtype(),
            shape=shape,
            chunks=chunks,
            dimensions=dimensions,
            description=vcf_field.description,
        )

    def _choose_compressor_settings(self):
        """
        Choose compressor and filter settings based on the size and
        type of the array, plus some heuristics from observed properties
        of VCFs.

        See https://github.com/pystatgen/bio2zarr/discussions/74
        """
        # Shuffling is off unless deliberately enabled: autoshuffle isn't
        # recognised by many Zarr implementations and shuffling can make
        # performance worse in some cases. BITSHUFFLE is enabled for i1
        # call_genotype (significantly better compression, at a cost of
        # slower decoding) and for boolean arrays.
        is_small_genotype = self.name == "call_genotype" and self.dtype == "i1"
        if is_small_genotype or self.dtype == "bool":
            self.compressor["shuffle"] = numcodecs.Blosc.BITSHUFFLE
        else:
            self.compressor["shuffle"] = numcodecs.Blosc.NOSHUFFLE

    @property
    def variant_chunk_nbytes(self):
        """
        Returns the nbytes for a single variant chunk of this array.
        """
        # A variant chunk spans chunks[0] rows and the full extent of every
        # other dimension.
        items_per_chunk = self.chunks[0]
        for dim_size in self.shape[1:]:
            items_per_chunk *= dim_size
        dt = np.dtype(self.dtype)
        if dt.kind == "O" and "samples" in self.dimensions:
            logger.warning(
                f"Field {self.name} is a string; max memory usage may "
                "be a significant underestimate"
            )
        return items_per_chunk * dt.itemsize
145
+
146
+
147
# Version tag written into generated schemas; VcfZarrSchema.fromdict rejects
# any schema whose "format_version" differs from this value.
ZARR_SCHEMA_FORMAT_VERSION = "0.4"
148
+
149
+
150
@dataclasses.dataclass
class VcfZarrSchema(core.JsonDataclass):
    """
    Complete description of a VCF Zarr encoding: chunk sizes, the sample,
    contig and filter lists, and one ZarrColumnSpec per array.
    """

    format_version: str
    samples_chunk_size: int
    variants_chunk_size: int
    samples: list
    contigs: list
    filters: list
    fields: list

    def field_map(self):
        """
        Return a dict mapping array name to its ZarrColumnSpec.
        """
        return {spec.name: spec for spec in self.fields}

    @staticmethod
    def fromdict(d):
        """
        Rebuild a schema from its dict form.

        Raises ValueError if the dict's format_version does not match
        ZARR_SCHEMA_FORMAT_VERSION.
        """
        found_version = d["format_version"]
        if found_version != ZARR_SCHEMA_FORMAT_VERSION:
            raise ValueError(
                "Zarr schema format version mismatch: "
                f"{found_version} != {ZARR_SCHEMA_FORMAT_VERSION}"
            )
        schema = VcfZarrSchema(**d)
        # The nested entries arrive as plain dicts; convert them to the
        # corresponding record types.
        schema.samples = [icf.Sample(**item) for item in d["samples"]]
        schema.contigs = [icf.Contig(**item) for item in d["contigs"]]
        schema.filters = [icf.Filter(**item) for item in d["filters"]]
        schema.fields = [ZarrColumnSpec(**item) for item in d["fields"]]
        return schema

    @staticmethod
    def fromjson(s):
        """
        Rebuild a schema from the JSON text ``s``.
        """
        return VcfZarrSchema.fromdict(json.loads(s))

    @staticmethod
    def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
        """
        Generate the default schema for the intermediate columnar format
        store ``icf``.

        Chunk sizes default to 10_000 variants and 1000 samples when not
        given. Note that the ``icf`` parameter shadows the module of the
        same name inside this function.
        """
        num_variants = icf.num_records
        num_samples = icf.num_samples
        # FIXME
        if samples_chunk_size is None:
            samples_chunk_size = 1000
        if variants_chunk_size is None:
            variants_chunk_size = 10_000
        logger.info(
            f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
        )

        def derived_spec(field, variable_name=None):
            # Spec computed from the field's own metadata.
            return ZarrColumnSpec.from_field(
                field,
                num_samples=num_samples,
                num_variants=num_variants,
                samples_chunk_size=samples_chunk_size,
                variants_chunk_size=variants_chunk_size,
                variable_name=variable_name,
            )

        def variant_spec(
            name, dtype, shape=(num_variants,), dimensions=("variants",)
        ):
            # Spec for a per-variant array that has no single source field.
            return ZarrColumnSpec.new(
                vcf_field=None,
                name=name,
                dtype=dtype,
                shape=shape,
                description="",
                dimensions=dimensions,
                chunks=[variants_chunk_size],
            )

        # One slot for REF plus the largest number of ALT alleles seen.
        max_alleles = icf.fields["ALT"].vcf_field.summary.max_number + 1

        colspecs = [
            variant_spec(
                name="variant_contig",
                dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
            ),
            variant_spec(
                name="variant_filter",
                dtype="bool",
                shape=(num_variants, icf.metadata.num_filters),
                dimensions=["variants", "filters"],
            ),
            variant_spec(
                name="variant_allele",
                dtype="str",
                shape=(num_variants, max_alleles),
                dimensions=["variants", "alleles"],
            ),
            variant_spec(name="variant_id", dtype="str"),
            variant_spec(name="variant_id_mask", dtype="bool"),
        ]
        by_full_name = {field.full_name: field for field in icf.metadata.fields}

        # Only two of the fixed fields have a direct one-to-one mapping.
        colspecs.append(
            derived_spec(by_full_name["QUAL"], variable_name="variant_quality")
        )
        colspecs.append(
            derived_spec(by_full_name["POS"], variable_name="variant_position")
        )
        for info_field in icf.metadata.info_fields:
            colspecs.append(derived_spec(info_field))

        # GT is set aside and expanded into three arrays below; every other
        # FORMAT field maps directly to one array.
        gt_field = None
        for format_field in icf.metadata.format_fields:
            if format_field.name != "GT":
                colspecs.append(derived_spec(format_field))
            else:
                gt_field = format_field

        if gt_field is not None:
            ploidy = gt_field.summary.max_number - 1
            call_shape = [num_variants, num_samples]
            call_dims = ["variants", "samples"]
            call_chunks = [variants_chunk_size, samples_chunk_size]

            def genotype_spec(name, dtype, shape, dimensions):
                return ZarrColumnSpec.new(
                    vcf_field=None,
                    name=name,
                    dtype=dtype,
                    shape=list(shape),
                    chunks=list(call_chunks),
                    dimensions=list(dimensions),
                    description="",
                )

            colspecs.append(
                genotype_spec("call_genotype_phased", "bool", call_shape, call_dims)
            )
            gt_shape = call_shape + [ploidy]
            gt_dims = call_dims + ["ploidy"]
            colspecs.append(
                genotype_spec(
                    "call_genotype", gt_field.smallest_dtype(), gt_shape, gt_dims
                )
            )
            colspecs.append(
                genotype_spec("call_genotype_mask", "bool", gt_shape, gt_dims)
            )

        return VcfZarrSchema(
            format_version=ZARR_SCHEMA_FORMAT_VERSION,
            samples_chunk_size=samples_chunk_size,
            variants_chunk_size=variants_chunk_size,
            fields=colspecs,
            samples=icf.metadata.samples,
            contigs=icf.metadata.contigs,
            filters=icf.metadata.filters,
        )
314
+
315
+
316
class VcfZarr:
    """
    Read-only view of a VCF Zarr directory, used for inspection.
    """

    def __init__(self, path):
        """
        Open the store at ``path`` (a pathlib.Path) in read mode.

        Raises ValueError if ``path`` contains no ``.zmetadata`` file.
        """
        marker = path / ".zmetadata"
        if not marker.exists():
            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
        self.path = path
        self.root = zarr.open(path, mode="r")

    def summary_table(self):
        """
        Return one dict of display strings per array in the root group,
        ordered from largest to smallest on-disk size.
        """
        sized = [
            (core.du(self.path / arr.basename), arr) for _, arr in self.root.arrays()
        ]
        sized.sort(key=lambda pair: pair[0])

        def describe(stored, array):
            return {
                "name": array.name,
                "dtype": str(array.dtype),
                "stored": core.display_size(stored),
                "size": core.display_size(array.nbytes),
                "ratio": core.display_number(array.nbytes / stored),
                "nchunks": str(array.nchunks),
                "chunk_size": core.display_size(array.nbytes / array.nchunks),
                "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
                "shape": str(array.shape),
                "chunk_shape": str(array.chunks),
                "compressor": str(array.compressor),
                "filters": str(array.filters),
            }

        return [describe(stored, array) for stored, array in reversed(sized)]
344
+
345
+
346
def parse_max_memory(max_memory):
    """
    Normalise a memory budget to a number of bytes.

    ``None`` means effectively unbounded (2**63). Strings are parsed with
    ``humanfriendly.parse_size``; any other value is returned unchanged.
    """
    if max_memory is None:
        # Effectively unbounded
        return 2**63
    budget = (
        humanfriendly.parse_size(max_memory)
        if isinstance(max_memory, str)
        else max_memory
    )
    logger.info(f"Set memory budget to {core.display_size(budget)}")
    return budget
354
+
355
+
356
@dataclasses.dataclass
class VcfZarrPartition:
    """
    Half-open range ``[start, stop)`` of variant indexes handled as one
    unit of encoding work.
    """

    start: int
    stop: int

    @staticmethod
    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
        """
        Split ``num_records`` variants into at most ``num_partitions``
        contiguous partitions whose boundaries fall on multiples of
        ``chunk_size``.

        If ``max_chunks`` is given, only the first ``max_chunks`` variant
        chunks are covered.
        """
        num_chunks = int(np.ceil(num_records / chunk_size))
        if max_chunks is not None:
            num_chunks = min(num_chunks, max_chunks)
        # Distribute whole chunks across partitions as evenly as possible.
        chunk_groups = np.array_split(
            np.arange(num_chunks), min(num_partitions, num_chunks)
        )
        return [
            VcfZarrPartition(
                int(group[0]) * chunk_size,
                # The last chunk may be partial, so clamp to num_records.
                min((int(group[-1]) + 1) * chunk_size, num_records),
            )
            for group in chunk_groups
        ]
375
+
376
+
377
# Version tag stored in the writer's work-in-progress metadata;
# VcfZarrWriterMetadata.fromdict rejects any other value.
VZW_METADATA_FORMAT_VERSION = "0.1"
378
+
379
+
380
@dataclasses.dataclass
class VcfZarrWriterMetadata(core.JsonDataclass):
    """
    Encoding plan written to the work-in-progress directory so that
    partitions can be encoded by separate VcfZarrWriter instances.
    """

    format_version: str
    icf_path: str
    schema: VcfZarrSchema
    dimension_separator: str
    partitions: list
    provenance: dict

    @staticmethod
    def fromdict(d):
        """
        Rebuild the metadata from its dict form, converting the nested
        schema and partition dicts back to their classes.

        Raises ValueError if the dict's format_version does not match
        VZW_METADATA_FORMAT_VERSION.
        """
        found_version = d["format_version"]
        if found_version != VZW_METADATA_FORMAT_VERSION:
            raise ValueError(
                "VcfZarrWriter format version mismatch: "
                f"{found_version} != {VZW_METADATA_FORMAT_VERSION}"
            )
        meta = VcfZarrWriterMetadata(**d)
        meta.schema = VcfZarrSchema.fromdict(meta.schema)
        meta.partitions = [VcfZarrPartition(**entry) for entry in meta.partitions]
        return meta
400
+
401
+
402
@dataclasses.dataclass
class VcfZarrWriteSummary(core.JsonDataclass):
    """
    Summary returned by VcfZarrWriter.init describing the planned encode.
    """

    num_partitions: int
    num_samples: int
    num_variants: int
    # Total number of chunks over all arrays defined in the schema.
    num_chunks: int
    # Human-readable size string, not a byte count.
    max_encoding_memory: str
409
+
410
+
411
class VcfZarrWriter:
    """
    Encodes an intermediate columnar format (ICF) store into a VCF Zarr
    directory at ``path`` in three phases:

    1. ``init`` creates the output directory, writes the sample, filter
       and contig arrays, creates empty array definitions under
       ``<path>/wip/arrays`` and saves the plan to
       ``<path>/wip/metadata.json``.
    2. ``encode_partition`` (or ``encode_all_partitions``) writes the
       chunks of each partition under ``<path>/wip/partitions``.
    3. ``finalise`` moves the chunks into place, removes the ``wip``
       directory and consolidates the Zarr metadata.

    Phases 2 and 3 reload the plan from disk, so they may be run from a
    fresh instance pointing at the same ``path``.
    """

    def __init__(self, path):
        self.path = pathlib.Path(path)
        self.wip_path = self.path / "wip"
        self.arrays_path = self.wip_path / "arrays"
        self.partitions_path = self.wip_path / "partitions"
        # Both are populated by init() or load_metadata().
        self.metadata = None
        self.icf = None

    @property
    def schema(self):
        """
        The VcfZarrSchema from the loaded metadata.
        """
        return self.metadata.schema

    @property
    def num_partitions(self):
        """
        Number of partitions in the loaded metadata.
        """
        return len(self.metadata.partitions)

    def has_genotypes(self):
        """
        Return True if the schema defines a ``call_genotype`` array.
        """
        return any(field.name == "call_genotype" for field in self.schema.fields)

    #######################
    # init
    #######################

    def init(
        self,
        icf,
        *,
        target_num_partitions,
        schema,
        dimension_separator=None,
        max_variant_chunks=None,
    ):
        """
        Create the output directory layout and the encoding plan.

        ``icf`` is the source store (this parameter shadows the ``icf``
        module inside this method). ``target_num_partitions`` is an upper
        bound on the number of partitions. ``max_variant_chunks``, if
        given, truncates the output to that many variant chunks.

        Returns a VcfZarrWriteSummary. Raises ValueError if the output
        path already exists.
        """
        self.icf = icf
        if self.path.exists():
            raise ValueError("Zarr path already exists")  # NEEDS TEST
        partitions = VcfZarrPartition.generate_partitions(
            self.icf.num_records,
            schema.variants_chunk_size,
            target_num_partitions,
            max_chunks=max_variant_chunks,
        )
        # Default to using nested directories following the Zarr v3 default.
        # This seems to require version 2.17+ to work properly
        dimension_separator = (
            "/" if dimension_separator is None else dimension_separator
        )
        self.metadata = VcfZarrWriterMetadata(
            format_version=VZW_METADATA_FORMAT_VERSION,
            icf_path=str(self.icf.path),
            schema=schema,
            dimension_separator=dimension_separator,
            partitions=partitions,
            # Bare minimum here for provenance
            provenance={"source": f"bio2zarr-{provenance.__version__}"},
        )

        self.path.mkdir()
        store = zarr.DirectoryStore(self.path)
        root = zarr.group(store=store)
        root.attrs.update(
            {
                "vcf_zarr_version": "0.2",
                "vcf_header": self.icf.vcf_header,
                "source": f"bio2zarr-{provenance.__version__}",
            }
        )
        # Doing this synchronously - this is fine surely
        self.encode_samples(root)
        self.encode_filter_id(root)
        self.encode_contig_id(root)

        self.wip_path.mkdir()
        self.arrays_path.mkdir()
        self.partitions_path.mkdir()
        # The empty array definitions live in a separate group under wip
        # until finalise moves them into the output directory.
        store = zarr.DirectoryStore(self.arrays_path)
        root = zarr.group(store=store)

        total_chunks = 0
        for field in self.schema.fields:
            # The last partition's stop is the (possibly truncated) number
            # of variants that will actually be encoded.
            a = self.init_array(root, field, partitions[-1].stop)
            total_chunks += a.nchunks

        logger.info("Writing WIP metadata")
        with open(self.wip_path / "metadata.json", "w") as f:
            json.dump(self.metadata.asdict(), f, indent=4)

        return VcfZarrWriteSummary(
            num_variants=self.icf.num_records,
            num_samples=self.icf.num_samples,
            num_partitions=self.num_partitions,
            num_chunks=total_chunks,
            max_encoding_memory=core.display_size(self.get_max_encoding_memory()),
        )

    def encode_samples(self, root):
        """
        Write the ``sample_id`` array into ``root``.

        Raises ValueError if the schema's samples differ from those of the
        source store.
        """
        if self.schema.samples != self.icf.metadata.samples:
            raise ValueError("Subsetting or reordering samples not supported currently")
        array = root.array(
            "sample_id",
            [sample.id for sample in self.schema.samples],
            dtype="str",
            compressor=DEFAULT_ZARR_COMPRESSOR,
            chunks=(self.schema.samples_chunk_size,),
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
        logger.debug("Samples done")

    def encode_contig_id(self, root):
        """
        Write the ``contig_id`` array into ``root``, plus ``contig_length``
        when every contig in the schema has a length.
        """
        array = root.array(
            "contig_id",
            [contig.id for contig in self.schema.contigs],
            dtype="str",
            compressor=DEFAULT_ZARR_COMPRESSOR,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
        if all(contig.length is not None for contig in self.schema.contigs):
            array = root.array(
                "contig_length",
                [contig.length for contig in self.schema.contigs],
                dtype=np.int64,
                compressor=DEFAULT_ZARR_COMPRESSOR,
            )
            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

    def encode_filter_id(self, root):
        """
        Write the ``filter_id`` array into ``root``.
        """
        # TODO need a way to store description also
        # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
        array = root.array(
            "filter_id",
            [filt.id for filt in self.schema.filters],
            dtype="str",
            compressor=DEFAULT_ZARR_COMPRESSOR,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]

    def init_array(self, root, variable, variants_dim_size):
        """
        Create the empty array described by the spec ``variable`` in
        ``root``, with its first dimension set to ``variants_dim_size``,
        and return it.
        """
        object_codec = None
        if variable.dtype == "O":
            object_codec = numcodecs.VLenUTF8()
        shape = list(variable.shape)
        # Truncate the variants dimension if max_variant_chunks was specified
        shape[0] = variants_dim_size
        a = root.empty(
            variable.name,
            shape=shape,
            chunks=variable.chunks,
            dtype=variable.dtype,
            compressor=numcodecs.get_codec(variable.compressor),
            filters=[numcodecs.get_codec(filt) for filt in variable.filters],
            object_codec=object_codec,
            dimension_separator=self.metadata.dimension_separator,
        )
        a.attrs.update(
            {
                "description": variable.description,
                # Dimension names are part of the spec in Zarr v3
                "_ARRAY_DIMENSIONS": variable.dimensions,
            }
        )
        logger.debug(f"Initialised {a}")
        return a

    #######################
    # encode_partition
    #######################

    def load_metadata(self):
        """
        Load the plan written by ``init`` and open the source store, unless
        metadata is already present on this instance.
        """
        if self.metadata is None:
            with open(self.wip_path / "metadata.json") as f:
                self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
            self.icf = icf.IntermediateColumnarFormat(self.metadata.icf_path)

    def partition_path(self, partition_index):
        """
        Directory holding the completed output of a partition.
        """
        return self.partitions_path / f"p{partition_index}"

    def wip_partition_path(self, partition_index):
        """
        Directory a partition is written to while it is being encoded.
        """
        return self.partitions_path / f"wip_p{partition_index}"

    def wip_partition_array_path(self, partition_index, name):
        """
        Path of array ``name`` inside the in-progress partition directory.
        """
        return self.wip_partition_path(partition_index) / name

    def partition_array_path(self, partition_index, name):
        """
        Path of array ``name`` inside the completed partition directory.
        """
        return self.partition_path(partition_index) / name

    def encode_partition(self, partition_index):
        """
        Encode every array for one partition.

        The work is done in the ``wip_p<index>`` directory, which is
        renamed to ``p<index>`` only once all arrays are written; an
        existing ``p<index>`` directory is removed first.

        Raises ValueError if ``partition_index`` is out of range.
        """
        self.load_metadata()
        if partition_index < 0 or partition_index >= self.num_partitions:
            raise ValueError("Partition index not in the valid range")
        partition_path = self.wip_partition_path(partition_index)
        partition_path.mkdir(exist_ok=True)
        logger.info(f"Encoding partition {partition_index} to {partition_path}")

        self.encode_id_partition(partition_index)
        self.encode_filters_partition(partition_index)
        self.encode_contig_partition(partition_index)
        self.encode_alleles_partition(partition_index)
        for col in self.schema.fields:
            # Arrays without a source field are handled by the dedicated
            # encode_*_partition methods.
            if col.vcf_field is not None:
                self.encode_array_partition(col, partition_index)
        if self.has_genotypes():
            self.encode_genotypes_partition(partition_index)

        final_path = self.partition_path(partition_index)
        logger.info(f"Finalising {partition_index} at {final_path}")
        if final_path.exists():
            logger.warning(f"Removing existing partition at {final_path}")
            shutil.rmtree(final_path)
        os.rename(partition_path, final_path)

    def init_partition_array(self, partition_index, name):
        """
        Copy the empty definition of array ``name`` into the in-progress
        partition directory and return the array opened from there.
        """
        # Create an empty array like the definition
        src = self.arrays_path / name
        # Overwrite any existing WIP files
        wip_path = self.wip_partition_array_path(partition_index, name)
        shutil.copytree(src, wip_path, dirs_exist_ok=True)
        store = zarr.DirectoryStore(self.wip_partition_path(partition_index))
        wip_root = zarr.group(store=store)
        array = wip_root[name]
        logger.debug(f"Opened empty array {array.name} <{array.dtype}> @ {wip_path}")
        return array

    def finalise_partition_array(self, partition_index, name):
        """
        Record (at debug level) that array ``name`` is done for a partition.
        """
        logger.debug(f"Encoded {name} partition {partition_index}")

    def encode_array_partition(self, column, partition_index):
        """
        Encode the partition's values of the source field named by
        ``column.vcf_field`` into the array ``column.name``.
        """
        array = self.init_partition_array(partition_index, column.name)

        partition = self.metadata.partitions[partition_index]
        ba = core.BufferedArray(array, partition.start)
        source_col = self.icf.fields[column.vcf_field]
        sanitiser = source_col.sanitiser_factory(ba.buff.shape)

        for value in source_col.iter_values(partition.start, partition.stop):
            # We write directly into the buffer in the sanitiser function
            # to make it easier to reason about dimension padding
            j = ba.next_buffer_row()
            sanitiser(ba.buff, j, value)
        ba.flush()
        self.finalise_partition_array(partition_index, column.name)

    def encode_genotypes_partition(self, partition_index):
        """
        Encode ``call_genotype``, ``call_genotype_mask`` and
        ``call_genotype_phased`` for a partition from ``FORMAT/GT``.

        For each stored value the last column is written to the phased
        array and the remaining columns to the genotype array; the mask is
        True wherever the genotype buffer is negative.
        """
        gt_array = self.init_partition_array(partition_index, "call_genotype")
        gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
        gt_phased_array = self.init_partition_array(
            partition_index, "call_genotype_phased"
        )

        partition = self.metadata.partitions[partition_index]
        gt = core.BufferedArray(gt_array, partition.start)
        gt_mask = core.BufferedArray(gt_mask_array, partition.start)
        gt_phased = core.BufferedArray(gt_phased_array, partition.start)

        source_col = self.icf.fields["FORMAT/GT"]
        for value in source_col.iter_values(partition.start, partition.stop):
            j = gt.next_buffer_row()
            icf.sanitise_value_int_2d(gt.buff, j, value[:, :-1])
            j = gt_phased.next_buffer_row()
            icf.sanitise_value_int_1d(gt_phased.buff, j, value[:, -1])
            # TODO check is this the correct semantics when we are padding
            # with mixed ploidies?
            j = gt_mask.next_buffer_row()
            gt_mask.buff[j] = gt.buff[j] < 0
        gt.flush()
        gt_phased.flush()
        gt_mask.flush()

        self.finalise_partition_array(partition_index, "call_genotype")
        self.finalise_partition_array(partition_index, "call_genotype_mask")
        self.finalise_partition_array(partition_index, "call_genotype_phased")

    def encode_alleles_partition(self, partition_index):
        """
        Encode ``variant_allele`` for a partition: REF in column 0 followed
        by the ALT alleles, with unused columns set to the string fill value.
        """
        array_name = "variant_allele"
        alleles_array = self.init_partition_array(partition_index, array_name)
        partition = self.metadata.partitions[partition_index]
        alleles = core.BufferedArray(alleles_array, partition.start)
        ref_col = self.icf.fields["REF"]
        alt_col = self.icf.fields["ALT"]

        for ref, alt in zip(
            ref_col.iter_values(partition.start, partition.stop),
            alt_col.iter_values(partition.start, partition.stop),
        ):
            j = alleles.next_buffer_row()
            alleles.buff[j, :] = constants.STR_FILL
            alleles.buff[j, 0] = ref[0]
            alleles.buff[j, 1 : 1 + len(alt)] = alt
        alleles.flush()

        self.finalise_partition_array(partition_index, array_name)

    def encode_id_partition(self, partition_index):
        """
        Encode ``variant_id`` and ``variant_id_mask`` for a partition; a
        missing ID is stored as the missing-string value with mask True.
        """
        vid_array = self.init_partition_array(partition_index, "variant_id")
        vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
        partition = self.metadata.partitions[partition_index]
        vid = core.BufferedArray(vid_array, partition.start)
        vid_mask = core.BufferedArray(vid_mask_array, partition.start)
        col = self.icf.fields["ID"]

        for value in col.iter_values(partition.start, partition.stop):
            j = vid.next_buffer_row()
            k = vid_mask.next_buffer_row()
            # Internal invariant: both buffers advance in lockstep.
            assert j == k
            if value is not None:
                vid.buff[j] = value[0]
                vid_mask.buff[j] = False
            else:
                vid.buff[j] = constants.STR_MISSING
                vid_mask.buff[j] = True
        vid.flush()
        vid_mask.flush()

        self.finalise_partition_array(partition_index, "variant_id")
        self.finalise_partition_array(partition_index, "variant_id_mask")

    def encode_filters_partition(self, partition_index):
        """
        Encode ``variant_filter`` for a partition as one boolean column per
        filter in the schema.

        Raises ValueError if a variant uses a filter that is not in the
        schema.
        """
        lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
        array_name = "variant_filter"
        array = self.init_partition_array(partition_index, array_name)
        partition = self.metadata.partitions[partition_index]
        var_filter = core.BufferedArray(array, partition.start)

        col = self.icf.fields["FILTERS"]
        for value in col.iter_values(partition.start, partition.stop):
            j = var_filter.next_buffer_row()
            var_filter.buff[j] = False
            for f in value:
                try:
                    var_filter.buff[j, lookup[f]] = True
                except KeyError:
                    raise ValueError(
                        f"Filter '{f}' was not defined in the header."
                    ) from None
        var_filter.flush()

        self.finalise_partition_array(partition_index, array_name)

    def encode_contig_partition(self, partition_index):
        """
        Encode ``variant_contig`` for a partition as the index of each
        variant's CHROM value within the schema's contig list.
        """
        lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
        array_name = "variant_contig"
        array = self.init_partition_array(partition_index, array_name)
        partition = self.metadata.partitions[partition_index]
        contig = core.BufferedArray(array, partition.start)
        col = self.icf.fields["CHROM"]

        for value in col.iter_values(partition.start, partition.stop):
            j = contig.next_buffer_row()
            # Note: because we are using the indexes to define the lookups
            # and we always have an index, it seems that the contig lookup
            # will always succeed. However, if anyone ever does hit a KeyError
            # here, please do open an issue with a reproducible example!
            contig.buff[j] = lookup[value[0]]
        contig.flush()

        self.finalise_partition_array(partition_index, array_name)

    #######################
    # finalise
    #######################

    def finalise_array(self, name):
        """
        Gather the chunks of array ``name`` from every partition into its
        definition directory, then move that directory into the output.

        Raises ValueError if the array already exists in the output or if
        any partition has no directory for it.
        """
        logger.info(f"Finalising {name}")
        final_path = self.path / name
        if final_path.exists():
            # NEEDS TEST
            raise ValueError(f"Array {name} already exists")
        dest = self.arrays_path / name
        for partition in range(self.num_partitions):
            # Move all the files in partition dir to dest dir
            src = self.partition_array_path(partition, name)
            if not src.exists():
                # Needs test
                raise ValueError(f"Partition {partition} of {name} does not exist")
            # This is Zarr v2 specific. Chunks in v3 will start with "c" prefix.
            # Entries starting with "." (e.g. the array metadata) are skipped.
            chunk_files = [
                path for path in src.iterdir() if not path.name.startswith(".")
            ]
            # TODO check for a count of the number of files. If we require a
            # dimension_separator of "/" then we could make stronger assertions
            # here, as we'd always have num_variant_chunks
            logger.debug(
                f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
            )
            for chunk_file in chunk_files:
                os.rename(chunk_file, dest / chunk_file.name)
        # Finally, once all the chunks have moved into the arrays dir,
        # we move it out of wip
        os.rename(dest, final_path)
        core.update_progress(1)

    def finalise(self, show_progress=False):
        """
        Assemble the final Zarr store from the encoded partitions, remove
        the work-in-progress directory and consolidate the Zarr metadata.

        Raises FileNotFoundError listing any partitions that have not been
        encoded.
        """
        self.load_metadata()

        logger.info(f"Scanning {self.num_partitions} partitions")
        # TODO may need a progress bar here
        missing = [
            partition_id
            for partition_id in range(self.num_partitions)
            if not self.partition_path(partition_id).exists()
        ]
        if len(missing) > 0:
            raise FileNotFoundError(f"Partitions not encoded: {missing}")

        progress_config = core.ProgressConfig(
            total=len(self.schema.fields),
            title="Finalise",
            units="array",
            show=show_progress,
        )
        # NOTE: it's not clear that adding more workers will make this quicker,
        # as it's just going to be causing contention on the file system.
        # Something to check empirically in some deployments.
        # FIXME we're just using worker_processes=0 here to hook into the
        # SynchronousExecutor which is intended for testing purposes so
        # that we get test coverage. Should fix this either by allowing
        # for multiple workers, or making a standard wrapper for tqdm
        # that allows us to have a consistent look and feel.
        with core.ParallelWorkManager(0, progress_config) as pwm:
            for field in self.schema.fields:
                pwm.submit(self.finalise_array, field.name)
        logger.debug(f"Removing {self.wip_path}")
        shutil.rmtree(self.wip_path)
        logger.info("Consolidating Zarr metadata")
        zarr.consolidate_metadata(self.path)

    ######################
    # encode_all_partitions
    ######################

    def get_max_encoding_memory(self):
        """
        Return the approximate maximum memory used to encode a variant chunk.

        This is the larger of the biggest single array's variant chunk and
        the combined variant chunks of the ``call_genotype*`` arrays.
        """
        max_encoding_mem = 0
        for col in self.schema.fields:
            max_encoding_mem = max(max_encoding_mem, col.variant_chunk_nbytes)
        gt_mem = 0
        if self.has_genotypes():
            gt_mem = sum(
                field.variant_chunk_nbytes
                for field in self.schema.fields
                if field.name.startswith("call_genotype")
            )
        return max(max_encoding_mem, gt_mem)

    def encode_all_partitions(
        self, *, worker_processes=1, show_progress=False, max_memory=None
    ):
        """
        Encode every partition, using up to ``worker_processes`` workers.

        ``max_memory`` (bytes, a size string, or None for unbounded) caps
        the number of workers so that their combined estimated memory stays
        within the budget.

        Raises ValueError if the budget is too small for a single worker.
        """
        max_memory = parse_max_memory(max_memory)
        self.load_metadata()
        num_partitions = self.num_partitions
        per_worker_memory = self.get_max_encoding_memory()
        logger.info(
            f"Encoding Zarr over {num_partitions} partitions with "
            f"{worker_processes} workers and {core.display_size(per_worker_memory)} "
            "per worker"
        )
        # Each partition requires per_worker_memory bytes, so to prevent more than
        # max_memory being used, we clamp the number of workers
        max_num_workers = max_memory // per_worker_memory
        if max_num_workers < worker_processes:
            logger.warning(
                f"Limiting number of workers to {max_num_workers} to "
                "keep within specified memory budget of "
                f"{core.display_size(max_memory)}"
            )
        if max_num_workers <= 0:
            raise ValueError(
                "Insufficient memory to encode a partition: "
                f"{core.display_size(per_worker_memory)} > "
                f"{core.display_size(max_memory)}"
            )
        num_workers = min(max_num_workers, worker_processes)

        total_bytes = 0
        for col in self.schema.fields:
            # Open the array definition to get the total size
            total_bytes += zarr.open(self.arrays_path / col.name).nbytes

        progress_config = core.ProgressConfig(
            total=total_bytes,
            title="Encode",
            units="B",
            show=show_progress,
        )
        with core.ParallelWorkManager(num_workers, progress_config) as pwm:
            for partition_index in range(num_partitions):
                pwm.submit(self.encode_partition, partition_index)
901
+
902
+
903
def mkschema(if_path, out):
    """
    Write the default schema for the ICF store at ``if_path``, as JSON, to
    the writable text stream ``out``.
    """
    default_schema = VcfZarrSchema.generate(icf.IntermediateColumnarFormat(if_path))
    out.write(default_schema.asjson())
907
+
908
+
909
def encode(
    if_path,
    zarr_path,
    schema_path=None,
    variants_chunk_size=None,
    samples_chunk_size=None,
    max_variant_chunks=None,
    dimension_separator=None,
    max_memory=None,
    worker_processes=1,
    show_progress=False,
):
    """
    Encode the ICF store at ``if_path`` to VCF Zarr at ``zarr_path`` in one
    call: initialise, encode all partitions, then finalise.
    """
    encode_init(
        if_path,
        zarr_path,
        # Rough heuristic to split work up enough to keep utilisation high
        max(1, worker_processes * 4),
        schema_path=schema_path,
        variants_chunk_size=variants_chunk_size,
        samples_chunk_size=samples_chunk_size,
        max_variant_chunks=max_variant_chunks,
        dimension_separator=dimension_separator,
    )
    # A fresh writer reloads the plan that encode_init saved to disk.
    writer = VcfZarrWriter(zarr_path)
    writer.encode_all_partitions(
        worker_processes=worker_processes,
        show_progress=show_progress,
        max_memory=max_memory,
    )
    writer.finalise(show_progress)
940
+
941
+
942
def encode_init(
    icf_path,
    zarr_path,
    target_num_partitions,
    *,
    schema_path=None,
    variants_chunk_size=None,
    samples_chunk_size=None,
    max_variant_chunks=None,
    dimension_separator=None,
    max_memory=None,
    worker_processes=1,
    show_progress=False,
):
    """
    Initialise a partitioned encode of the ICF store at ``icf_path`` into
    ``zarr_path`` and return the resulting VcfZarrWriteSummary.

    The schema is read from ``schema_path`` when given, otherwise it is
    generated from the store using the supplied chunk sizes. ``max_memory``,
    ``worker_processes`` and ``show_progress`` are accepted but not used
    by this function.

    Raises ValueError if a schema file is combined with explicit chunk
    sizes.
    """
    icf_store = icf.IntermediateColumnarFormat(icf_path)
    if schema_path is not None:
        logger.info(f"Reading schema from {schema_path}")
        chunk_sizes_given = not (
            variants_chunk_size is None and samples_chunk_size is None
        )
        if chunk_sizes_given:
            raise ValueError(
                "Cannot specify schema along with chunk sizes"
            )  # NEEDS TEST
        with open(schema_path) as f:
            schema = VcfZarrSchema.fromjson(f.read())
    else:
        schema = VcfZarrSchema.generate(
            icf_store,
            variants_chunk_size=variants_chunk_size,
            samples_chunk_size=samples_chunk_size,
        )
    writer = VcfZarrWriter(pathlib.Path(zarr_path))
    return writer.init(
        icf_store,
        target_num_partitions=target_num_partitions,
        schema=schema,
        dimension_separator=dimension_separator,
        max_variant_chunks=max_variant_chunks,
    )
980
+
981
+
982
def encode_partition(zarr_path, partition):
    """
    Encode the partition with index ``partition`` of the initialised
    encode at ``zarr_path``.
    """
    VcfZarrWriter(zarr_path).encode_partition(partition)
985
+
986
+
987
def encode_finalise(zarr_path, show_progress=False):
    """
    Finalise the encode at ``zarr_path`` once its partitions are encoded.
    """
    VcfZarrWriter(zarr_path).finalise(show_progress=show_progress)
990
+
991
+
992
def convert(
    vcfs,
    out_path,
    *,
    variants_chunk_size=None,
    samples_chunk_size=None,
    worker_processes=1,
    show_progress=False,
    # TODO add arguments to control location of tmpdir
):
    """
    Convert ``vcfs`` to VCF Zarr at ``out_path`` in a single step.

    The intermediate columnar format is written to a temporary directory
    that is removed when the conversion finishes.
    """
    # Options shared by the explode and encode steps.
    run_options = {
        "worker_processes": worker_processes,
        "show_progress": show_progress,
    }
    with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
        if_dir = pathlib.Path(tmp) / "icf"
        icf.explode(if_dir, vcfs, **run_options)
        encode(
            if_dir,
            out_path,
            variants_chunk_size=variants_chunk_size,
            samples_chunk_size=samples_chunk_size,
            **run_options,
        )