bio2zarr 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.

@@ -6,14 +6,17 @@ import logging
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
+import tempfile
+from functools import partial
 from typing import Any
 
 import numcodecs
 import numpy as np
 
-from .. import constants, core, provenance, vcf_utils
+from . import constants, core, provenance, vcf_utils, vcz
 
 logger = logging.getLogger(__name__)
 
@@ -77,6 +80,14 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"
 
+    @property
+    def max_number(self):
+        if self.vcf_number in ("R", "A", "G", "."):
+            return self.summary.max_number
+        else:
+            # use declared number if larger than max found
+            return max(self.summary.max_number, int(self.vcf_number))
+
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
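
The new max_number property reconciles the Number declared in the VCF header with the largest count actually observed while scanning: variable-length fields ("R", "A", "G", ".") use the observed maximum, while fixed-Number fields take the declared value when it is larger, so a declared-but-sparsely-filled field still gets a wide enough array dimension. A minimal stand-alone sketch of that logic (FieldStub is hypothetical and only mirrors the two attributes the property reads):

    class FieldStub:
        def __init__(self, vcf_number, observed_max):
            self.vcf_number = vcf_number
            # corresponds to summary.max_number in the real VcfField
            self.observed_max = observed_max

        @property
        def max_number(self):
            if self.vcf_number in ("R", "A", "G", "."):
                return self.observed_max
            return max(self.observed_max, int(self.vcf_number))

    assert FieldStub("2", 1).max_number == 2  # declared Number=2 wins
    assert FieldStub("A", 3).max_number == 3  # variable length: observed max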
@@ -116,23 +127,6 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
 )
 
 
-@dataclasses.dataclass
-class Contig:
-    id: str
-    length: int = None
-
-
-@dataclasses.dataclass
-class Sample:
-    id: str
-
-
-@dataclasses.dataclass
-class Filter:
-    id: str
-    description: str = ""
-
-
 @dataclasses.dataclass
 class IcfMetadata(core.JsonDataclass):
     samples: list
@@ -187,9 +181,9 @@ class IcfMetadata(core.JsonDataclass):
         d = d.copy()
         d["partitions"] = partitions
         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
-        d["samples"] = [Sample(**sd) for sd in d["samples"]]
-        d["filters"] = [Filter(**fd) for fd in d["filters"]]
-        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
+        d["samples"] = [vcz.Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [vcz.Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [vcz.Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
     def __eq__(self, other):
@@ -240,7 +234,7 @@ def scan_vcf(path, target_num_partitions):
             description = ""
             if h["ID"] == "PASS":
                 pass_index = len(filters)
-            filters.append(Filter(h["ID"], description))
+            filters.append(vcz.Filter(h["ID"], description))
 
     # Ensure PASS is the first filter if present
     if pass_index > 0:
@@ -262,9 +256,9 @@ def scan_vcf(path, target_num_partitions):
         contig_lengths = [None for _ in vcf.seqnames]
 
     metadata = IcfMetadata(
-        samples=[Sample(sample_id) for sample_id in vcf.samples],
+        samples=[vcz.Sample(sample_id) for sample_id in vcf.samples],
         contigs=[
-            Contig(contig_id, length)
+            vcz.Contig(contig_id, length)
             for contig_id, length in zip(vcf.seqnames, contig_lengths)
         ],
         filters=filters,
@@ -291,7 +285,12 @@ def scan_vcf(path, target_num_partitions):
     return metadata, vcf.raw_header
 
 
-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
@@ -366,64 +365,58 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     return icf_metadata, header
 
 
-def sanitise_value_bool(buff, j, value):
+def sanitise_value_bool(shape, value):
     x = True
     if value is None:
         x = False
-    buff[j] = x
+    return x
 
 
-def sanitise_value_float_scalar(buff, j, value):
+def sanitise_value_float_scalar(shape, value):
     x = value
     if value is None:
         x = [constants.FLOAT32_MISSING]
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_int_scalar(buff, j, value):
+def sanitise_value_int_scalar(shape, value):
     x = value
     if value is None:
-        # print("MISSING", INT_MISSING, INT_FILL)
         x = [constants.INT_MISSING]
     else:
         x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_string_scalar(buff, j, value):
+def sanitise_value_string_scalar(shape, value):
     if value is None:
-        buff[j] = "."
+        return "."
     else:
-        buff[j] = value[0]
+        return value[0]
 
 
-def sanitise_value_string_1d(buff, j, value):
+def sanitise_value_string_1d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype="O")
     else:
-        # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
-        # FIXME failure isn't coming from here, it seems to be from an
-        # incorrectly detected dimension in the zarr array
-        # The dimesions look all wrong, and the dtype should be Object
-        # not str
         value = drop_empty_second_dim(value)
-        buff[j] = ""
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, "", dtype=value.dtype)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_string_2d(buff, j, value):
+def sanitise_value_string_2d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype="O")
     else:
-        # print(buff.shape, value.dtype, value)
-        # assert value.ndim == 2
-        buff[j] = ""
+        result = np.full(shape, "", dtype="O")
         if value.ndim == 2:
-            buff[j, :, : value.shape[1]] = value
+            result[: value.shape[0], : value.shape[1]] = value
         else:
-            # TODO check if this is still necessary
+            # Convert 1D array into 2D with appropriate shape
             for k, val in enumerate(value):
-                buff[j, k, : len(val)] = val
+                result[k, : len(val)] = val
+        return result
 
 
 def drop_empty_second_dim(value):
@@ -433,27 +426,28 @@ def drop_empty_second_dim(value):
     return value
 
 
-def sanitise_value_float_1d(buff, j, value):
+def sanitise_value_float_1d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
+        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
        value[np.isnan(value)] = constants.FLOAT32_MISSING
        value = drop_empty_second_dim(value)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_float_2d(buff, j, value):
+def sanitise_value_float_2d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        # print("value = ", value)
-        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, :, : value.shape[1]] = value
+        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 def sanitise_int_array(value, ndmin, dtype):
@@ -468,23 +462,25 @@ def sanitise_int_array(value, ndmin, dtype):
     return value.astype(dtype)
 
 
-def sanitise_value_int_1d(buff, j, value):
+def sanitise_value_int_1d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 1, buff.dtype)
+        value = sanitise_int_array(value, 1, np.int32)
         value = drop_empty_second_dim(value)
-        buff[j] = -2
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, -2, dtype=np.int32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_int_2d(buff, j, value):
+def sanitise_value_int_2d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 2, buff.dtype)
-        buff[j] = -2
-        buff[j, :, : value.shape[1]] = value
+        value = sanitise_int_array(value, 2, np.int32)
+        result = np.full(shape, -2, dtype=np.int32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 missing_value_map = {
@@ -648,7 +644,8 @@ class IntermediateColumnarFormatField:
         chunk_cumulative_records = self.chunk_record_index(partition_id)
         chunk_num_records = np.diff(chunk_cumulative_records)
         for count, cumulative in zip(
-            chunk_num_records[start_chunk:], chunk_cumulative_records[start_chunk + 1 :]
+            chunk_num_records[start_chunk:],
+            chunk_cumulative_records[start_chunk + 1 :],
         ):
             path = partition_path / f"{cumulative}"
             chunk = self.read_chunk(path)
@@ -707,36 +704,32 @@ class IntermediateColumnarFormatField:
         return ret
 
     def sanitiser_factory(self, shape):
-        """
-        Return a function that sanitised values from this column
-        and writes into a buffer of the specified shape.
-        """
-        assert len(shape) <= 3
+        assert len(shape) <= 2
         if self.vcf_field.vcf_type == "Flag":
-            assert len(shape) == 1
-            return sanitise_value_bool
+            assert len(shape) == 0
+            return partial(sanitise_value_bool, shape)
         elif self.vcf_field.vcf_type == "Float":
-            if len(shape) == 1:
-                return sanitise_value_float_scalar
-            elif len(shape) == 2:
-                return sanitise_value_float_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_float_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_float_1d, shape)
             else:
-                return sanitise_value_float_2d
+                return partial(sanitise_value_float_2d, shape)
         elif self.vcf_field.vcf_type == "Integer":
-            if len(shape) == 1:
-                return sanitise_value_int_scalar
-            elif len(shape) == 2:
-                return sanitise_value_int_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_int_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_int_1d, shape)
             else:
-                return sanitise_value_int_2d
+                return partial(sanitise_value_int_2d, shape)
         else:
             assert self.vcf_field.vcf_type in ("String", "Character")
-            if len(shape) == 1:
-                return sanitise_value_string_scalar
-            elif len(shape) == 2:
-                return sanitise_value_string_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_string_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_string_1d, shape)
             else:
-                return sanitise_value_string_2d
+                return partial(sanitise_value_string_2d, shape)
 
 
 @dataclasses.dataclass
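
Note the calling-convention change running through this hunk: the sanitisers used to write into a caller-supplied buffer at row j, and now take the per-variant target shape and return a fresh array, which lets sanitiser_factory bind the shape with functools.partial and hand back a one-argument callable (hence the shape-length asserts dropping from <= 3 to <= 2). A runnable sketch of the new convention, with -1/-2 standing in for the package's missing/fill sentinels:

    from functools import partial

    import numpy as np

    def sanitise_value_int_1d(shape, value):
        if value is None:
            # missing record: every slot gets the "missing" sentinel
            return np.full(shape, -1)
        value = np.array(value, ndmin=1, dtype=np.int32)
        # pad short values with the "fill" sentinel
        result = np.full(shape, -2, dtype=np.int32)
        result[: value.shape[0]] = value
        return result

    sanitiser = partial(sanitise_value_int_1d, (4,))
    print(sanitiser([5, 7]))  # [ 5  7 -2 -2]
    print(sanitiser(None))    # [-1 -1 -1 -1]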
@@ -843,9 +836,66 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-class IntermediateColumnarFormat(collections.abc.Mapping):
+def convert_local_allele_field_types(fields, schema_instance):
+    """
+    Update the specified list of fields to include the LAA field, and to convert
+    any supported localisable fields to the L* counterpart.
+
+    Note that we currently support only two ALT alleles per sample, and so the
+    dimensions of these fields are fixed by that requirement. Later versions may
+    use summary data stored in the ICF to make different choices, if information
+    about subsequent alleles (not in the actual genotype calls) should also be
+    stored.
+    """
+    fields_by_name = {field.name: field for field in fields}
+    gt = fields_by_name["call_genotype"]
+
+    if schema_instance.get_shape(["ploidy"])[0] != 2:
+        raise ValueError("Local alleles only supported on diploid data")
+
+    dimensions = gt.dimensions[:-1]
+
+    la = vcz.ZarrArraySpec(
+        name="call_LA",
+        dtype="i1",
+        dimensions=(*dimensions, "local_alleles"),
+        description=(
+            "0-based indices into REF+ALT, indicating which alleles"
+            " are relevant (local) for the current sample"
+        ),
+    )
+    schema_instance.dimensions["local_alleles"] = vcz.VcfZarrDimension.unchunked(
+        schema_instance.dimensions["ploidy"].size
+    )
+
+    ad = fields_by_name.get("call_AD", None)
+    if ad is not None:
+        # TODO check if call_LAD is in the list already
+        ad.name = "call_LAD"
+        ad.source = None
+        ad.dimensions = (*dimensions, "local_alleles_AD")
+        ad.description += " (local-alleles)"
+        schema_instance.dimensions["local_alleles_AD"] = vcz.VcfZarrDimension.unchunked(
+            2
+        )
+
+    pl = fields_by_name.get("call_PL", None)
+    if pl is not None:
+        # TODO check if call_LPL is in the list already
+        pl.name = "call_LPL"
+        pl.source = None
+        pl.description += " (local-alleles)"
+        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1].split("_")[-1])
+        schema_instance.dimensions["local_" + pl.dimensions[-1].split("_")[-1]] = (
+            vcz.VcfZarrDimension.unchunked(3)
+        )
+
+    return [*fields, la]
+
+
+class IntermediateColumnarFormat(vcz.Source):
     def __init__(self, path):
-        self.path = pathlib.Path(path)
+        self._path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
         # directory is either a WIP or the wrong format.
         with open(self.path / "metadata.json") as f:
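
The new convert_local_allele_field_types renames call_AD to call_LAD and call_PL to call_LPL with fixed-size local dimensions: on diploid data with at most two local alleles, LAD holds 2 depths and LPL holds 3 likelihoods. The 3 is consistent with the standard VCF genotype count C(a + p - 1, p), as this quick check of the arithmetic shows:

    import math

    # Number of unordered genotypes for `a` alleles at ploidy `p`,
    # the count behind VCF Number=G fields.
    def num_genotypes(a, p):
        return math.comb(a + p - 1, p)

    # Two local alleles, diploid: 0/0, 0/1, 1/1.
    assert num_genotypes(2, 2) == 3  # matches the fixed call_LPL size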
@@ -859,8 +909,12 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
+        self.gt_field = None
         for field in self.metadata.fields:
             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
+            if field.name == "GT":
+                self.gt_field = field
+
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
             f"records={self.num_records}, fields={self.num_fields})"
@@ -868,20 +922,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"IntermediateColumnarFormat(fields={len(self.fields)}, "
             f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )
 
-    def __getitem__(self, key):
-        return self.fields[key]
-
-    def __iter__(self):
-        return iter(self.fields)
-
-    def __len__(self):
-        return len(self.fields)
-
     def summary_table(self):
         data = []
         for name, icf_field in self.fields.items():
@@ -900,6 +945,10 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data
 
+    @property
+    def path(self):
+        return self._path
+
     @property
     def num_records(self):
         return self.metadata.num_records
@@ -908,6 +957,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_partitions(self):
         return len(self.metadata.partitions)
 
+    @property
+    def samples(self):
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
+
     @property
     def num_samples(self):
         return len(self.metadata.samples)
@@ -916,6 +977,261 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_fields(self):
         return len(self.fields)
 
+    @property
+    def root_attrs(self):
+        meta_information_pattern = re.compile("##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            match = re.fullmatch(meta_information_pattern, line)
+            if match:
+                key = match.group(1)
+                if key in ("contig", "FILTER", "INFO", "FORMAT"):
+                    # these fields are stored in Zarr arrays
+                    continue
+                value = match.group(2)
+                vcf_meta_information.append((key, value))
+        return {
+            "vcf_meta_information": vcf_meta_information,
+        }
+
+    def iter_id(self, start, stop):
+        for value in self.fields["ID"].iter_values(start, stop):
+            if value is not None:
+                yield value[0]
+            else:
+                yield None
+
+    def iter_filters(self, start, stop):
+        source_field = self.fields["FILTERS"]
+        lookup = {filt.id: index for index, filt in enumerate(self.metadata.filters)}
+
+        for filter_values in source_field.iter_values(start, stop):
+            filters = np.zeros(len(self.metadata.filters), dtype=bool)
+            if filter_values is not None:
+                for filter_id in filter_values:
+                    try:
+                        filters[lookup[filter_id]] = True
+                    except KeyError:
+                        raise ValueError(
+                            f"Filter '{filter_id}' was not defined in the header."
+                        ) from None
+            yield filters
+
+    def iter_contig(self, start, stop):
+        source_field = self.fields["CHROM"]
+        lookup = {
+            contig.id: index for index, contig in enumerate(self.metadata.contigs)
+        }
+
+        for value in source_field.iter_values(start, stop):
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            yield lookup[value[0]]
+
+    def iter_field(self, field_name, shape, start, stop):
+        source_field = self.fields[field_name]
+        sanitiser = source_field.sanitiser_factory(shape)
+        for value in source_field.iter_values(start, stop):
+            yield sanitiser(value)
+
+    def iter_alleles(self, start, stop, num_alleles):
+        ref_field = self.fields["REF"]
+        alt_field = self.fields["ALT"]
+
+        for ref, alt in zip(
+            ref_field.iter_values(start, stop),
+            alt_field.iter_values(start, stop),
+        ):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles[0] = ref[0]
+            alleles[1 : 1 + len(alt)] = alt
+            yield alleles
+
+    def iter_genotypes(self, shape, start, stop):
+        source_field = self.fields["FORMAT/GT"]
+        for value in source_field.iter_values(start, stop):
+            genotypes = value[:, :-1] if value is not None else None
+            phased = value[:, -1] if value is not None else None
+            sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
+            sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
+            # Force haploids to always be phased
+            # https://github.com/sgkit-dev/bio2zarr/issues/399
+            if sanitised_genotypes.shape[1] == 1:
+                sanitised_phased[:] = True
+            yield sanitised_genotypes, sanitised_phased
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
+        if self.gt_field is None or shape is None:
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                yield vcz.VariantData(variant_length, alleles, None, None)
+        else:
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
+                self.iter_alleles(start, stop, num_alleles),
+                self.iter_genotypes(shape, start, stop),
+            ):
+                yield vcz.VariantData(variant_length, alleles, gt, phased)
+
+    def generate_schema(
+        self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
+    ):
+        if local_alleles is None:
+            local_alleles = False
+
+        max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
+
+        # Add ploidy and genotypes dimensions only when needed
+        max_genotypes = 0
+        for field in self.metadata.format_fields:
+            if field.vcf_number == "G":
+                max_genotypes = max(max_genotypes, field.summary.max_number)
+
+        ploidy = None
+        genotypes_size = None
+        if self.gt_field is not None:
+            ploidy = max(self.gt_field.summary.max_number - 1, 1)
+            # NOTE: it's not clear why we're computing this, when we must have had
+            # at least one number=G field to require it anyway?
+            genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
+            # assert max_genotypes == genotypes_size
+        else:
+            if max_genotypes > 0:
+                # there is no GT field, but there is at least one Number=G field,
+                # so we need to define the genotypes dimension
+                genotypes_size = max_genotypes
+
+        dimensions = vcz.standard_dimensions(
+            variants_size=self.num_records,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=self.num_samples,
+            samples_chunk_size=samples_chunk_size,
+            alleles_size=max_alleles,
+            filters_size=self.metadata.num_filters,
+            ploidy_size=ploidy,
+            genotypes_size=genotypes_size,
+        )
+
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+
+        def spec_from_field(field, array_name=None):
+            return vcz.ZarrArraySpec.from_field(
+                field,
+                schema_instance,
+                array_name=array_name,
+            )
+
+        def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
+                source=source,
+                name=name,
+                dtype=dtype,
+                description="",
+                dimensions=dimensions,
+                compressor=compressor,
+            )
+
+        name_map = {field.full_name: field for field in self.metadata.fields}
+        array_specs = [
+            fixed_field_spec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, self.metadata.num_contigs),
+            ),
+            fixed_field_spec(
+                name="variant_filter",
+                dtype="bool",
+                dimensions=["variants", "filters"],
+            ),
+            fixed_field_spec(
+                name="variant_allele",
+                dtype="O",
+                dimensions=["variants", "alleles"],
+            ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
+            fixed_field_spec(
+                name="variant_id",
+                dtype="O",
+            ),
+            fixed_field_spec(
+                name="variant_id_mask",
+                dtype="bool",
+            ),
+        ]
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        array_specs.extend(
+            [
+                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
+                spec_from_field(name_map["POS"], array_name="variant_position"),
+            ]
+        )
+        array_specs.extend(
+            [spec_from_field(field) for field in self.metadata.info_fields]
+        )
+
+        for field in self.metadata.format_fields:
+            if field.name == "GT":
+                continue
+            array_specs.append(spec_from_field(field))
+
+        if self.gt_field is not None and self.num_samples > 0:
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_phased",
+                    dtype="bool",
+                    dimensions=["variants", "samples"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype",
+                    dtype=self.gt_field.smallest_dtype(),
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_mask",
+                    dtype="bool",
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+
+        if local_alleles:
+            array_specs = convert_local_allele_field_types(array_specs, schema_instance)
+
+        schema_instance.fields = array_specs
+        return schema_instance
+
 
 @dataclasses.dataclass
 class IcfPartitionMetadata(core.JsonDataclass):
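
generate_schema sizes the allele, ploidy, and genotypes dimensions from the scan summaries: max_alleles is the observed ALT maximum plus one for REF (with a floor of 2); ploidy is the GT field's max_number minus one, apparently because the stored GT rows carry a trailing phased flag (see iter_genotypes above); and the genotypes dimension uses the same combinatorial count as the local-alleles code. Worked through with illustrative numbers:

    import math

    max_alt = 2                        # most ALT alleles seen at any site
    max_alleles = max(max_alt + 1, 2)  # REF + ALTs, at least 2 -> here 3
    ploidy = 2                         # diploid GT
    genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
    assert genotypes_size == 6         # 0/0 0/1 1/1 0/2 1/2 2/2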
@@ -987,7 +1303,7 @@ class IntermediateColumnarFormatWriter:
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=1,
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1139,7 +1455,9 @@ class IntermediateColumnarFormatWriter:
             f"{num_records} records last_pos={last_position}"
         )
 
-    def explode(self, *, worker_processes=1, show_progress=False):
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1207,7 +1525,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1232,7 +1550,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1255,3 +1573,167 @@ def explode_partition(icf_path, partition):
 def explode_finalise(icf_path):
     writer = IntermediateColumnarFormatWriter(icf_path)
     writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
+    if not path.exists():
+        raise ValueError(f"Path not found: {path}")
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    # NOTE: this is too strict, we should support more general Zarrs, see #276
+    elif (path / ".zmetadata").exists():
+        obj = vcz.VcfZarr(path)
+    else:
+        raise ValueError(f"{path} not in ICF or VCF Zarr format")
+    return obj.summary_table()
+
+
+def mkschema(
+    if_path,
+    out,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+):
+    store = IntermediateColumnarFormat(if_path)
+    spec = store.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
+    out.write(spec.asjson())
+
+
+def convert(
+    vcfs,
+    vcz_path,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    local_alleles=None,
+    show_progress=False,
+    icf_path=None,
+):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
+    if icf_path is None:
+        cm = temp_icf_path(prefix="vcf2zarr")
+    else:
+        cm = contextlib.nullcontext(icf_path)
+
+    with cm as icf_path:
+        explode(
+            icf_path,
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+        encode(
+            icf_path,
+            vcz_path,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            local_alleles=local_alleles,
+        )
+
+
+@contextlib.contextmanager
+def temp_icf_path(prefix=None):
+    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
+        yield pathlib.Path(tmp) / "icf"
+
+
+def encode(
+    icf_path,
+    zarr_path,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    local_alleles=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+    vzw.create_index()
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    icf_store = IntermediateColumnarFormat(icf_path)
+    if schema_path is None:
+        schema_instance = icf_store.generate_schema(
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            local_alleles=local_alleles,
+        )
+    else:
+        logger.info(f"Reading schema from {schema_path}")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
+        with open(schema_path) as f:
+            schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    vzw = vcz.VcfZarrWriter("icf", zarr_path)
+    return vzw.init(
+        icf_store,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
+    )
+
+
+def encode_partition(zarr_path, partition):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.finalise(show_progress=show_progress)
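
The release also adds a module-level pipeline API: explode writes the intermediate columnar format, encode turns an ICF into a VCF Zarr store, and convert chains the two through a temporary ICF when no icf_path is given. A hedged usage sketch (the file paths are hypothetical, and the stable public entry point may be the vcf2zarr CLI rather than these functions):

    # One-shot conversion; the ICF lives in a TemporaryDirectory and is
    # cleaned up afterwards (see temp_icf_path above).
    convert(
        ["chr20.vcf.gz"],   # hypothetical input VCF(s)
        "chr20.vcz",        # hypothetical output Zarr store
        worker_processes=8,
        show_progress=True,
    )

    # Equivalent two-step form that keeps the ICF around for inspection
    # or re-encoding with a different schema:
    explode("chr20.icf", ["chr20.vcf.gz"], worker_processes=8)
    encode("chr20.icf", "chr20.vcz", variants_chunk_size=10_000)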