bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -6,14 +6,19 @@ import logging
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
+import tempfile
+from functools import partial
 from typing import Any
 
 import numcodecs
 import numpy as np
 
-from .. import constants, core, provenance, vcf_utils
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME, zarr_exists
+
+from . import constants, core, provenance, vcf_utils, vcz
 
 logger = logging.getLogger(__name__)
 
@@ -77,6 +82,14 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"
 
+    @property
+    def max_number(self):
+        if self.vcf_number in ("R", "A", "G", "."):
+            return self.summary.max_number
+        else:
+            # use declared number if larger than max found
+            return max(self.summary.max_number, int(self.vcf_number))
+
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
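Editor's note: the new max_number property decides how wide a field's Zarr dimension must be. Variable-count VCF Numbers (R, A, G, .) are sized purely from what was observed in the data, while a fixed declared Number acts as a floor. A minimal sketch of that rule, using a hypothetical FieldSummary stand-in for the ICF field summary object:

    import dataclasses

    @dataclasses.dataclass
    class FieldSummary:  # hypothetical stand-in for the ICF summary
        max_number: int  # largest count actually observed in the data

    def max_number(vcf_number, summary):
        # Variable-count fields are sized from the data; fixed-count
        # fields use the declared Number if it is larger.
        if vcf_number in ("R", "A", "G", "."):
            return summary.max_number
        return max(summary.max_number, int(vcf_number))

    assert max_number("A", FieldSummary(3)) == 3  # observed count wins
    assert max_number("4", FieldSummary(2)) == 4  # declared Number is the floor
    assert max_number("2", FieldSummary(5)) == 5  # observed can exceed declared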
@@ -99,7 +112,7 @@ class VcfField:
             ret = "U1"
         else:
             assert self.vcf_type == "String"
-            ret = "O"
+            ret = STRING_DTYPE_NAME
         return ret
 
 
@@ -116,23 +129,6 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
 )
 
 
-@dataclasses.dataclass
-class Contig:
-    id: str
-    length: int = None
-
-
-@dataclasses.dataclass
-class Sample:
-    id: str
-
-
-@dataclasses.dataclass
-class Filter:
-    id: str
-    description: str = ""
-
-
 @dataclasses.dataclass
 class IcfMetadata(core.JsonDataclass):
     samples: list
@@ -187,9 +183,9 @@ class IcfMetadata(core.JsonDataclass):
         d = d.copy()
         d["partitions"] = partitions
         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
-        d["samples"] = [Sample(**sd) for sd in d["samples"]]
-        d["filters"] = [Filter(**fd) for fd in d["filters"]]
-        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
+        d["samples"] = [vcz.Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [vcz.Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [vcz.Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
     def __eq__(self, other):
@@ -240,7 +236,7 @@ def scan_vcf(path, target_num_partitions):
                     description = ""
                 if h["ID"] == "PASS":
                     pass_index = len(filters)
-                filters.append(Filter(h["ID"], description))
+                filters.append(vcz.Filter(h["ID"], description))
 
         # Ensure PASS is the first filter if present
         if pass_index > 0:
@@ -262,9 +258,9 @@ def scan_vcf(path, target_num_partitions):
             contig_lengths = [None for _ in vcf.seqnames]
 
         metadata = IcfMetadata(
-            samples=[Sample(sample_id) for sample_id in vcf.samples],
+            samples=[vcz.Sample(sample_id) for sample_id in vcf.samples],
             contigs=[
-                Contig(contig_id, length)
+                vcz.Contig(contig_id, length)
                 for contig_id, length in zip(vcf.seqnames, contig_lengths)
             ],
             filters=filters,
@@ -291,7 +287,12 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
@@ -366,64 +367,58 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     return icf_metadata, header
 
 
-def sanitise_value_bool(buff, j, value):
+def sanitise_value_bool(shape, value):
     x = True
     if value is None:
         x = False
-    buff[j] = x
+    return x
 
 
-def sanitise_value_float_scalar(buff, j, value):
+def sanitise_value_float_scalar(shape, value):
     x = value
     if value is None:
         x = [constants.FLOAT32_MISSING]
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_int_scalar(buff, j, value):
+def sanitise_value_int_scalar(shape, value):
     x = value
     if value is None:
-        # print("MISSING", INT_MISSING, INT_FILL)
        x = [constants.INT_MISSING]
     else:
         x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_string_scalar(buff, j, value):
+def sanitise_value_string_scalar(shape, value):
    if value is None:
-        buff[j] = "."
+        return "."
     else:
-        buff[j] = value[0]
+        return value[0]
 
 
-def sanitise_value_string_1d(buff, j, value):
+def sanitise_value_string_1d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
-        # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
-        # FIXME failure isn't coming from here, it seems to be from an
-        # incorrectly detected dimension in the zarr array
-        # The dimesions look all wrong, and the dtype should be Object
-        # not str
         value = drop_empty_second_dim(value)
-        buff[j] = ""
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, "", dtype=value.dtype)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_string_2d(buff, j, value):
+def sanitise_value_string_2d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
-        # print(buff.shape, value.dtype, value)
-        # assert value.ndim == 2
-        buff[j] = ""
+        result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
         if value.ndim == 2:
-            buff[j, :, : value.shape[1]] = value
+            result[: value.shape[0], : value.shape[1]] = value
         else:
-            # TODO check if this is still necessary
+            # Convert 1D array into 2D with appropriate shape
             for k, val in enumerate(value):
-                buff[j, k, : len(val)] = val
+                result[k, : len(val)] = val
+        return result
 
 
 def drop_empty_second_dim(value):
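Editor's note: the sanitisers above all switch to the same shape-first pattern in this release: allocate a full-size array of fill values, overwrite the leading entries with the observed data, and return it, rather than writing into one row of a caller-supplied buffer. A small self-contained illustration of the pad-with-fill step (using the -2 fill sentinel that the integer sanitisers below also use):

    import numpy as np

    def pad_to_shape(shape, values, fill):
        # Allocate the target-shaped array, then overwrite the leading
        # entries; trailing positions keep the fill (padding) value.
        result = np.full(shape, fill, dtype=np.int32)
        result[: len(values)] = np.asarray(values, dtype=np.int32)
        return result

    print(pad_to_shape((4,), [7, 8], fill=-2))  # -> [ 7  8 -2 -2]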
@@ -433,27 +428,28 @@ def drop_empty_second_dim(value):
     return value
 
 
-def sanitise_value_float_1d(buff, j, value):
+def sanitise_value_float_1d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
+        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
         value = drop_empty_second_dim(value)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_float_2d(buff, j, value):
+def sanitise_value_float_2d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        # print("value = ", value)
-        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, :, : value.shape[1]] = value
+        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 def sanitise_int_array(value, ndmin, dtype):
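Editor's note: the float sanitisers distinguish two different NaN values: constants.FLOAT32_MISSING for a value that was missing (".") in the VCF, and constants.FLOAT32_FILL for padding beyond a variant's actual values. Assuming these constants follow the BCF float sentinel encoding (bit patterns 0x7F800001 and 0x7F800002), the explicit np.isnan reassignment above is needed because a plain NaN produced by NumPy is indistinguishable from either sentinel by value, only by bits:

    import numpy as np

    # Hedged sketch: sentinel NaNs distinguished only by their bit
    # patterns, assuming the BCF-style encoding.
    sentinels = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(np.float32)
    missing, fill = sentinels

    x = np.array([1.5, np.nan], dtype=np.float32)
    x[np.isnan(x)] = missing  # normalise any plain NaN to the missing sentinel
    print(x.view(np.int32))   # on typical platforms: [1069547520 2139095041]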
@@ -468,23 +464,25 @@ def sanitise_int_array(value, ndmin, dtype):
     return value.astype(dtype)
 
 
-def sanitise_value_int_1d(buff, j, value):
+def sanitise_value_int_1d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 1, buff.dtype)
+        value = sanitise_int_array(value, 1, np.int32)
         value = drop_empty_second_dim(value)
-        buff[j] = -2
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, -2, dtype=np.int32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_int_2d(buff, j, value):
+def sanitise_value_int_2d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 2, buff.dtype)
-        buff[j] = -2
-        buff[j, :, : value.shape[1]] = value
+        value = sanitise_int_array(value, 2, np.int32)
+        result = np.full(shape, -2, dtype=np.int32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 missing_value_map = {
@@ -573,7 +571,12 @@ class StringValueTransformer(VcfValueTransformer):
             value = np.array(list(vcf_value.split(",")))
         else:
             # TODO can we make this faster??
-            value = np.array([v.split(",") for v in vcf_value], dtype="O")
+            var_len_values = [v.split(",") for v in vcf_value]
+            number = max(len(v) for v in var_len_values)
+            value = np.array(
+                [v + [""] * (number - len(v)) for v in var_len_values],
+                dtype=STRING_DTYPE_NAME,
+            )
         # print("HERE", vcf_value, value)
         # for v in vcf_value:
         #     print("\t", type(v), len(v), v.split(","))
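Editor's note: the StringValueTransformer change replaces the ragged dtype="O" array with a rectangular one, padding shorter comma-separated lists with empty strings. A standalone sketch of the padding step (dtype=object stands in here for STRING_DTYPE_NAME, whose concrete value comes from bio2zarr.zarr_utils):

    import numpy as np

    vcf_value = ["a,b", "c"]                     # per-sample FORMAT strings
    var_len = [v.split(",") for v in vcf_value]  # ragged: [['a', 'b'], ['c']]
    width = max(len(v) for v in var_len)
    value = np.array(
        [v + [""] * (width - len(v)) for v in var_len],
        dtype=object,  # the real code uses STRING_DTYPE_NAME here
    )
    print(value)  # [['a' 'b'] ['c' '']]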
@@ -648,7 +651,8 @@ class IntermediateColumnarFormatField:
         chunk_cumulative_records = self.chunk_record_index(partition_id)
         chunk_num_records = np.diff(chunk_cumulative_records)
         for count, cumulative in zip(
-            chunk_num_records[start_chunk:], chunk_cumulative_records[start_chunk + 1 :]
+            chunk_num_records[start_chunk:],
+            chunk_cumulative_records[start_chunk + 1 :],
         ):
             path = partition_path / f"{cumulative}"
             chunk = self.read_chunk(path)
@@ -707,36 +711,32 @@ class IntermediateColumnarFormatField:
         return ret
 
     def sanitiser_factory(self, shape):
-        """
-        Return a function that sanitised values from this column
-        and writes into a buffer of the specified shape.
-        """
-        assert len(shape) <= 3
+        assert len(shape) <= 2
         if self.vcf_field.vcf_type == "Flag":
-            assert len(shape) == 1
-            return sanitise_value_bool
+            assert len(shape) == 0
+            return partial(sanitise_value_bool, shape)
         elif self.vcf_field.vcf_type == "Float":
-            if len(shape) == 1:
-                return sanitise_value_float_scalar
-            elif len(shape) == 2:
-                return sanitise_value_float_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_float_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_float_1d, shape)
             else:
-                return sanitise_value_float_2d
+                return partial(sanitise_value_float_2d, shape)
         elif self.vcf_field.vcf_type == "Integer":
-            if len(shape) == 1:
-                return sanitise_value_int_scalar
-            elif len(shape) == 2:
-                return sanitise_value_int_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_int_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_int_1d, shape)
             else:
-                return sanitise_value_int_2d
+                return partial(sanitise_value_int_2d, shape)
         else:
             assert self.vcf_field.vcf_type in ("String", "Character")
-            if len(shape) == 1:
-                return sanitise_value_string_scalar
-            elif len(shape) == 2:
-                return sanitise_value_string_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_string_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_string_1d, shape)
             else:
-                return sanitise_value_string_2d
 
 
 @dataclasses.dataclass
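Editor's note: sanitiser_factory now pre-binds the target shape with functools.partial, so the returned callable takes only the raw value; the shape ranks also drop by one because the factory now describes a single variant rather than a whole buffer of variants. A simplified illustration of the partial-binding idiom (drop_empty_second_dim and the int-array coercion are omitted):

    from functools import partial
    import numpy as np

    def sanitise_value_int_1d(shape, value):
        # Same contract as the refactored sanitiser above: -1 = missing,
        # -2 = fill/padding beyond the observed values.
        if value is None:
            return np.full(shape, -1)
        result = np.full(shape, -2, dtype=np.int32)
        value = np.asarray(value, dtype=np.int32)
        result[: value.shape[0]] = value
        return result

    sanitiser = partial(sanitise_value_int_1d, (3,))  # shape fixed up front
    print(sanitiser([5]))   # [ 5 -2 -2]
    print(sanitiser(None))  # [-1 -1 -1]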
@@ -843,9 +843,66 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-class IntermediateColumnarFormat(collections.abc.Mapping):
+def convert_local_allele_field_types(fields, schema_instance):
+    """
+    Update the specified list of fields to include the LAA field, and to convert
+    any supported localisable fields to the L* counterpart.
+
+    Note that we currently support only two ALT alleles per sample, and so the
+    dimensions of these fields are fixed by that requirement. Later versions may
+    use summary data stored in the ICF to make different choices, if information
+    about subsequent alleles (not in the actual genotype calls) should also be
+    stored.
+    """
+    fields_by_name = {field.name: field for field in fields}
+    gt = fields_by_name["call_genotype"]
+
+    if schema_instance.get_shape(["ploidy"])[0] != 2:
+        raise ValueError("Local alleles only supported on diploid data")
+
+    dimensions = gt.dimensions[:-1]
+
+    la = vcz.ZarrArraySpec(
+        name="call_LA",
+        dtype="i1",
+        dimensions=(*dimensions, "local_alleles"),
+        description=(
+            "0-based indices into REF+ALT, indicating which alleles"
+            " are relevant (local) for the current sample"
+        ),
+    )
+    schema_instance.dimensions["local_alleles"] = vcz.VcfZarrDimension.unchunked(
+        schema_instance.dimensions["ploidy"].size
+    )
+
+    ad = fields_by_name.get("call_AD", None)
+    if ad is not None:
+        # TODO check if call_LAD is in the list already
+        ad.name = "call_LAD"
+        ad.source = None
+        ad.dimensions = (*dimensions, "local_alleles_AD")
+        ad.description += " (local-alleles)"
+        schema_instance.dimensions["local_alleles_AD"] = vcz.VcfZarrDimension.unchunked(
+            2
+        )
+
+    pl = fields_by_name.get("call_PL", None)
+    if pl is not None:
+        # TODO check if call_LPL is in the list already
+        pl.name = "call_LPL"
+        pl.source = None
+        pl.description += " (local-alleles)"
+        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1].split("_")[-1])
+        schema_instance.dimensions["local_" + pl.dimensions[-1].split("_")[-1]] = (
+            vcz.VcfZarrDimension.unchunked(3)
+        )
+
+    return [*fields, la]
+
+
+class IntermediateColumnarFormat(vcz.Source):
     def __init__(self, path):
-        self.path = pathlib.Path(path)
+        self._path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
         # directory is either a WIP or the wrong format.
         with open(self.path / "metadata.json") as f:
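Editor's note: convert_local_allele_field_types rewrites AD and PL into their localised counterparts (call_LAD, call_LPL) and adds call_LA, which records, for each sample, the indices of the alleles its own call actually uses. A hypothetical illustration of the layout this produces for one diploid call:

    # Site alleles: REF + two ALTs; this sample's genotype is 0/2.
    alleles = ["A", "T", "G"]
    genotype = [0, 2]
    call_LA = sorted(set(genotype))      # [0, 2]: local allele indices
    AD = [10, 0, 7]                      # depths for the full allele list
    call_LAD = [AD[i] for i in call_LA]  # [10, 7]: only the local alleles

With at most two local alleles per diploid sample, LAD needs 2 entries and LPL 3 (the genotypes formed from two alleles), which is why the new dimensions are fixed at sizes 2 and 3.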
@@ -859,8 +916,12 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
+        self.gt_field = None
         for field in self.metadata.fields:
             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
+            if field.name == "GT":
+                self.gt_field = field
+
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
             f"records={self.num_records}, fields={self.num_fields})"
@@ -868,20 +929,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"IntermediateColumnarFormat(fields={len(self.fields)}, "
             f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )
 
-    def __getitem__(self, key):
-        return self.fields[key]
-
-    def __iter__(self):
-        return iter(self.fields)
-
-    def __len__(self):
-        return len(self.fields)
-
     def summary_table(self):
         data = []
         for name, icf_field in self.fields.items():
@@ -900,6 +952,10 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data
 
+    @property
+    def path(self):
+        return self._path
+
     @property
     def num_records(self):
         return self.metadata.num_records
@@ -908,6 +964,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_partitions(self):
         return len(self.metadata.partitions)
 
+    @property
+    def samples(self):
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
+
     @property
     def num_samples(self):
         return len(self.metadata.samples)
@@ -916,6 +984,265 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_fields(self):
         return len(self.fields)
 
+    @property
+    def root_attrs(self):
+        meta_information_pattern = re.compile("##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            match = re.fullmatch(meta_information_pattern, line)
+            if match:
+                key = match.group(1)
+                if key in ("contig", "FILTER", "INFO", "FORMAT"):
+                    # these fields are stored in Zarr arrays
+                    continue
+                value = match.group(2)
+                vcf_meta_information.append((key, value))
+        return {
+            "vcf_meta_information": vcf_meta_information,
+        }
+
+    def iter_id(self, start, stop):
+        for value in self.fields["ID"].iter_values(start, stop):
+            if value is not None:
+                yield value[0]
+            else:
+                yield None
+
+    def iter_filters(self, start, stop):
+        source_field = self.fields["FILTERS"]
+        lookup = {filt.id: index for index, filt in enumerate(self.metadata.filters)}
+
+        for filter_values in source_field.iter_values(start, stop):
+            filters = np.zeros(len(self.metadata.filters), dtype=bool)
+            if filter_values is not None:
+                for filter_id in filter_values:
+                    try:
+                        filters[lookup[filter_id]] = True
+                    except KeyError:
+                        raise ValueError(
+                            f"Filter '{filter_id}' was not defined in the header."
+                        ) from None
+            yield filters
+
+    def iter_contig(self, start, stop):
+        source_field = self.fields["CHROM"]
+        lookup = {
+            contig.id: index for index, contig in enumerate(self.metadata.contigs)
+        }
+
+        for value in source_field.iter_values(start, stop):
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            yield lookup[value[0]]
+
+    def iter_field(self, field_name, shape, start, stop):
+        source_field = self.fields[field_name]
+        sanitiser = source_field.sanitiser_factory(shape)
+        for value in source_field.iter_values(start, stop):
+            yield sanitiser(value)
+
+    def iter_alleles(self, start, stop, num_alleles):
+        ref_field = self.fields["REF"]
+        alt_field = self.fields["ALT"]
+
+        for ref, alt in zip(
+            ref_field.iter_values(start, stop),
+            alt_field.iter_values(start, stop),
+        ):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
+            alleles[0] = ref[0]
+            alleles[1 : 1 + len(alt)] = alt
+            yield alleles
+
+    def iter_genotypes(self, shape, start, stop):
+        source_field = self.fields["FORMAT/GT"]
+        for value in source_field.iter_values(start, stop):
+            genotypes = value[:, :-1] if value is not None else None
+            phased = value[:, -1] if value is not None else None
+            sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
+            sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
+            # Force haploids to always be phased
+            # https://github.com/sgkit-dev/bio2zarr/issues/399
+            if sanitised_genotypes.shape[1] == 1:
+                sanitised_phased[:] = True
+            yield sanitised_genotypes, sanitised_phased
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
+        if self.gt_field is None or shape is None:
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                # Stored ICF values are always at least 1D arrays; "rlen" is Number=1,
+                # so we must extract the scalar to avoid NumPy scalar-conversion issues.
+                yield vcz.VariantData(variant_length[0], alleles, None, None)
+        else:
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
+                self.iter_alleles(start, stop, num_alleles),
+                self.iter_genotypes(shape, start, stop),
+            ):
+                yield vcz.VariantData(variant_length[0], alleles, gt, phased)
+
+    def generate_schema(
+        self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
+    ):
+        if local_alleles is None:
+            local_alleles = False
+
+        max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
+
+        # Add ploidy and genotypes dimensions only when needed
+        max_genotypes = 0
+        has_g_field = False
+        for field in self.metadata.format_fields:
+            if field.vcf_number == "G":
+                has_g_field = True
+                max_genotypes = max(max_genotypes, field.summary.max_number)
+
+        ploidy = None
+        genotypes_size = None
+        if self.gt_field is not None:
+            ploidy = max(self.gt_field.summary.max_number - 1, 1)
+            # NOTE: it's not clear why we're computing this, when we must have had
+            # at least one number=G field to require it anyway?
+            genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
+            # assert max_genotypes == genotypes_size
+        else:
+            if max_genotypes > 0 or has_g_field:
+                # there is no GT field, but there is at least one Number=G field,
+                # so need to define genotypes dimension
+                genotypes_size = max_genotypes
+
+        dimensions = vcz.standard_dimensions(
+            variants_size=self.num_records,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=self.num_samples,
+            samples_chunk_size=samples_chunk_size,
+            alleles_size=max_alleles,
+            filters_size=self.metadata.num_filters,
+            ploidy_size=ploidy,
+            genotypes_size=genotypes_size,
+        )
+
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+
+        def spec_from_field(field, array_name=None):
+            return vcz.ZarrArraySpec.from_field(
+                field,
+                schema_instance,
+                array_name=array_name,
+            )
+
+        def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
+                source=source,
+                name=name,
+                dtype=dtype,
+                description="",
+                dimensions=dimensions,
+                compressor=compressor,
+            )
+
+        name_map = {field.full_name: field for field in self.metadata.fields}
+        array_specs = [
+            fixed_field_spec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, self.metadata.num_contigs),
+            ),
+            fixed_field_spec(
+                name="variant_filter",
+                dtype="bool",
+                dimensions=["variants", "filters"],
+            ),
+            fixed_field_spec(
+                name="variant_allele",
+                dtype=STRING_DTYPE_NAME,
+                dimensions=["variants", "alleles"],
+            ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
+            fixed_field_spec(
+                name="variant_id",
+                dtype=STRING_DTYPE_NAME,
+            ),
+            fixed_field_spec(
+                name="variant_id_mask",
+                dtype="bool",
+            ),
+        ]
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        array_specs.extend(
+            [
+                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
+                spec_from_field(name_map["POS"], array_name="variant_position"),
+            ]
+        )
+        array_specs.extend(
+            [spec_from_field(field) for field in self.metadata.info_fields]
+        )
+
+        for field in self.metadata.format_fields:
+            if field.name == "GT":
+                continue
+            array_specs.append(spec_from_field(field))
+
+        if self.gt_field is not None and self.num_samples > 0:
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_phased",
+                    dtype="bool",
+                    dimensions=["variants", "samples"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype",
+                    dtype=self.gt_field.smallest_dtype(),
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_mask",
+                    dtype="bool",
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+
+        if local_alleles:
+            array_specs = convert_local_allele_field_types(array_specs, schema_instance)
+
+        schema_instance.fields = array_specs
+        return schema_instance
+
 
 @dataclasses.dataclass
 class IcfPartitionMetadata(core.JsonDataclass):
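Editor's note: the genotypes dimension size in generate_schema uses the multiset coefficient: the number of distinct unphased genotypes for a alleles at ploidy p is C(a + p - 1, p), which is exactly what the math.comb call computes:

    import math

    def genotype_count(num_alleles, ploidy):
        # number of multisets of size `ploidy` drawn from `num_alleles`
        return math.comb(num_alleles + ploidy - 1, ploidy)

    assert genotype_count(2, 2) == 3  # 0/0, 0/1, 1/1
    assert genotype_count(3, 2) == 6  # diploid, REF + 2 ALTs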
@@ -987,7 +1314,7 @@ class IntermediateColumnarFormatWriter:
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=1,
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1139,7 +1466,9 @@ class IntermediateColumnarFormatWriter:
             f"{num_records} records last_pos={last_position}"
         )
 
-    def explode(self, *, worker_processes=1, show_progress=False):
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1207,7 +1536,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1232,7 +1561,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1255,3 +1584,166 @@ def explode_partition(icf_path, partition):
 def explode_finalise(icf_path):
     writer = IntermediateColumnarFormatWriter(icf_path)
     writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
+    if not path.exists():
+        raise ValueError(f"Path not found: {path}")
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    elif zarr_exists(path):
+        obj = vcz.VcfZarr(path)
+    else:
+        raise ValueError(f"{path} not in ICF or VCF Zarr format")
+    return obj.summary_table()
+
+
+def mkschema(
+    if_path,
+    out,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+):
+    store = IntermediateColumnarFormat(if_path)
+    spec = store.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
+    out.write(spec.asjson())
+
+
+def convert(
+    vcfs,
+    vcz_path,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    local_alleles=None,
+    show_progress=False,
+    icf_path=None,
+):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
+    if icf_path is None:
+        cm = temp_icf_path(prefix="vcf2zarr")
+    else:
+        cm = contextlib.nullcontext(icf_path)
+
+    with cm as icf_path:
+        explode(
+            icf_path,
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+        encode(
+            icf_path,
+            vcz_path,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            local_alleles=local_alleles,
+        )
+
+
+@contextlib.contextmanager
+def temp_icf_path(prefix=None):
+    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
+        yield pathlib.Path(tmp) / "icf"
+
+
+def encode(
+    icf_path,
+    zarr_path,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    local_alleles=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+    vzw.create_index()
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    icf_store = IntermediateColumnarFormat(icf_path)
+    if schema_path is None:
+        schema_instance = icf_store.generate_schema(
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            local_alleles=local_alleles,
+        )
+    else:
+        logger.info(f"Reading schema from {schema_path}")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
+        with open(schema_path) as f:
+            schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    vzw = vcz.VcfZarrWriter("icf", zarr_path)
+    return vzw.init(
+        icf_store,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
+    )
+
+
+def encode_partition(zarr_path, partition):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.finalise(show_progress=show_progress)
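Editor's note: taken together, these additions give the module a complete one-shot pipeline: convert() explodes the VCFs into a (possibly temporary) ICF and then encodes it to Zarr. A hedged usage sketch; the input and output paths are placeholders, and the import path is assumed since it is not shown in this diff:

    # Hypothetical usage of the convert() function defined above.
    from bio2zarr import vcf2zarr  # assumed module path; not shown in this diff

    vcf2zarr.convert(
        ["chr1.vcf.gz"],  # list of input VCF paths
        "chr1.vcz",       # output VCF Zarr store
        worker_processes=4,
        show_progress=True,
    )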