bio2zarr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of bio2zarr has been flagged as potentially problematic.

@@ -6,14 +6,17 @@ import logging
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
+import tempfile
+from functools import partial
 from typing import Any
 
 import numcodecs
 import numpy as np
 
-from .. import constants, core, provenance, vcf_utils
+from . import constants, core, provenance, vcf_utils, vcz
 
 logger = logging.getLogger(__name__)
 
@@ -77,6 +80,14 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"
 
+    @property
+    def max_number(self):
+        if self.vcf_number in ("R", "A", "G", "."):
+            return self.summary.max_number
+        else:
+            # use declared number if larger than max found
+            return max(self.summary.max_number, int(self.vcf_number))
+
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
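The new max_number property reconciles the Number declared in the VCF header with the largest count actually observed during scanning. A minimal sketch of the rule (a hypothetical standalone function mirroring the property above):

    def max_number(vcf_number, observed_max):
        # Variable-count fields (R, A, G, .) have no usable declared size,
        # so only the observed maximum matters.
        if vcf_number in ("R", "A", "G", "."):
            return observed_max
        # Fixed-count fields: trust the declared number if it is larger.
        return max(observed_max, int(vcf_number))

    assert max_number("A", 3) == 3  # Number=A field, at most 3 ALT values seen
    assert max_number("2", 1) == 2  # declared Number=2 wins over observed 1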
@@ -116,23 +127,6 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
 )
 
 
-@dataclasses.dataclass
-class Contig:
-    id: str
-    length: int = None
-
-
-@dataclasses.dataclass
-class Sample:
-    id: str
-
-
-@dataclasses.dataclass
-class Filter:
-    id: str
-    description: str = ""
-
-
 @dataclasses.dataclass
 class IcfMetadata(core.JsonDataclass):
     samples: list
@@ -187,9 +181,9 @@ class IcfMetadata(core.JsonDataclass):
         d = d.copy()
         d["partitions"] = partitions
         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
-        d["samples"] = [Sample(**sd) for sd in d["samples"]]
-        d["filters"] = [Filter(**fd) for fd in d["filters"]]
-        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
+        d["samples"] = [vcz.Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [vcz.Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [vcz.Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
     def __eq__(self, other):
@@ -228,8 +222,8 @@ def fixed_vcf_field_definitions():
 
 
 def scan_vcf(path, target_num_partitions):
-    with vcf_utils.IndexedVcf(path) as indexed_vcf:
-        vcf = indexed_vcf.vcf
+    with vcf_utils.VcfFile(path) as vcf_file:
+        vcf = vcf_file.vcf
         filters = []
         pass_index = -1
         for h in vcf.header_iter():
@@ -240,7 +234,7 @@ def scan_vcf(path, target_num_partitions):
                     description = ""
                 if h["ID"] == "PASS":
                     pass_index = len(filters)
-                filters.append(Filter(h["ID"], description))
+                filters.append(vcz.Filter(h["ID"], description))
 
         # Ensure PASS is the first filter if present
         if pass_index > 0:
@@ -262,18 +256,18 @@ def scan_vcf(path, target_num_partitions):
         contig_lengths = [None for _ in vcf.seqnames]
 
         metadata = IcfMetadata(
-            samples=[Sample(sample_id) for sample_id in vcf.samples],
+            samples=[vcz.Sample(sample_id) for sample_id in vcf.samples],
             contigs=[
-                Contig(contig_id, length)
+                vcz.Contig(contig_id, length)
                 for contig_id, length in zip(vcf.seqnames, contig_lengths)
             ],
             filters=filters,
             fields=fields,
             partitions=[],
-            num_records=sum(indexed_vcf.contig_record_counts().values()),
+            num_records=sum(vcf_file.contig_record_counts().values()),
         )
 
-        regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
+        regions = vcf_file.partition_into_regions(num_parts=target_num_partitions)
         for region in regions:
             metadata.partitions.append(
                 VcfPartition(
@@ -291,7 +285,12 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
@@ -324,14 +323,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # are compatible.
     all_partitions = []
     total_records = 0
+    contigs = {}
     for metadata, _ in results:
         for partition in metadata.partitions:
             logger.debug(f"Scanned partition {partition}")
             all_partitions.append(partition)
+        for contig in metadata.contigs:
+            if contig.id in contigs:
+                if contig != contigs[contig.id]:
+                    raise ValueError(
+                        "Incompatible contig definitions: "
+                        f"{contig} != {contigs[contig.id]}"
+                    )
+            else:
+                contigs[contig.id] = contig
         total_records += metadata.num_records
         metadata.num_records = 0
         metadata.partitions = []
 
+    contig_union = list(contigs.values())
+    for metadata, _ in results:
+        metadata.contigs = contig_union
+
     icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
         if metadata != icf_metadata:
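scan_vcfs now merges contig definitions across all input VCFs into a single union, rejecting inputs that declare the same contig ID differently (for example, with two different lengths). A hypothetical standalone sketch of the same merge rule:

    import dataclasses

    @dataclasses.dataclass
    class Contig:  # stand-in for vcz.Contig
        id: str
        length: int = None

    def union_contigs(per_file_contigs):
        contigs = {}
        for file_contigs in per_file_contigs:
            for contig in file_contigs:
                if contig.id in contigs and contig != contigs[contig.id]:
                    raise ValueError(f"Incompatible contig definitions: {contig}")
                contigs[contig.id] = contig
        return list(contigs.values())

    # Files sharing chr1 with identical lengths merge cleanly:
    union_contigs([[Contig("chr1", 1000)], [Contig("chr1", 1000), Contig("chr2", 2000)]])
    # The same ID with a different length raises ValueError.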
@@ -352,64 +365,58 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     return icf_metadata, header
 
 
-def sanitise_value_bool(buff, j, value):
+def sanitise_value_bool(shape, value):
     x = True
     if value is None:
         x = False
-    buff[j] = x
+    return x
 
 
-def sanitise_value_float_scalar(buff, j, value):
+def sanitise_value_float_scalar(shape, value):
     x = value
     if value is None:
         x = [constants.FLOAT32_MISSING]
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_int_scalar(buff, j, value):
+def sanitise_value_int_scalar(shape, value):
     x = value
     if value is None:
-        # print("MISSING", INT_MISSING, INT_FILL)
         x = [constants.INT_MISSING]
     else:
         x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_string_scalar(buff, j, value):
+def sanitise_value_string_scalar(shape, value):
     if value is None:
-        buff[j] = "."
+        return "."
     else:
-        buff[j] = value[0]
+        return value[0]
 
 
-def sanitise_value_string_1d(buff, j, value):
+def sanitise_value_string_1d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype="O")
     else:
-        # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
-        # FIXME failure isn't coming from here, it seems to be from an
-        # incorrectly detected dimension in the zarr array
-        # The dimesions look all wrong, and the dtype should be Object
-        # not str
         value = drop_empty_second_dim(value)
-        buff[j] = ""
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, "", dtype=value.dtype)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_string_2d(buff, j, value):
+def sanitise_value_string_2d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype="O")
     else:
-        # print(buff.shape, value.dtype, value)
-        # assert value.ndim == 2
-        buff[j] = ""
+        result = np.full(shape, "", dtype="O")
         if value.ndim == 2:
-            buff[j, :, : value.shape[1]] = value
+            result[: value.shape[0], : value.shape[1]] = value
         else:
-            # TODO check if this is still necessary
+            # Convert 1D array into 2D with appropriate shape
             for k, val in enumerate(value):
-                buff[j, k, : len(val)] = val
+                result[k, : len(val)] = val
+        return result
 
 
 def drop_empty_second_dim(value):
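Throughout this block the sanitisers change calling convention: instead of writing into row j of a caller-supplied buffer, each now takes the per-variant target shape and returns a freshly filled array. A self-contained sketch of the new convention (a simplified stand-in, not the module code):

    import numpy as np

    def sanitise_int_1d(shape, value):
        if value is None:
            return np.full(shape, -1)  # whole entry missing
        result = np.full(shape, -2, dtype=np.int32)  # pad with the fill value
        result[: len(value)] = value
        return result

    row = sanitise_int_1d((4,), [1, 2])
    # array([ 1,  2, -2, -2], dtype=int32); the caller then assigns buff[j] = row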
@@ -419,27 +426,28 @@ def drop_empty_second_dim(value):
         return value
 
 
-def sanitise_value_float_1d(buff, j, value):
+def sanitise_value_float_1d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
+        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
         value = drop_empty_second_dim(value)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_float_2d(buff, j, value):
+def sanitise_value_float_2d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        # print("value = ", value)
-        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, :, : value.shape[1]] = value
+        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 def sanitise_int_array(value, ndmin, dtype):
@@ -454,23 +462,25 @@ def sanitise_int_array(value, ndmin, dtype):
     return value.astype(dtype)
 
 
-def sanitise_value_int_1d(buff, j, value):
+def sanitise_value_int_1d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 1, buff.dtype)
+        value = sanitise_int_array(value, 1, np.int32)
         value = drop_empty_second_dim(value)
-        buff[j] = -2
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, -2, dtype=np.int32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_int_2d(buff, j, value):
+def sanitise_value_int_2d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 2, buff.dtype)
-        buff[j] = -2
-        buff[j, :, : value.shape[1]] = value
+        value = sanitise_int_array(value, 2, np.int32)
+        result = np.full(shape, -2, dtype=np.int32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 missing_value_map = {
@@ -634,7 +644,8 @@ class IntermediateColumnarFormatField:
         chunk_cumulative_records = self.chunk_record_index(partition_id)
         chunk_num_records = np.diff(chunk_cumulative_records)
         for count, cumulative in zip(
-            chunk_num_records[start_chunk:], chunk_cumulative_records[start_chunk + 1 :]
+            chunk_num_records[start_chunk:],
+            chunk_cumulative_records[start_chunk + 1 :],
         ):
             path = partition_path / f"{cumulative}"
             chunk = self.read_chunk(path)
@@ -693,36 +704,32 @@ class IntermediateColumnarFormatField:
         return ret
 
     def sanitiser_factory(self, shape):
-        """
-        Return a function that sanitised values from this column
-        and writes into a buffer of the specified shape.
-        """
-        assert len(shape) <= 3
+        assert len(shape) <= 2
         if self.vcf_field.vcf_type == "Flag":
-            assert len(shape) == 1
-            return sanitise_value_bool
+            assert len(shape) == 0
+            return partial(sanitise_value_bool, shape)
         elif self.vcf_field.vcf_type == "Float":
-            if len(shape) == 1:
-                return sanitise_value_float_scalar
-            elif len(shape) == 2:
-                return sanitise_value_float_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_float_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_float_1d, shape)
             else:
-                return sanitise_value_float_2d
+                return partial(sanitise_value_float_2d, shape)
         elif self.vcf_field.vcf_type == "Integer":
-            if len(shape) == 1:
-                return sanitise_value_int_scalar
-            elif len(shape) == 2:
-                return sanitise_value_int_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_int_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_int_1d, shape)
             else:
-                return sanitise_value_int_2d
+                return partial(sanitise_value_int_2d, shape)
         else:
             assert self.vcf_field.vcf_type in ("String", "Character")
-            if len(shape) == 1:
-                return sanitise_value_string_scalar
-            elif len(shape) == 2:
-                return sanitise_value_string_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_string_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_string_1d, shape)
             else:
-                return sanitise_value_string_2d
+                return partial(sanitise_value_string_2d, shape)
 
 
 @dataclasses.dataclass
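Because the sanitisers now take the shape as their first argument, sanitiser_factory can bind it once with functools.partial and return a one-argument callable; note also that the shape no longer includes the leading variants dimension, so every len(shape) test shifts down by one. An illustrative use of the same currying pattern:

    from functools import partial
    import numpy as np

    def sanitise_value_int_1d(shape, value):  # simplified stand-in
        result = np.full(shape, -2, dtype=np.int32)
        if value is not None:
            result[: len(value)] = value
        return result

    sanitiser = partial(sanitise_value_int_1d, (3,))  # shape bound once
    rows = [sanitiser(v) for v in ([7], None, [1, 2])]  # one call per variant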
@@ -829,9 +836,66 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-class IntermediateColumnarFormat(collections.abc.Mapping):
+def convert_local_allele_field_types(fields, schema_instance):
+    """
+    Update the specified list of fields to include the LAA field, and to convert
+    any supported localisable fields to the L* counterpart.
+
+    Note that we currently support only two ALT alleles per sample, and so the
+    dimensions of these fields are fixed by that requirement. Later versions may
+    use summary data stored in the ICF to make different choices, if information
+    about subsequent alleles (not in the actual genotype calls) should also be
+    stored.
+    """
+    fields_by_name = {field.name: field for field in fields}
+    gt = fields_by_name["call_genotype"]
+
+    if schema_instance.get_shape(["ploidy"])[0] != 2:
+        raise ValueError("Local alleles only supported on diploid data")
+
+    dimensions = gt.dimensions[:-1]
+
+    la = vcz.ZarrArraySpec(
+        name="call_LA",
+        dtype="i1",
+        dimensions=(*dimensions, "local_alleles"),
+        description=(
+            "0-based indices into REF+ALT, indicating which alleles"
+            " are relevant (local) for the current sample"
+        ),
+    )
+    schema_instance.dimensions["local_alleles"] = vcz.VcfZarrDimension.unchunked(
+        schema_instance.dimensions["ploidy"].size
+    )
+
+    ad = fields_by_name.get("call_AD", None)
+    if ad is not None:
+        # TODO check if call_LAD is in the list already
+        ad.name = "call_LAD"
+        ad.source = None
+        ad.dimensions = (*dimensions, "local_alleles_AD")
+        ad.description += " (local-alleles)"
+        schema_instance.dimensions["local_alleles_AD"] = vcz.VcfZarrDimension.unchunked(
+            2
+        )
+
+    pl = fields_by_name.get("call_PL", None)
+    if pl is not None:
+        # TODO check if call_LPL is in the list already
+        pl.name = "call_LPL"
+        pl.source = None
+        pl.description += " (local-alleles)"
+        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1].split("_")[-1])
+        schema_instance.dimensions["local_" + pl.dimensions[-1].split("_")[-1]] = (
+            vcz.VcfZarrDimension.unchunked(3)
+        )
+
+    return [*fields, la]
+
+
+class IntermediateColumnarFormat(vcz.Source):
     def __init__(self, path):
-        self.path = pathlib.Path(path)
+        self._path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
         # directory is either a WIP or the wrong format.
         with open(self.path / "metadata.json") as f:
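For context: the local-alleles encoding stores, per call, which of the site's REF/ALT alleles are actually used, so that wide Number=R/G fields can be stored at a fixed small width. Under the diploid, two-local-alleles restriction enforced above, the dimension sizes in this function work out as follows (illustrative arithmetic only):

    import math

    ploidy = 2         # diploid only, per the ValueError above
    local_alleles = 2  # call_LA width equals the ploidy
    lad_width = local_alleles  # call_LAD: one depth per local allele -> 2
    # call_LPL: unordered genotypes over the local alleles
    lpl_width = math.comb(local_alleles + ploidy - 1, ploidy)  # -> 3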
@@ -845,8 +909,12 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
+        self.gt_field = None
         for field in self.metadata.fields:
             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
+            if field.name == "GT":
+                self.gt_field = field
+
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
             f"records={self.num_records}, fields={self.num_fields})"
@@ -854,20 +922,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"IntermediateColumnarFormat(fields={len(self.fields)}, "
             f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )
 
-    def __getitem__(self, key):
-        return self.fields[key]
-
-    def __iter__(self):
-        return iter(self.fields)
-
-    def __len__(self):
-        return len(self.fields)
-
     def summary_table(self):
         data = []
         for name, icf_field in self.fields.items():
@@ -886,6 +945,10 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data
 
+    @property
+    def path(self):
+        return self._path
+
     @property
     def num_records(self):
         return self.metadata.num_records
@@ -894,6 +957,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_partitions(self):
         return len(self.metadata.partitions)
 
+    @property
+    def samples(self):
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
+
     @property
     def num_samples(self):
         return len(self.metadata.samples)
@@ -902,6 +977,261 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_fields(self):
         return len(self.fields)
 
+    @property
+    def root_attrs(self):
+        meta_information_pattern = re.compile("##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            match = re.fullmatch(meta_information_pattern, line)
+            if match:
+                key = match.group(1)
+                if key in ("contig", "FILTER", "INFO", "FORMAT"):
+                    # these fields are stored in Zarr arrays
+                    continue
+                value = match.group(2)
+                vcf_meta_information.append((key, value))
+        return {
+            "vcf_meta_information": vcf_meta_information,
+        }
+
+    def iter_id(self, start, stop):
+        for value in self.fields["ID"].iter_values(start, stop):
+            if value is not None:
+                yield value[0]
+            else:
+                yield None
+
+    def iter_filters(self, start, stop):
+        source_field = self.fields["FILTERS"]
+        lookup = {filt.id: index for index, filt in enumerate(self.metadata.filters)}
+
+        for filter_values in source_field.iter_values(start, stop):
+            filters = np.zeros(len(self.metadata.filters), dtype=bool)
+            if filter_values is not None:
+                for filter_id in filter_values:
+                    try:
+                        filters[lookup[filter_id]] = True
+                    except KeyError:
+                        raise ValueError(
+                            f"Filter '{filter_id}' was not defined in the header."
+                        ) from None
+            yield filters
+
+    def iter_contig(self, start, stop):
+        source_field = self.fields["CHROM"]
+        lookup = {
+            contig.id: index for index, contig in enumerate(self.metadata.contigs)
+        }
+
+        for value in source_field.iter_values(start, stop):
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            yield lookup[value[0]]
+
+    def iter_field(self, field_name, shape, start, stop):
+        source_field = self.fields[field_name]
+        sanitiser = source_field.sanitiser_factory(shape)
+        for value in source_field.iter_values(start, stop):
+            yield sanitiser(value)
+
+    def iter_alleles(self, start, stop, num_alleles):
+        ref_field = self.fields["REF"]
+        alt_field = self.fields["ALT"]
+
+        for ref, alt in zip(
+            ref_field.iter_values(start, stop),
+            alt_field.iter_values(start, stop),
+        ):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles[0] = ref[0]
+            alleles[1 : 1 + len(alt)] = alt
+            yield alleles
+
+    def iter_genotypes(self, shape, start, stop):
+        source_field = self.fields["FORMAT/GT"]
+        for value in source_field.iter_values(start, stop):
+            genotypes = value[:, :-1] if value is not None else None
+            phased = value[:, -1] if value is not None else None
+            sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
+            sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
+            # Force haploids to always be phased
+            # https://github.com/sgkit-dev/bio2zarr/issues/399
+            if sanitised_genotypes.shape[1] == 1:
+                sanitised_phased[:] = True
+            yield sanitised_genotypes, sanitised_phased
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
+        if self.gt_field is None or shape is None:
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                yield vcz.VariantData(variant_length, alleles, None, None)
+        else:
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
+                self.iter_alleles(start, stop, num_alleles),
+                self.iter_genotypes(shape, start, stop),
+            ):
+                yield vcz.VariantData(variant_length, alleles, gt, phased)
+
+    def generate_schema(
+        self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
+    ):
+        if local_alleles is None:
+            local_alleles = False
+
+        max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
+
+        # Add ploidy and genotypes dimensions only when needed
+        max_genotypes = 0
+        for field in self.metadata.format_fields:
+            if field.vcf_number == "G":
+                max_genotypes = max(max_genotypes, field.summary.max_number)
+
+        ploidy = None
+        genotypes_size = None
+        if self.gt_field is not None:
+            ploidy = max(self.gt_field.summary.max_number - 1, 1)
+            # NOTE: it's not clear why we're computing this, when we must have had
+            # at least one number=G field to require it anyway?
+            genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
+            # assert max_genotypes == genotypes_size
+        else:
+            if max_genotypes > 0:
+                # there is no GT field, but there is at least one Number=G field,
+                # so we need to define the genotypes dimension
+                genotypes_size = max_genotypes
+
+        dimensions = vcz.standard_dimensions(
+            variants_size=self.num_records,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=self.num_samples,
+            samples_chunk_size=samples_chunk_size,
+            alleles_size=max_alleles,
+            filters_size=self.metadata.num_filters,
+            ploidy_size=ploidy,
+            genotypes_size=genotypes_size,
+        )
+
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+
+        def spec_from_field(field, array_name=None):
+            return vcz.ZarrArraySpec.from_field(
+                field,
+                schema_instance,
+                array_name=array_name,
+            )
+
+        def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
+                source=source,
+                name=name,
+                dtype=dtype,
+                description="",
+                dimensions=dimensions,
+                compressor=compressor,
+            )
+
+        name_map = {field.full_name: field for field in self.metadata.fields}
+        array_specs = [
+            fixed_field_spec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, self.metadata.num_contigs),
+            ),
+            fixed_field_spec(
+                name="variant_filter",
+                dtype="bool",
+                dimensions=["variants", "filters"],
+            ),
+            fixed_field_spec(
+                name="variant_allele",
+                dtype="O",
+                dimensions=["variants", "alleles"],
+            ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
+            fixed_field_spec(
+                name="variant_id",
+                dtype="O",
+            ),
+            fixed_field_spec(
+                name="variant_id_mask",
+                dtype="bool",
+            ),
+        ]
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        array_specs.extend(
+            [
+                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
+                spec_from_field(name_map["POS"], array_name="variant_position"),
+            ]
+        )
+        array_specs.extend(
+            [spec_from_field(field) for field in self.metadata.info_fields]
+        )
+
+        for field in self.metadata.format_fields:
+            if field.name == "GT":
+                continue
+            array_specs.append(spec_from_field(field))
+
+        if self.gt_field is not None and self.num_samples > 0:
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_phased",
+                    dtype="bool",
+                    dimensions=["variants", "samples"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype",
+                    dtype=self.gt_field.smallest_dtype(),
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_mask",
+                    dtype="bool",
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+
+        if local_alleles:
+            array_specs = convert_local_allele_field_types(array_specs, schema_instance)
+
+        schema_instance.fields = array_specs
+        return schema_instance
+
 
 @dataclasses.dataclass
 class IcfPartitionMetadata(core.JsonDataclass):
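The genotypes dimension in generate_schema uses the standard multiset count: for n alleles at ploidy p there are C(n + p - 1, p) unordered genotypes, which is what the math.comb call above computes. A quick check of the formula:

    import math

    def genotypes_size(max_alleles, ploidy):
        return math.comb(max_alleles + ploidy - 1, ploidy)

    assert genotypes_size(2, 2) == 3  # biallelic diploid: 0/0, 0/1, 1/1
    assert genotypes_size(3, 2) == 6  # REF plus two ALTs, diploid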
@@ -973,7 +1303,7 @@ class IntermediateColumnarFormatWriter:
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=1,
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1079,9 +1409,9 @@ class IntermediateColumnarFormatWriter:
             self.path,
             partition_index,
         ) as tcw:
-            with vcf_utils.IndexedVcf(partition.vcf_path) as ivcf:
+            with vcf_utils.VcfFile(partition.vcf_path) as vcf:
                 num_records = 0
-                for variant in ivcf.variants(partition.region):
+                for variant in vcf.variants(partition.region):
                     num_records += 1
                     last_position = variant.POS
                     tcw.append("CHROM", variant.CHROM)
@@ -1125,7 +1455,9 @@ class IntermediateColumnarFormatWriter:
             f"{num_records} records last_pos={last_position}"
         )
 
-    def explode(self, *, worker_processes=1, show_progress=False):
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1193,7 +1525,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1218,7 +1550,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1241,3 +1573,167 @@ def explode_partition(icf_path, partition):
 def explode_finalise(icf_path):
     writer = IntermediateColumnarFormatWriter(icf_path)
     writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
+    if not path.exists():
+        raise ValueError(f"Path not found: {path}")
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    # NOTE: this is too strict, we should support more general Zarrs, see #276
+    elif (path / ".zmetadata").exists():
+        obj = vcz.VcfZarr(path)
+    else:
+        raise ValueError(f"{path} not in ICF or VCF Zarr format")
+    return obj.summary_table()
+
+
+def mkschema(
+    if_path,
+    out,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+):
+    store = IntermediateColumnarFormat(if_path)
+    spec = store.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
+    out.write(spec.asjson())
+
+
+def convert(
+    vcfs,
+    vcz_path,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    local_alleles=None,
+    show_progress=False,
+    icf_path=None,
+):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
+    if icf_path is None:
+        cm = temp_icf_path(prefix="vcf2zarr")
+    else:
+        cm = contextlib.nullcontext(icf_path)
+
+    with cm as icf_path:
+        explode(
+            icf_path,
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+        encode(
+            icf_path,
+            vcz_path,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            local_alleles=local_alleles,
+        )
+
+
+@contextlib.contextmanager
+def temp_icf_path(prefix=None):
+    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
+        yield pathlib.Path(tmp) / "icf"
+
+
+def encode(
+    icf_path,
+    zarr_path,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    local_alleles=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+    vzw.create_index()
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    icf_store = IntermediateColumnarFormat(icf_path)
+    if schema_path is None:
+        schema_instance = icf_store.generate_schema(
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            local_alleles=local_alleles,
+        )
+    else:
+        logger.info(f"Reading schema from {schema_path}")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
+        with open(schema_path) as f:
+            schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    vzw = vcz.VcfZarrWriter("icf", zarr_path)
+    return vzw.init(
+        icf_store,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
+    )
+
+
+def encode_partition(zarr_path, partition):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.finalise(show_progress=show_progress)
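Taken together, the new module-level functions form a one-shot pipeline: explode the VCFs to the intermediate columnar format (in a temporary directory unless icf_path is given), then encode that to a VCF Zarr store. A hedged usage sketch; the import path is an assumption based on the relative-import change at the top of this diff, and the file names are hypothetical:

    from bio2zarr import icf  # assumed module path

    icf.convert(
        ["chr22.vcf.gz"],  # indexed input VCF(s)
        "chr22.vcz",       # output VCF Zarr store
        worker_processes=4,
        show_progress=True,
    )
    print(icf.inspect("chr22.vcz"))  # summary table of the encoded arrays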