bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the public package registry, and is provided for informational purposes only.

Potentially problematic release.

bio2zarr/vcf.py CHANGED
@@ -40,6 +40,17 @@ FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
 )


+def display_number(x):
+    ret = "n/a"
+    if math.isfinite(x):
+        ret = f"{x: 0.2g}"
+    return ret
+
+
+def display_size(n):
+    return humanfriendly.format_size(n, binary=True)
+
+
 @dataclasses.dataclass
 class VcfFieldSummary:
     num_chunks: int = 0
@@ -53,11 +64,18 @@ class VcfFieldSummary:
     def update(self, other):
         self.num_chunks += other.num_chunks
         self.compressed_size += other.compressed_size
-        self.uncompressed_size = other.uncompressed_size
+        self.uncompressed_size += other.uncompressed_size
         self.max_number = max(self.max_number, other.max_number)
         self.min_value = min(self.min_value, other.min_value)
         self.max_value = max(self.max_value, other.max_value)

+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @staticmethod
+    def fromdict(d):
+        return VcfFieldSummary(**d)
+

 @dataclasses.dataclass
 class VcfField:
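The += fix above matters because uncompressed_size is accumulated across chunks, and the new asdict/fromdict pair is what lets these summaries round-trip through the per-partition JSON files introduced later in this diff. A minimal, self-contained sketch of that merge-and-round-trip behaviour (a cut-down stand-in for the real class, with only two fields and invented numbers):

import dataclasses
import json


@dataclasses.dataclass
class FieldSummary:
    # Simplified stand-in for VcfFieldSummary: just the additive counters.
    num_chunks: int = 0
    uncompressed_size: int = 0

    def update(self, other):
        self.num_chunks += other.num_chunks
        self.uncompressed_size += other.uncompressed_size  # accumulate, not overwrite

    def asdict(self):
        return dataclasses.asdict(self)

    @staticmethod
    def fromdict(d):
        return FieldSummary(**d)


total = FieldSummary()
for partition_summary in [FieldSummary(2, 100), FieldSummary(3, 250)]:
    total.update(partition_summary)
restored = FieldSummary.fromdict(json.loads(json.dumps(total.asdict())))
assert (restored.num_chunks, restored.uncompressed_size) == (5, 350)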
@@ -131,9 +149,14 @@ class VcfPartition:
     num_records: int = -1


+ICF_METADATA_FORMAT_VERSION = "0.2"
+ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
+    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+).get_config()
+
+
 @dataclasses.dataclass
-class VcfMetadata:
-    format_version: str
+class IcfMetadata:
     samples: list
     contig_names: list
     contig_record_counts: dict
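ICF_DEFAULT_COMPRESSOR is stored as a numcodecs config dict rather than a live codec object, which keeps the intermediate format's metadata JSON-serialisable; numcodecs.get_codec rebuilds an equivalent codec from that dict when the data is written or read back (the same pattern IcfPartitionWriter and IntermediateColumnarFormat use later in this diff). A small illustrative round trip:

import numcodecs

config = numcodecs.Blosc(
    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
).get_config()
# config is a plain dict (codec id, cname, clevel, shuffle, ...), so it can be
# written into metadata.json as-is and reconstructed later.
codec = numcodecs.get_codec(config)
payload = bytes(1024)
assert bytes(codec.decode(codec.encode(payload))) == payload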
@@ -141,6 +164,10 @@ class VcfMetadata:
     fields: list
     partitions: list = None
     contig_lengths: list = None
+    format_version: str = None
+    compressor: dict = None
+    column_chunk_size: int = None
+    provenance: dict = None

     @property
     def info_fields(self):
@@ -164,12 +191,19 @@ class VcfMetadata:

     @staticmethod
     def fromdict(d):
+        if d["format_version"] != ICF_METADATA_FORMAT_VERSION:
+            raise ValueError(
+                "Intermediate columnar metadata format version mismatch: "
+                f"{d['format_version']} != {ICF_METADATA_FORMAT_VERSION}"
+            )
         fields = [VcfField.fromdict(fd) for fd in d["fields"]]
         partitions = [VcfPartition(**pd) for pd in d["partitions"]]
+        for p in partitions:
+            p.region = vcf_utils.Region(**p.region)
         d = d.copy()
         d["fields"] = fields
         d["partitions"] = partitions
-        return VcfMetadata(**d)
+        return IcfMetadata(**d)

     def asdict(self):
         return dataclasses.asdict(self)
@@ -220,16 +254,13 @@ def scan_vcf(path, target_num_partitions):
                     field.vcf_number = "."
                 fields.append(field)

-        metadata = VcfMetadata(
+        metadata = IcfMetadata(
             samples=vcf.samples,
             contig_names=vcf.seqnames,
             contig_record_counts=indexed_vcf.contig_record_counts(),
             filters=filters,
-            # TODO use the mapping dictionary
             fields=fields,
             partitions=[],
-            # FIXME do something systematic with this
-            format_version="0.1",
         )
         try:
             metadata.contig_lengths = vcf.seqlens
@@ -243,6 +274,8 @@ def scan_vcf(path, target_num_partitions):
         for region in regions:
             metadata.partitions.append(
                 VcfPartition(
+                    # TODO should this be fully resolving the path? Otherwise it's all
+                    # relative to the original WD
                     vcf_path=str(path),
                     region=region,
                 )
@@ -251,8 +284,19 @@
         return metadata, vcf.raw_header


-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
-    logger.info(f"Scanning {len(paths)} VCFs")
+def scan_vcfs(
+    paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
+):
+    logger.info(
+        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
+    )
+    # An easy mistake to make is to pass the same file twice. Check this early on.
+    for path, count in collections.Counter(paths).items():
+        if not path.exists():  # NEEDS TEST
+            raise FileNotFoundError(path)
+        if count > 1:
+            raise ValueError(f"Duplicate path provided: {path}")
+
     progress_config = core.ProgressConfig(
         total=len(paths),
         units="files",
@@ -261,7 +305,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     )
     with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
         for path in paths:
-            pwm.submit(scan_vcf, path, target_num_partitions)
+            pwm.submit(scan_vcf, path, max(1, target_num_partitions // len(paths)))
         results = list(pwm.results_as_completed())

     # Sort to make the ordering deterministic
@@ -276,12 +320,12 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
         contig_record_counts += metadata.contig_record_counts
         metadata.contig_record_counts.clear()

-    vcf_metadata, header = results[0]
+    icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
-        if metadata != vcf_metadata:
+        if metadata != icf_metadata:
             raise ValueError("Incompatible VCF chunks")

-    vcf_metadata.contig_record_counts = dict(contig_record_counts)
+    icf_metadata.contig_record_counts = dict(contig_record_counts)

     # Sort by contig (in the order they appear in the header) first,
     # then by start coordinate
@@ -289,8 +333,15 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
-    vcf_metadata.partitions = all_partitions
-    return vcf_metadata, header
+    icf_metadata.partitions = all_partitions
+    icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
+    icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
+    icf_metadata.column_chunk_size = column_chunk_size
+    # Bare minimum here for provenance - would be nice to include versions of key
+    # dependencies as well.
+    icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
+    logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
+    return icf_metadata, header


 def sanitise_value_bool(buff, j, value):
@@ -385,7 +436,7 @@ def sanitise_value_float_2d(buff, j, value):

 def sanitise_int_array(value, ndmin, dtype):
     if isinstance(value, tuple):
-        value = [VCF_INT_MISSING if x is None else x for x in value]
+        value = [VCF_INT_MISSING if x is None else x for x in value]  # NEEDS TEST
     value = np.array(value, ndmin=ndmin, copy=False)
     value[value == VCF_INT_MISSING] = -1
     value[value == VCF_INT_FILL] = -2
@@ -428,7 +479,7 @@ missing_value_map = {
 class VcfValueTransformer:
     """
     Transform VCF values into the stored intermediate format used
-    in the PickleChunkedVcf, and update field summaries.
+    in the IntermediateColumnarFormat, and update field summaries.
     """

     def __init__(self, field, num_samples):
@@ -516,29 +567,29 @@ class StringValueTransformer(VcfValueTransformer):
 class SplitStringValueTransformer(StringValueTransformer):
     def transform(self, vcf_value):
         if vcf_value is None:
-            return self.missing_value
+            return self.missing_value  # NEEDS TEST
         assert self.dimension == 1
         return np.array(vcf_value, ndmin=1, dtype="str")


-class PickleChunkedVcfField:
-    def __init__(self, pcvcf, vcf_field):
+def get_vcf_field_path(base_path, vcf_field):
+    if vcf_field.category == "fixed":
+        return base_path / vcf_field.name
+    return base_path / vcf_field.category / vcf_field.name
+
+
+class IntermediateColumnarFormatField:
+    def __init__(self, icf, vcf_field):
         self.vcf_field = vcf_field
-        self.path = self.get_path(pcvcf.path, vcf_field)
-        self.compressor = pcvcf.compressor
-        self.num_partitions = pcvcf.num_partitions
-        self.num_records = pcvcf.num_records
-        self.partition_record_index = pcvcf.partition_record_index
+        self.path = get_vcf_field_path(icf.path, vcf_field)
+        self.compressor = icf.compressor
+        self.num_partitions = icf.num_partitions
+        self.num_records = icf.num_records
+        self.partition_record_index = icf.partition_record_index
         # A map of partition id to the cumulative number of records
         # in chunks within that partition
         self._chunk_record_index = {}

-    @staticmethod
-    def get_path(base_path, vcf_field):
-        if vcf_field.category == "fixed":
-            return base_path / vcf_field.name
-        return base_path / vcf_field.category / vcf_field.name
-
     @property
     def name(self):
         return self.vcf_field.full_name
@@ -549,17 +600,17 @@ class PickleChunkedVcfField:
     def __repr__(self):
         partition_chunks = [self.num_chunks(j) for j in range(self.num_partitions)]
         return (
-            f"PickleChunkedVcfField(name={self.name}, "
+            f"IntermediateColumnarFormatField(name={self.name}, "
             f"partition_chunks={partition_chunks}, "
             f"path={self.path})"
         )

     def num_chunks(self, partition_id):
-        return len(self.chunk_cumulative_records(partition_id))
+        return len(self.chunk_record_index(partition_id)) - 1

     def chunk_record_index(self, partition_id):
         if partition_id not in self._chunk_record_index:
-            index_path = self.partition_path(partition_id) / "chunk_index.pkl"
+            index_path = self.partition_path(partition_id) / "chunk_index"
             with open(index_path, "rb") as f:
                 a = pickle.load(f)
                 assert len(a) > 1
@@ -567,22 +618,27 @@ class PickleChunkedVcfField:
            self._chunk_record_index[partition_id] = a
        return self._chunk_record_index[partition_id]

-    def chunk_cumulative_records(self, partition_id):
-        return self.chunk_record_index(partition_id)[1:]
-
-    def chunk_num_records(self, partition_id):
-        return np.diff(self.chunk_cumulative_records(partition_id))
-
-    def chunk_files(self, partition_id, start=0):
-        partition_path = self.partition_path(partition_id)
-        for n in self.chunk_cumulative_records(partition_id)[start:]:
-            yield partition_path / f"{n}.pkl"
-
     def read_chunk(self, path):
         with open(path, "rb") as f:
             pkl = self.compressor.decode(f.read())
         return pickle.loads(pkl)

+    def chunk_num_records(self, partition_id):
+        return np.diff(self.chunk_record_index(partition_id))
+
+    def chunks(self, partition_id, start_chunk=0):
+        partition_path = self.partition_path(partition_id)
+        chunk_cumulative_records = self.chunk_record_index(partition_id)
+        chunk_num_records = np.diff(chunk_cumulative_records)
+        for count, cumulative in zip(
+            chunk_num_records[start_chunk:], chunk_cumulative_records[start_chunk + 1 :]
+        ):
+            path = partition_path / f"{cumulative}"
+            chunk = self.read_chunk(path)
+            if len(chunk) != count:
+                raise ValueError(f"Corruption detected in chunk: {path}")
+            yield chunk
+
     def iter_values(self, start=None, stop=None):
         start = 0 if start is None else start
         stop = self.num_records if stop is None else stop
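The renamed chunk files and the "chunk_index" file work together: the index is a cumulative record count (starting at 0, with one entry per chunk boundary), and each chunk file is named after its cumulative count, which is why num_chunks is len(index) - 1 and why the chunks generator can verify each chunk's length against np.diff of the index. A small illustration with invented numbers:

import numpy as np

# Hypothetical chunk_index for one partition of one field.
chunk_record_index = np.array([0, 1000, 2000, 2500])

num_chunks = len(chunk_record_index) - 1          # 3
chunk_num_records = np.diff(chunk_record_index)   # [1000, 1000, 500]
# The corresponding chunk files would be named "1000", "2000" and "2500".
chunk_names = [str(c) for c in chunk_record_index[1:]]

assert num_chunks == 3
assert list(chunk_num_records) == [1000, 1000, 500]
assert chunk_names == ["1000", "2000", "2500"]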
@@ -603,9 +659,7 @@ class PickleChunkedVcfField:
             f"Read {self.vcf_field.full_name} slice [{start}:{stop}]:"
             f"p_start={start_partition}, c_start={start_chunk}, r_start={record_id}"
         )
-
-        for chunk_path in self.chunk_files(start_partition, start_chunk):
-            chunk = self.read_chunk(chunk_path)
+        for chunk in self.chunks(start_partition, start_chunk):
            for record in chunk:
                if record_id == stop:
                    return
@@ -614,8 +668,7 @@
                record_id += 1
        assert record_id > start
        for partition_id in range(start_partition + 1, self.num_partitions):
-            for chunk_path in self.chunk_files(partition_id):
-                chunk = self.read_chunk(chunk_path)
+            for chunk in self.chunks(partition_id):
                for record in chunk:
                    if record_id == stop:
                        return
@@ -629,15 +682,11 @@
        ret = [None] * self.num_records
        j = 0
        for partition_id in range(self.num_partitions):
-            for chunk_path in self.chunk_files(partition_id):
-                chunk = self.read_chunk(chunk_path)
+            for chunk in self.chunks(partition_id):
                for record in chunk:
                    ret[j] = record
                    j += 1
-        if j != self.num_records:
-            raise ValueError(
-                f"Corruption detected: incorrect number of records in {str(self.path)}."
-            )
+        assert j == self.num_records
        return ret

     def sanitiser_factory(self, shape):
@@ -674,7 +723,7 @@


 @dataclasses.dataclass
-class PcvcfFieldWriter:
+class IcfFieldWriter:
     vcf_field: VcfField
     path: pathlib.Path
     transformer: VcfValueTransformer
@@ -704,7 +753,7 @@ class PcvcfFieldWriter:
     def write_chunk(self):
         # Update index
         self.chunk_index.append(self.num_records)
-        path = self.path / f"{self.num_records}.pkl"
+        path = self.path / f"{self.num_records}"
         logger.debug(f"Start write: {path}")
         pkl = pickle.dumps(self.buff)
         compressed = self.compressor.encode(pkl)
@@ -723,37 +772,35 @@ class PcvcfFieldWriter:
         )
         if len(self.buff) > 0:
             self.write_chunk()
-        with open(self.path / "chunk_index.pkl", "wb") as f:
+        with open(self.path / "chunk_index", "wb") as f:
            a = np.array(self.chunk_index, dtype=int)
            pickle.dump(a, f)


-class PcvcfPartitionWriter(contextlib.AbstractContextManager):
+class IcfPartitionWriter(contextlib.AbstractContextManager):
     """
-    Writes the data for a PickleChunkedVcf partition.
+    Writes the data for a IntermediateColumnarFormat partition.
     """

     def __init__(
         self,
-        vcf_metadata,
+        icf_metadata,
         out_path,
         partition_index,
-        compressor,
-        *,
-        chunk_size=1,
     ):
         self.partition_index = partition_index
         # chunk_size is in megabytes
-        max_buffered_bytes = chunk_size * 2**20
+        max_buffered_bytes = icf_metadata.column_chunk_size * 2**20
         assert max_buffered_bytes > 0
+        compressor = numcodecs.get_codec(icf_metadata.compressor)

         self.field_writers = {}
-        num_samples = len(vcf_metadata.samples)
-        for vcf_field in vcf_metadata.fields:
-            field_path = PickleChunkedVcfField.get_path(out_path, vcf_field)
+        num_samples = len(icf_metadata.samples)
+        for vcf_field in icf_metadata.fields:
+            field_path = get_vcf_field_path(out_path, vcf_field)
            field_partition_path = field_path / f"p{partition_index}"
            transformer = VcfValueTransformer.factory(vcf_field, num_samples)
-            self.field_writers[vcf_field.full_name] = PcvcfFieldWriter(
+            self.field_writers[vcf_field.full_name] = IcfFieldWriter(
                vcf_field,
                field_partition_path,
                transformer,
@@ -777,16 +824,23 @@ class PcvcfPartitionWriter(contextlib.AbstractContextManager):
         return False


-class PickleChunkedVcf(collections.abc.Mapping):
+# TODO rename to IntermediateColumnarFormat and move to icf.py
+
+
+class IntermediateColumnarFormat(collections.abc.Mapping):
     # TODO Check if other compressors would give reasonable compression
     # with significantly faster times
-    DEFAULT_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)

-    def __init__(self, path, metadata, vcf_header):
-        self.path = path
-        self.metadata = metadata
-        self.vcf_header = vcf_header
-        self.compressor = self.DEFAULT_COMPRESSOR
+    def __init__(self, path):
+        self.path = pathlib.Path(path)
+        # TODO raise a more informative error here telling people this
+        # directory is either a WIP or the wrong format.
+        with open(self.path / "metadata.json") as f:
+            self.metadata = IcfMetadata.fromdict(json.load(f))
+        with open(self.path / "header.txt") as f:
+            self.vcf_header = f.read()
+
+        self.compressor = numcodecs.get_codec(self.metadata.compressor)
         self.columns = {}
         partition_num_records = [
             partition.num_records for partition in self.metadata.partitions
@@ -794,11 +848,15 @@ class PickleChunkedVcf(collections.abc.Mapping):
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0] + partition_num_records)
         for field in self.metadata.fields:
-            self.columns[field.full_name] = PickleChunkedVcfField(self, field)
+            self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
+        logger.info(
+            f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
+            f"records={self.num_records}, columns={self.num_columns})"
+        )

     def __repr__(self):
         return (
-            f"PickleChunkedVcf(fields={len(self)}, partitions={self.num_partitions}, "
+            f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )

@@ -812,15 +870,6 @@ class PickleChunkedVcf(collections.abc.Mapping):
         return len(self.columns)

     def summary_table(self):
-        def display_number(x):
-            ret = "n/a"
-            if math.isfinite(x):
-                ret = f"{x: 0.2g}"
-            return ret
-
-        def display_size(n):
-            return humanfriendly.format_size(n)
-
         data = []
         for name, col in self.columns.items():
             summary = col.vcf_field.summary
@@ -838,14 +887,6 @@ class PickleChunkedVcf(collections.abc.Mapping):
             data.append(d)
         return data

-    @functools.cached_property
-    def total_uncompressed_bytes(self):
-        total = 0
-        for col in self.columns.values():
-            summary = col.vcf_field.summary
-            total += summary.uncompressed_size
-        return total
-
     @functools.cached_property
     def num_records(self):
         return sum(self.metadata.contig_record_counts.values())
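partition_record_index, built in the constructor above as np.cumsum([0] + partition_num_records), is what lets the column readers translate a global record index into a partition quickly. The sketch below shows why the cumulative form is convenient; whether the real code uses searchsorted exactly like this is an implementation detail, and the partition sizes here are invented:

import numpy as np

partition_num_records = [100, 250, 80]
partition_record_index = np.cumsum([0] + partition_num_records)
# partition_record_index == [0, 100, 350, 430]


def find_partition(record_id):
    # side="right" so that record 100 falls in partition 1, not partition 0.
    return np.searchsorted(partition_record_index, record_id, side="right") - 1


assert [find_partition(r) for r in (0, 99, 100, 429)] == [0, 0, 1, 2]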
@@ -862,57 +903,121 @@ class PickleChunkedVcf(collections.abc.Mapping):
     def num_columns(self):
         return len(self.columns)

-    def mkdirs(self):
-        self.path.mkdir()
-        for col in self.columns.values():
-            col.path.mkdir(parents=True)
-            for j in range(self.num_partitions):
-                part_path = col.path / f"p{j}"
-                part_path.mkdir()

-    @staticmethod
-    def load(path):
-        path = pathlib.Path(path)
-        with open(path / "metadata.json") as f:
-            metadata = VcfMetadata.fromdict(json.load(f))
-        with open(path / "header.txt") as f:
-            header = f.read()
-        pcvcf = PickleChunkedVcf(path, metadata, header)
-        logger.info(
-            f"Loaded PickleChunkedVcf(partitions={pcvcf.num_partitions}, "
-            f"records={pcvcf.num_records}, columns={pcvcf.num_columns})"
-        )
-        return pcvcf
+class IntermediateColumnarFormatWriter:
+    def __init__(self, path):
+        self.path = pathlib.Path(path)
+        self.wip_path = self.path / "wip"
+        self.metadata = None

-    @staticmethod
-    def convert_partition(
-        vcf_metadata,
-        partition_index,
-        out_path,
+    @property
+    def num_partitions(self):
+        return len(self.metadata.partitions)
+
+    def init(
+        self,
+        vcfs,
         *,
         column_chunk_size=16,
+        worker_processes=1,
+        target_num_partitions=None,
+        show_progress=False,
     ):
-        partition = vcf_metadata.partitions[partition_index]
+        if self.path.exists():
+            shutil.rmtree(self.path)
+        vcfs = [pathlib.Path(vcf) for vcf in vcfs]
+        target_num_partitions = max(target_num_partitions, len(vcfs))
+
+        # TODO move scan_vcfs into this class
+        icf_metadata, header = scan_vcfs(
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            target_num_partitions=target_num_partitions,
+            column_chunk_size=column_chunk_size,
+        )
+        self.metadata = icf_metadata
+
+        self.mkdirs()
+
+        # Note: this is needed for the current version of the vcfzarr spec, but it's
+        # probably goint to be dropped.
+        # https://github.com/pystatgen/vcf-zarr-spec/issues/15
+        # May be useful to keep lying around still though?
+        logger.info(f"Writing VCF header")
+        with open(self.path / "header.txt", "w") as f:
+            f.write(header)
+
+        logger.info(f"Writing WIP metadata")
+        with open(self.wip_path / "metadata.json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+        return self.num_partitions
+
+    def mkdirs(self):
+        # TODO add worker_processes here and do this with the ParallelWorkManager
+        logger.info(
+            f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
+        )
+        self.path.mkdir()
+        self.wip_path.mkdir()
+        for field in self.metadata.fields:
+            col_path = get_vcf_field_path(self.path, field)
+            logger.debug(f"Make directories for {field.full_name} at {col_path}")
+            col_path.mkdir(parents=True)
+            for j in range(self.num_partitions):
+                part_path = col_path / f"p{j}"
+                part_path.mkdir()
+
+    def load_partition_summaries(self):
+        summaries = []
+        not_found = []
+        for j in range(self.num_partitions):
+            try:
+                with open(self.wip_path / f"p{j}_summary.json") as f:
+                    summary = json.load(f)
+                    for k, v in summary["field_summaries"].items():
+                        summary["field_summaries"][k] = VcfFieldSummary.fromdict(v)
+                    summaries.append(summary)
+            except FileNotFoundError:
+                not_found.append(j)
+        if len(not_found) > 0:
+            raise FileNotFoundError(
+                f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+            )
+        return summaries
+
+    def load_metadata(self):
+        if self.metadata is None:
+            with open(self.wip_path / f"metadata.json") as f:
+                self.metadata = IcfMetadata.fromdict(json.load(f))
+
+    def process_partition(self, partition_index):
+        self.load_metadata()
+        summary_path = self.wip_path / f"p{partition_index}_summary.json"
+        # If someone is rewriting a summary path (for whatever reason), make sure it
+        # doesn't look like it's already been completed.
+        # NOTE to do this properly we probably need to take a lock on this file - but
+        # this simple approach will catch the vast majority of problems.
+        if summary_path.exists():
+            summary_path.unlink()
+
+        partition = self.metadata.partitions[partition_index]
         logger.info(
             f"Start p{partition_index} {partition.vcf_path}__{partition.region}"
         )
-        info_fields = vcf_metadata.info_fields
+        info_fields = self.metadata.info_fields
         format_fields = []
         has_gt = False
-        for field in vcf_metadata.format_fields:
+        for field in self.metadata.format_fields:
            if field.name == "GT":
                has_gt = True
            else:
                format_fields.append(field)

-        compressor = PickleChunkedVcf.DEFAULT_COMPRESSOR
-
-        with PcvcfPartitionWriter(
-            vcf_metadata,
-            out_path,
+        with IcfPartitionWriter(
+            self.metadata,
+            self.path,
             partition_index,
-            compressor,
-            chunk_size=column_chunk_size,
         ) as tcw:
             with vcf_utils.IndexedVcf(partition.vcf_path) as ivcf:
                 num_records = 0
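Instead of returning summaries through the worker pool as before, each partition now records its results in a small JSON file under the wip directory, which is what makes the explode step resumable and lets the finalise step stitch everything together afterwards. A hand-written illustration of the shape of one such file (the field name and all numbers are invented; the keys mirror VcfFieldSummary):

# Sketch of what wip/p0_summary.json might contain after process_partition(0).
partition_summary = {
    "num_records": 12345,
    "field_summaries": {
        "INFO/DP": {
            "num_chunks": 4,
            "compressed_size": 210_000,
            "uncompressed_size": 830_000,
            "max_number": 1,
            "min_value": 0,
            "max_value": 981,
        },
        # ...one entry per VCF field...
    },
}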
@@ -930,108 +1035,172 @@ class PickleChunkedVcf(collections.abc.Mapping):
                     if has_gt:
                         tcw.append("FORMAT/GT", variant.genotype.array())
                     for field in format_fields:
-                        val = None
-                        try:
-                            val = variant.format(field.name)
-                        except KeyError:
-                            pass
+                        val = variant.format(field.name)
                         tcw.append(field.full_name, val)
                     # Note: an issue with updating the progress per variant here like this
                     # is that we get a significant pause at the end of the counter while
                     # all the "small" fields get flushed. Possibly not much to be done about it.
                     core.update_progress(1)
+            logger.info(
+                f"Finished reading VCF for partition {partition_index}, flushing buffers"
+            )

+        partition_metadata = {
+            "num_records": num_records,
+            "field_summaries": {k: v.asdict() for k, v in tcw.field_summaries.items()},
+        }
+        with open(summary_path, "w") as f:
+            json.dump(partition_metadata, f, indent=4)
        logger.info(
            f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
            f"{num_records} records"
        )
-        return partition_index, tcw.field_summaries, num_records

-    @staticmethod
-    def convert(
-        vcfs, out_path, *, column_chunk_size=16, worker_processes=1, show_progress=False
+    def process_partition_slice(
+        self,
+        start,
+        stop,
+        *,
+        worker_processes=1,
+        show_progress=False,
     ):
-        out_path = pathlib.Path(out_path)
-        # TODO make scan work in parallel using general progress code too
-        target_num_partitions = max(1, worker_processes * 4)
-        vcf_metadata, header = scan_vcfs(
-            vcfs,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-            target_num_partitions=target_num_partitions,
-        )
-        pcvcf = PickleChunkedVcf(out_path, vcf_metadata, header)
-        pcvcf.mkdirs()
-
+        self.load_metadata()
+        if start == 0 and stop == self.num_partitions:
+            num_records = self.metadata.num_records
+        else:
+            # We only know the number of records if all partitions are done at once,
+            # and we signal this to tqdm by passing None as the total.
+            num_records = None
+        num_columns = len(self.metadata.fields)
+        num_samples = len(self.metadata.samples)
        logger.info(
-            f"Exploding {pcvcf.num_columns} columns {vcf_metadata.num_records} variants "
-            f"{pcvcf.num_samples} samples"
+            f"Exploding columns={num_columns} samples={num_samples}; "
+            f"partitions={stop - start} "
+            f"variants={'unknown' if num_records is None else num_records}"
        )
        progress_config = core.ProgressConfig(
-            total=vcf_metadata.num_records,
+            total=num_records,
            units="vars",
            title="Explode",
            show=show_progress,
        )
        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            for j, partition in enumerate(vcf_metadata.partitions):
-                pwm.submit(
-                    PickleChunkedVcf.convert_partition,
-                    vcf_metadata,
-                    j,
-                    out_path,
-                    column_chunk_size=column_chunk_size,
-                )
-            num_records = 0
-            partition_summaries = []
-            for index, summary, num_records in pwm.results_as_completed():
-                partition_summaries.append(summary)
-                vcf_metadata.partitions[index].num_records = num_records
-
-        total_records = sum(
-            partition.num_records for partition in vcf_metadata.partitions
+            for j in range(start, stop):
+                pwm.submit(self.process_partition, j)
+
+    def explode(self, *, worker_processes=1, show_progress=False):
+        self.load_metadata()
+        return self.process_partition_slice(
+            0,
+            self.num_partitions,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
        )
-        assert total_records == pcvcf.num_records

-        for field in vcf_metadata.fields:
-            # Clear the summary to avoid problems when running in debug
-            # syncronous mode
-            field.summary = VcfFieldSummary()
+    def explode_partition(self, partition, *, show_progress=False, worker_processes=1):
+        self.load_metadata()
+        if partition < 0 or partition >= self.num_partitions:
+            raise ValueError(
+                "Partition index must be in the range 0 <= index < num_partitions"
+            )
+        return self.process_partition_slice(
+            partition,
+            partition + 1,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+
+    def finalise(self):
+        self.load_metadata()
+        partition_summaries = self.load_partition_summaries()
+        total_records = 0
+        for index, summary in enumerate(partition_summaries):
+            partition_records = summary["num_records"]
+            self.metadata.partitions[index].num_records = partition_records
+            total_records += partition_records
+        assert total_records == self.metadata.num_records
+
+        for field in self.metadata.fields:
            for summary in partition_summaries:
-                field.summary.update(summary[field.full_name])
+                field.summary.update(summary["field_summaries"][field.full_name])

-        with open(out_path / "metadata.json", "w") as f:
-            json.dump(vcf_metadata.asdict(), f, indent=4)
-        with open(out_path / "header.txt", "w") as f:
-            f.write(header)
+        logger.info(f"Finalising metadata")
+        with open(self.path / "metadata.json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+
+        logger.debug(f"Removing WIP directory")
+        shutil.rmtree(self.wip_path)


 def explode(
     vcfs,
-    out_path,
+    icf_path,
     *,
     column_chunk_size=16,
     worker_processes=1,
     show_progress=False,
 ):
-    out_path = pathlib.Path(out_path)
-    if out_path.exists():
-        shutil.rmtree(out_path)
-
-    PickleChunkedVcf.convert(
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    num_partitions = writer.init(
        vcfs,
-        out_path,
+        # Heuristic to get reasonable worker utilisation with lumpy partition sizing
+        target_num_partitions=max(1, worker_processes * 4),
+        worker_processes=worker_processes,
+        show_progress=show_progress,
        column_chunk_size=column_chunk_size,
+    )
+    writer.explode(worker_processes=worker_processes, show_progress=show_progress)
+    writer.finalise()
+    return IntermediateColumnarFormat(icf_path)
+
+
+def explode_init(
+    icf_path,
+    vcfs,
+    *,
+    column_chunk_size=16,
+    target_num_partitions=1,
+    worker_processes=1,
+    show_progress=False,
+):
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    return writer.init(
+        vcfs,
+        target_num_partitions=target_num_partitions,
        worker_processes=worker_processes,
        show_progress=show_progress,
+        column_chunk_size=column_chunk_size,
    )
-    return PickleChunkedVcf.load(out_path)


-def inspect(if_path):
+# NOTE only including worker_processes here so we can use the 0 option to get the
+# work done syncronously and so we can get test coverage on it. Should find a
+# better way to do this.
+def explode_partition(icf_path, partition, *, show_progress=False, worker_processes=1):
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    writer.explode_partition(
+        partition, show_progress=show_progress, worker_processes=worker_processes
+    )
+
+
+def explode_finalise(icf_path):
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
     # TODO add support for the Zarr format also
-    pcvcf = PickleChunkedVcf.load(if_path)
-    return pcvcf.summary_table()
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    elif (path / ".zmetadata").exists():
+        obj = VcfZarr(path)
+    else:
+        raise ValueError("Format not recognised")  # NEEDS TEST
+    return obj.summary_table()
+
+
+DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)


 @dataclasses.dataclass
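The split of explode into explode_init, explode_partition and explode_finalise is what allows the per-partition work to be farmed out, for example to separate cluster jobs, while the one-shot explode keeps the old behaviour. A minimal sketch of how the new entry points might be driven; the function names come from this diff, but the input and output paths are hypothetical:

from bio2zarr import vcf

# One-shot form: scan, explode all partitions and finalise in a single call.
icf = vcf.explode(["chr20.vcf.gz"], "chr20.icf", worker_processes=4)

# Distributed form: the same work split into independently runnable steps.
num_partitions = vcf.explode_init(
    "chr20.icf", ["chr20.vcf.gz"], target_num_partitions=16
)
for j in range(num_partitions):  # each call could run as a separate job
    vcf.explode_partition("chr20.icf", j)
vcf.explode_finalise("chr20.icf")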
@@ -1043,20 +1212,46 @@ class ZarrColumnSpec:
     dimensions: list
     description: str
     vcf_field: str
-    compressor: dict
+    compressor: dict = None
+    filters: list = None
     # TODO add filters

     def __post_init__(self):
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
+        self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
+        self.filters = []
+        self._choose_compressor_settings()
+
+    def _choose_compressor_settings(self):
+        """
+        Choose compressor and filter settings based on the size and
+        type of the array, plus some hueristics from observed properties
+        of VCFs.
+
+        See https://github.com/pystatgen/bio2zarr/discussions/74
+        """
+        dt = np.dtype(self.dtype)
+        # Default is to not shuffle, because autoshuffle isn't recognised
+        # by many Zarr implementations, and shuffling can lead to worse
+        # performance in some cases anyway. Turning on shuffle should be a
+        # deliberate choice.
+        shuffle = numcodecs.Blosc.NOSHUFFLE
+        if dt.itemsize == 1:
+            # Any 1 byte field gets BITSHUFFLE by default
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+        self.compressor["shuffle"] = shuffle
+
+
+ZARR_SCHEMA_FORMAT_VERSION = "0.2"


 @dataclasses.dataclass
-class ZarrConversionSpec:
+class VcfZarrSchema:
     format_version: str
-    chunk_width: int
-    chunk_length: int
+    samples_chunk_size: int
+    variants_chunk_size: int
     dimensions: list
     sample_id: list
     contig_id: list
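The compressor heuristic only toggles the Blosc shuffle filter on the default zstd config: single-byte dtypes get bit-shuffle, everything else is left unshuffled. A standalone restatement of that rule (not the class itself; DEFAULT_ZARR_COMPRESSOR is the zstd default defined earlier in this diff):

import numcodecs
import numpy as np

DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)


def choose_compressor_config(dtype):
    config = DEFAULT_ZARR_COMPRESSOR.get_config()
    shuffle = numcodecs.Blosc.NOSHUFFLE
    if np.dtype(dtype).itemsize == 1:
        # 1-byte fields (e.g. genotype calls, bools) get BITSHUFFLE.
        shuffle = numcodecs.Blosc.BITSHUFFLE
    config["shuffle"] = shuffle
    return config


assert choose_compressor_config("i1")["shuffle"] == numcodecs.Blosc.BITSHUFFLE
assert choose_compressor_config("i4")["shuffle"] == numcodecs.Blosc.NOSHUFFLE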
@@ -1072,7 +1267,12 @@ class ZarrConversionSpec:

     @staticmethod
     def fromdict(d):
-        ret = ZarrConversionSpec(**d)
+        if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION:
+            raise ValueError(
+                "Zarr schema format version mismatch: "
+                f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
+            )
+        ret = VcfZarrSchema(**d)
         ret.columns = {
             key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
         }
@@ -1080,19 +1280,20 @@ class ZarrConversionSpec:

     @staticmethod
     def fromjson(s):
-        return ZarrConversionSpec.fromdict(json.loads(s))
+        return VcfZarrSchema.fromdict(json.loads(s))

     @staticmethod
-    def generate(pcvcf, chunk_length=None, chunk_width=None):
-        m = pcvcf.num_records
-        n = pcvcf.num_samples
+    def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
+        m = icf.num_records
+        n = icf.num_samples
         # FIXME
-        if chunk_width is None:
-            chunk_width = 1000
-        if chunk_length is None:
-            chunk_length = 10_000
-        logger.info(f"Generating schema with chunks={chunk_length, chunk_width}")
-        compressor = core.default_compressor.get_config()
+        if samples_chunk_size is None:
+            samples_chunk_size = 1000
+        if variants_chunk_size is None:
+            variants_chunk_size = 10_000
+        logger.info(
+            f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
+        )

         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1104,13 +1305,12 @@ class ZarrConversionSpec:
                 shape=shape,
                 description="",
                 dimensions=dimensions,
-                chunks=[chunk_length],
-                compressor=compressor,
+                chunks=[variants_chunk_size],
             )

-        alt_col = pcvcf.columns["ALT"]
+        alt_col = icf.columns["ALT"]
         max_alleles = alt_col.vcf_field.summary.max_number + 1
-        num_filters = len(pcvcf.metadata.filters)
+        num_filters = len(icf.metadata.filters)

         # # FIXME get dtype from lookup table
         colspecs = [
@@ -1153,7 +1353,7 @@ class ZarrConversionSpec:
         ]

         gt_field = None
-        for field in pcvcf.metadata.fields:
+        for field in icf.metadata.fields:
             if field.category == "fixed":
                 continue
             if field.name == "GT":
@@ -1162,11 +1362,11 @@
             shape = [m]
             prefix = "variant_"
             dimensions = ["variants"]
-            chunks = [chunk_length]
+            chunks = [variants_chunk_size]
             if field.category == "FORMAT":
                 prefix = "call_"
                 shape.append(n)
-                chunks.append(chunk_width),
+                chunks.append(samples_chunk_size),
                 dimensions.append("samples")
             # TODO make an option to add in the empty extra dimension
             if field.summary.max_number > 1:
@@ -1181,14 +1381,13 @@
                 chunks=chunks,
                 dimensions=dimensions,
                 description=field.description,
-                compressor=compressor,
             )
             colspecs.append(colspec)

         if gt_field is not None:
             ploidy = gt_field.summary.max_number - 1
             shape = [m, n]
-            chunks = [chunk_length, chunk_width]
+            chunks = [variants_chunk_size, samples_chunk_size]
             dimensions = ["variants", "samples"]

             colspecs.append(
@@ -1200,7 +1399,6 @@
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             shape += [ploidy]
@@ -1214,7 +1412,6 @@
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             colspecs.append(
@@ -1226,47 +1423,100 @@
                 chunks=list(chunks),
                 dimensions=list(dimensions),
                 description="",
-                compressor=compressor,
             )
         )

-        return ZarrConversionSpec(
-            # TODO do something systematic
-            format_version="0.1",
-            chunk_width=chunk_width,
-            chunk_length=chunk_length,
+        return VcfZarrSchema(
+            format_version=ZARR_SCHEMA_FORMAT_VERSION,
+            samples_chunk_size=samples_chunk_size,
+            variants_chunk_size=variants_chunk_size,
             columns={col.name: col for col in colspecs},
             dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
-            sample_id=pcvcf.metadata.samples,
-            contig_id=pcvcf.metadata.contig_names,
-            contig_length=pcvcf.metadata.contig_lengths,
-            filter_id=pcvcf.metadata.filters,
+            sample_id=icf.metadata.samples,
+            contig_id=icf.metadata.contig_names,
+            contig_length=icf.metadata.contig_lengths,
+            filter_id=icf.metadata.filters,
         )


-class SgvcfZarr:
+class VcfZarr:
     def __init__(self, path):
+        if not (path / ".zmetadata").exists():
+            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+        self.root = zarr.open(path, mode="r")
+
+    def __repr__(self):
+        return repr(self.root)  # NEEDS TEST
+
+    def summary_table(self):
+        data = []
+        arrays = [(a.nbytes_stored, a) for _, a in self.root.arrays()]
+        arrays.sort(key=lambda x: x[0])
+        for stored, array in reversed(arrays):
+            d = {
+                "name": array.name,
+                "dtype": str(array.dtype),
+                "stored": display_size(stored),
+                "size": display_size(array.nbytes),
+                "ratio": display_number(array.nbytes / stored),
+                "nchunks": str(array.nchunks),
+                "chunk_size": display_size(array.nbytes / array.nchunks),
+                "avg_chunk_stored": display_size(int(stored / array.nchunks)),
+                "shape": str(array.shape),
+                "chunk_shape": str(array.chunks),
+                "compressor": str(array.compressor),
+                "filters": str(array.filters),
+            }
+            data.append(d)
+        return data
+
+
+@dataclasses.dataclass
+class EncodingWork:
+    func: callable = dataclasses.field(repr=False)
+    start: int
+    stop: int
+    columns: list[str]
+    memory: int = 0
+
+
+class VcfZarrWriter:
+    def __init__(self, path, icf, schema):
         self.path = pathlib.Path(path)
-        self.root = None
+        self.icf = icf
+        self.schema = schema
+        store = zarr.DirectoryStore(self.path)
+        self.root = zarr.group(store=store)

-    def create_array(self, variable):
+    def init_array(self, variable):
         # print("CREATE", variable)
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
         a = self.root.empty(
-            variable.name,
+            "wip_" + variable.name,
             shape=variable.shape,
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
+            filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions

-    def encode_column_slice(self, pcvcf, column, start, stop):
-        source_col = pcvcf.columns[column.vcf_field]
-        array = self.root[column.name]
+    def get_array(self, name):
+        return self.root["wip_" + name]
+
+    def finalise_array(self, variable_name):
+        source = self.path / ("wip_" + variable_name)
+        dest = self.path / variable_name
+        # Atomic swap
+        os.rename(source, dest)
+        logger.info(f"Finalised {variable_name}")
+
+    def encode_array_slice(self, column, start, stop):
+        source_col = self.icf.columns[column.vcf_field]
+        array = self.get_array(column.name)
         ba = core.BufferedArray(array, start)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)

@@ -1278,11 +1528,11 @@ class SgvcfZarr:
                 ba.flush()
         logger.debug(f"Encoded {column.name} slice {start}:{stop}")

-    def encode_genotypes_slice(self, pcvcf, start, stop):
-        source_col = pcvcf.columns["FORMAT/GT"]
-        gt = core.BufferedArray(self.root["call_genotype"], start)
-        gt_mask = core.BufferedArray(self.root["call_genotype_mask"], start)
-        gt_phased = core.BufferedArray(self.root["call_genotype_phased"], start)
+    def encode_genotypes_slice(self, start, stop):
+        source_col = self.icf.columns["FORMAT/GT"]
+        gt = core.BufferedArray(self.get_array("call_genotype"), start)
+        gt_mask = core.BufferedArray(self.get_array("call_genotype_mask"), start)
+        gt_phased = core.BufferedArray(self.get_array("call_genotype_phased"), start)

         for value in source_col.iter_values(start, stop):
             j = gt.next_buffer_row()
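A note on the wip_ naming used by init_array, get_array and finalise_array above: in a zarr DirectoryStore each array lives in a directory named after the array, so writing under a wip_-prefixed name and renaming the directory once the last slice of that column has been encoded makes the array appear atomically under its final name. A rough standalone restatement of the rename step (the store path here is invented):

import os
import pathlib

zarr_store_path = pathlib.Path("out.vcf.zarr")  # hypothetical DirectoryStore path


def finalise_array(variable_name):
    # e.g. "wip_variant_position" -> "variant_position"; a single rename, so
    # readers never observe a partially written array under its final name.
    source = zarr_store_path / ("wip_" + variable_name)
    dest = zarr_store_path / variable_name
    os.rename(source, dest)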
@@ -1298,10 +1548,10 @@
         gt_mask.flush()
         logger.debug(f"Encoded GT slice {start}:{stop}")

-    def encode_alleles_slice(self, pcvcf, start, stop):
-        ref_col = pcvcf.columns["REF"]
-        alt_col = pcvcf.columns["ALT"]
-        alleles = core.BufferedArray(self.root["variant_allele"], start)
+    def encode_alleles_slice(self, start, stop):
+        ref_col = self.icf.columns["REF"]
+        alt_col = self.icf.columns["ALT"]
+        alleles = core.BufferedArray(self.get_array("variant_allele"), start)

         for ref, alt in zip(
             ref_col.iter_values(start, stop), alt_col.iter_values(start, stop)
@@ -1313,10 +1563,10 @@
         alleles.flush()
         logger.debug(f"Encoded alleles slice {start}:{stop}")

-    def encode_id_slice(self, pcvcf, start, stop):
-        col = pcvcf.columns["ID"]
-        vid = core.BufferedArray(self.root["variant_id"], start)
-        vid_mask = core.BufferedArray(self.root["variant_id_mask"], start)
+    def encode_id_slice(self, start, stop):
+        col = self.icf.columns["ID"]
+        vid = core.BufferedArray(self.get_array("variant_id"), start)
+        vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)

         for value in col.iter_values(start, stop):
             j = vid.next_buffer_row()
@@ -1332,182 +1582,246 @@ class SgvcfZarr:
             vid_mask.flush()
         logger.debug(f"Encoded ID slice {start}:{stop}")

-    def encode_filters_slice(self, pcvcf, lookup, start, stop):
-        col = pcvcf.columns["FILTERS"]
-        var_filter = core.BufferedArray(self.root["variant_filter"], start)
+    def encode_filters_slice(self, lookup, start, stop):
+        col = self.icf.columns["FILTERS"]
+        var_filter = core.BufferedArray(self.get_array("variant_filter"), start)

         for value in col.iter_values(start, stop):
             j = var_filter.next_buffer_row()
             var_filter.buff[j] = False
-            try:
-                for f in value:
+            for f in value:
+                try:
                     var_filter.buff[j, lookup[f]] = True
-            except IndexError:
-                raise ValueError(f"Filter '{f}' was not defined in the header.")
+                except KeyError:
+                    raise ValueError(f"Filter '{f}' was not defined in the header.")
         var_filter.flush()
         logger.debug(f"Encoded FILTERS slice {start}:{stop}")

-    def encode_contig_slice(self, pcvcf, lookup, start, stop):
-        col = pcvcf.columns["CHROM"]
-        contig = core.BufferedArray(self.root["variant_contig"], start)
+    def encode_contig_slice(self, lookup, start, stop):
+        col = self.icf.columns["CHROM"]
+        contig = core.BufferedArray(self.get_array("variant_contig"), start)

         for value in col.iter_values(start, stop):
             j = contig.next_buffer_row()
-            try:
-                contig.buff[j] = lookup[value[0]]
-            except KeyError:
-                # TODO add advice about adding it to the spec
-                raise ValueError(f"Contig '{contig}' was not defined in the header.")
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that we the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            contig.buff[j] = lookup[value[0]]
         contig.flush()
         logger.debug(f"Encoded CHROM slice {start}:{stop}")

-    def encode_samples(self, pcvcf, sample_id, chunk_width):
-        if not np.array_equal(sample_id, pcvcf.metadata.samples):
-            raise ValueError("Subsetting or reordering samples not supported currently")
+    def encode_samples(self):
+        if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
+            raise ValueError(
+                "Subsetting or reordering samples not supported currently"
+            )  # NEEDS TEST
         array = self.root.array(
             "sample_id",
-            sample_id,
+            self.schema.sample_id,
             dtype="str",
-            compressor=core.default_compressor,
-            chunks=(chunk_width,),
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+            chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")

-    def encode_contig_id(self, pcvcf, contig_names, contig_lengths):
+    def encode_contig_id(self):
         array = self.root.array(
             "contig_id",
-            contig_names,
+            self.schema.contig_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if contig_lengths is not None:
+        if self.schema.contig_length is not None:
             array = self.root.array(
                 "contig_length",
-                contig_lengths,
+                self.schema.contig_length,
                 dtype=np.int64,
             )
             array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        return {v: j for j, v in enumerate(contig_names)}
+        return {v: j for j, v in enumerate(self.schema.contig_id)}

-    def encode_filter_id(self, pcvcf, filter_names):
+    def encode_filter_id(self):
         array = self.root.array(
             "filter_id",
-            filter_names,
+            self.schema.filter_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
-        return {v: j for j, v in enumerate(filter_names)}
+        return {v: j for j, v in enumerate(self.schema.filter_id)}
+
+    def init(self):
+        self.root.attrs["vcf_zarr_version"] = "0.2"
+        self.root.attrs["vcf_header"] = self.icf.vcf_header
+        self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
+        for column in self.schema.columns.values():
+            self.init_array(column)
+
+    def finalise(self):
+        # for column in self.schema.columns.values():
+        #     self.finalise_array(column)
+        zarr.consolidate_metadata(self.path)

-    @staticmethod
     def encode(
-        pcvcf,
-        path,
-        conversion_spec,
-        *,
+        self,
         worker_processes=1,
         max_v_chunks=None,
         show_progress=False,
+        max_memory=None,
     ):
-        path = pathlib.Path(path)
-        # TODO: we should do this as a future to avoid blocking
-        if path.exists():
-            logger.warning(f"Deleting existing {path}")
-            shutil.rmtree(path)
-        write_path = path.with_suffix(path.suffix + f".{os.getpid()}.build")
-        store = zarr.DirectoryStore(write_path)
-        # FIXME, duplicating logic about the store
-        logger.info(f"Create zarr at {write_path}")
-        sgvcf = SgvcfZarr(write_path)
-        sgvcf.root = zarr.group(store=store, overwrite=True)
-        for column in conversion_spec.columns.values():
-            sgvcf.create_array(column)
-
-        sgvcf.root.attrs["vcf_zarr_version"] = "0.2"
-        sgvcf.root.attrs["vcf_header"] = pcvcf.vcf_header
-        sgvcf.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
+        if max_memory is None:
+            # Unbounded
+            max_memory = 2**63
+        else:
+            # Value is specified in Mibibytes
+            max_memory *= 2**20  # NEEDS TEST

+        # TODO this will move into the setup logic later when we're making it possible
+        # to split the work by slice
         num_slices = max(1, worker_processes * 4)
         # Using POS arbitrarily to get the array slices
         slices = core.chunk_aligned_slices(
-            sgvcf.root["variant_position"], num_slices, max_chunks=max_v_chunks
+            self.get_array("variant_position"), num_slices, max_chunks=max_v_chunks
         )
         truncated = slices[-1][-1]
-        for array in sgvcf.root.values():
+        for array in self.root.values():
             if array.attrs["_ARRAY_DIMENSIONS"][0] == "variants":
                 shape = list(array.shape)
                 shape[0] = truncated
                 array.resize(shape)

-        chunked_1d = [
-            col for col in conversion_spec.columns.values() if len(col.chunks) <= 1
-        ]
+        total_bytes = 0
+        encoding_memory_requirements = {}
+        for col in self.schema.columns.values():
+            array = self.get_array(col.name)
+            # NOTE!! this is bad, we're potentially creating quite a large
+            # numpy array for basically nothing. We can compute this.
+            variant_chunk_size = array.blocks[0].nbytes
+            encoding_memory_requirements[col.name] = variant_chunk_size
+            logger.debug(
+                f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+            )
+            total_bytes += array.nbytes
+
+        filter_id_map = self.encode_filter_id()
+        contig_id_map = self.encode_contig_id()
+
+        work = []
+        for start, stop in slices:
+            for col in self.schema.columns.values():
+                if col.vcf_field is not None:
+                    f = functools.partial(self.encode_array_slice, col)
+                    work.append(
+                        EncodingWork(
+                            f,
+                            start,
+                            stop,
+                            [col.name],
+                            encoding_memory_requirements[col.name],
+                        )
+                    )
+            work.append(
+                EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
+            )
+            work.append(
+                EncodingWork(
+                    self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
+                )
+            )
+            work.append(
+                EncodingWork(
+                    functools.partial(self.encode_filters_slice, filter_id_map),
+                    start,
+                    stop,
+                    ["variant_filter"],
+                )
+            )
+            work.append(
+                EncodingWork(
+                    functools.partial(self.encode_contig_slice, contig_id_map),
+                    start,
+                    stop,
+                    ["variant_contig"],
+                )
+            )
+            if "call_genotype" in self.schema.columns:
+                variables = [
+                    "call_genotype",
+                    "call_genotype_phased",
+                    "call_genotype_mask",
+                ]
+                gt_memory = sum(
+                    encoding_memory_requirements[name] for name in variables
+                )
+                work.append(
+                    EncodingWork(
+                        self.encode_genotypes_slice, start, stop, variables, gt_memory
+                    )
+                )
+
+        # Fail early if we can't fit a particular column into memory
+        for wp in work:
+            if wp.memory >= max_memory:
+                raise ValueError(  # NEEDS TEST
+                    f"Insufficient memory for {wp.columns}: "
+                    f"{display_size(wp.memory)} > {display_size(max_memory)}"
+                )
+
         progress_config = core.ProgressConfig(
-            total=sum(sgvcf.root[col.name].nchunks for col in chunked_1d),
-            title="Encode 1D",
-            units="chunks",
+            total=total_bytes,
+            title="Encode",
+            units="B",
             show=show_progress,
         )

-        # Do these syncronously for simplicity so we have the mapping
-        filter_id_map = sgvcf.encode_filter_id(pcvcf, conversion_spec.filter_id)
-        contig_id_map = sgvcf.encode_contig_id(
-            pcvcf, conversion_spec.contig_id, conversion_spec.contig_length
-        )
+        used_memory = 0
+        max_queued = 4 * max(1, worker_processes)
+        encoded_slices = collections.Counter()

         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            pwm.submit(
-                sgvcf.encode_samples,
-                pcvcf,
-                conversion_spec.sample_id,
-                conversion_spec.chunk_width,
-            )
-            for start, stop in slices:
-                pwm.submit(sgvcf.encode_alleles_slice, pcvcf, start, stop)
-                pwm.submit(sgvcf.encode_id_slice, pcvcf, start, stop)
-                pwm.submit(
-                    sgvcf.encode_filters_slice, pcvcf, filter_id_map, start, stop
-                )
-                pwm.submit(sgvcf.encode_contig_slice, pcvcf, contig_id_map, start, stop)
-                for col in chunked_1d:
-                    if col.vcf_field is not None:
-                        pwm.submit(sgvcf.encode_column_slice, pcvcf, col, start, stop)
-
-        chunked_2d = [
-            col for col in conversion_spec.columns.values() if len(col.chunks) >= 2
-        ]
-        if len(chunked_2d) > 0:
-            progress_config = core.ProgressConfig(
-                total=sum(sgvcf.root[col.name].nchunks for col in chunked_2d),
-                title="Encode 2D",
-                units="chunks",
-                show=show_progress,
-            )
-            with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-                if "call_genotype" in conversion_spec.columns:
-                    logger.info(f"Submit encode call_genotypes in {len(slices)} slices")
-                    for start, stop in slices:
-                        pwm.submit(sgvcf.encode_genotypes_slice, pcvcf, start, stop)
-
-                for col in chunked_2d:
-                    if col.vcf_field is not None:
-                        logger.info(f"Submit encode {col.name} in {len(slices)} slices")
-                        for start, stop in slices:
-                            pwm.submit(
-                                sgvcf.encode_column_slice, pcvcf, col, start, stop
-                            )
-
-        zarr.consolidate_metadata(write_path)
-        # Atomic swap, now we've completely finished.
-        logger.info(f"Moving to final path {path}")
-        os.rename(write_path, path)
+            future = pwm.submit(self.encode_samples)
+            future_to_work = {future: EncodingWork(None, 0, 0, [])}
+
+            def service_completed_futures():
+                nonlocal used_memory
+
+                completed = pwm.wait_for_completed()
+                for future in completed:
+                    wp_done = future_to_work.pop(future)
+                    used_memory -= wp_done.memory
+                    logger.debug(
+                        f"Complete {wp_done}: used mem={display_size(used_memory)}"
+                    )
+                    for column in wp_done.columns:
+                        encoded_slices[column] += 1
+                        if encoded_slices[column] == len(slices):
+                            # Do this syncronously for simplicity. Should be
+                            # fine as the workers will probably be busy with
+                            # large encode tasks most of the time.
+                            self.finalise_array(column)
+
+            for wp in work:
+                if (
+                    used_memory + wp.memory > max_memory
+                    or len(future_to_work) > max_queued
+                ):
+                    service_completed_futures()
+                future = pwm.submit(wp.func, wp.start, wp.stop)
+                used_memory += wp.memory
+                logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
+                future_to_work[future] = wp
+
+            logger.debug("All work submitted")
+            while len(future_to_work) > 0:
+                service_completed_futures()


 def mkschema(if_path, out):
-    pcvcf = PickleChunkedVcf.load(if_path)
-    spec = ZarrConversionSpec.generate(pcvcf)
+    icf = IntermediateColumnarFormat(if_path)
+    spec = VcfZarrSchema.generate(icf)
     out.write(spec.asjson())
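The rewritten VcfZarrWriter.encode above is, at heart, a memory-bounded scheduler: every EncodingWork item carries an estimate of the buffer space it needs, new work is submitted only while the in-flight total stays below max_memory (and the queue below a small multiple of the worker count), and completed futures release their reservation. A simplified synchronous sketch of that bookkeeping, independent of the ParallelWorkManager machinery and with invented sizes:

import dataclasses


@dataclasses.dataclass
class Work:
    name: str
    memory: int  # estimated bytes needed while this item is running


def schedule(work_items, max_memory, max_queued):
    in_flight = []
    used_memory = 0
    for item in work_items:
        while in_flight and (
            used_memory + item.memory > max_memory or len(in_flight) > max_queued
        ):
            done = in_flight.pop(0)  # stand-in for "wait for a completed future"
            used_memory -= done.memory
        in_flight.append(item)
        used_memory += item.memory
        assert used_memory <= max_memory
    return "all submitted"


work = [Work("variant_position", 10), Work("call_genotype", 80), Work("INFO/DP", 10)]
schedule(work, max_memory=100, max_queued=4)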
@@ -1515,42 +1829,49 @@ def encode(
     if_path,
     zarr_path,
     schema_path=None,
-    chunk_length=None,
-    chunk_width=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
     max_v_chunks=None,
+    max_memory=None,
     worker_processes=1,
     show_progress=False,
 ):
-    pcvcf = PickleChunkedVcf.load(if_path)
+    icf = IntermediateColumnarFormat(if_path)
     if schema_path is None:
-        schema = ZarrConversionSpec.generate(
-            pcvcf,
-            chunk_length=chunk_length,
-            chunk_width=chunk_width,
+        schema = VcfZarrSchema.generate(
+            icf,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
         )
     else:
         logger.info(f"Reading schema from {schema_path}")
-        if chunk_length is not None or chunk_width is not None:
-            raise ValueError("Cannot specify schema along with chunk sizes")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
         with open(schema_path, "r") as f:
-            schema = ZarrConversionSpec.fromjson(f.read())
-
-    SgvcfZarr.encode(
-        pcvcf,
-        zarr_path,
-        conversion_spec=schema,
+            schema = VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    if zarr_path.exists():
+        logger.warning(f"Deleting existing {zarr_path}")
+        shutil.rmtree(zarr_path)
+    vzw = VcfZarrWriter(zarr_path, icf, schema)
+    vzw.init()
+    vzw.encode(
         max_v_chunks=max_v_chunks,
         worker_processes=worker_processes,
+        max_memory=max_memory,
        show_progress=show_progress,
    )
+    vzw.finalise()


 def convert(
     vcfs,
     out_path,
     *,
-    chunk_length=None,
-    chunk_width=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
     worker_processes=1,
     show_progress=False,
     # TODO add arguments to control location of tmpdir
@@ -1565,8 +1886,8 @@ def convert(
         encode(
             if_dir,
             out_path,
-            chunk_length=chunk_length,
-            chunk_width=chunk_width,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
             worker_processes=worker_processes,
             show_progress=show_progress,
         )
@@ -1744,16 +2065,14 @@ def validate(vcf_path, zarr_path, show_progress=False):
         name = colname.split("_", 1)[1]
         if name.isupper():
             vcf_type = info_headers[name]["Type"]
-            # print(root[colname])
             info_fields[name] = vcf_type, iter(root[colname])
-    # print(info_fields)

     first_pos = next(vcf).POS
     start_index = np.searchsorted(pos, first_pos)
     assert pos[start_index] == first_pos
     vcf = cyvcf2.VCF(vcf_path)
     if show_progress:
-        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)
+        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
     else:
         iterator = vcf
     for j, row in enumerate(iterator, start_index):
@@ -1790,11 +2109,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
             assert_info_val_equal(vcf_val, zarr_val, vcf_type)

         for name, (vcf_type, zarr_iter) in format_fields.items():
-            vcf_val = None
-            try:
-                vcf_val = row.format(name)
-            except KeyError:
-                pass
+            vcf_val = row.format(name)
             zarr_val = next(zarr_iter)
             if vcf_val is None:
                 assert_format_val_missing(zarr_val, vcf_type)