bio2zarr 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
bio2zarr/vcf.py CHANGED
@@ -1,17 +1,17 @@
  import collections
  import contextlib
  import dataclasses
- import functools
  import json
  import logging
  import math
  import os
+ import os.path
  import pathlib
  import pickle
  import shutil
  import sys
  import tempfile
- from typing import Any, List
+ from typing import Any

  import cyvcf2
  import humanfriendly
@@ -111,9 +111,6 @@ class VcfField:
              return self.name
          return f"{self.category}/{self.name}"

-     # TODO add method here to choose a good set compressor and
-     # filters default here for this field.
-
      def smallest_dtype(self):
          """
          Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ class VcfField:
          if self.vcf_type == "Float":
              ret = "f4"
          elif self.vcf_type == "Integer":
-             dtype = "i4"
-             for a_dtype in ["i1", "i2"]:
-                 info = np.iinfo(a_dtype)
-                 if info.min <= s.min_value and s.max_value <= info.max:
-                     dtype = a_dtype
-                     break
-             ret = dtype
+             if not math.isfinite(s.max_value):
+                 # All missing values; use i1. Note we should have some API to
+                 # check more explicitly for missingness:
+                 # https://github.com/sgkit-dev/bio2zarr/issues/131
+                 ret = "i1"
+             else:
+                 ret = core.min_int_dtype(s.min_value, s.max_value)
          elif self.vcf_type == "Flag":
              ret = "bool"
          elif self.vcf_type == "Character":
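The Integer branch above now delegates width selection to core.min_int_dtype. A minimal standalone sketch of the behaviour this assumes (the real helper lives in bio2zarr.core and may differ in detail):

    import numpy as np

    def min_int_dtype(min_value, max_value):
        # Smallest signed integer dtype that can hold [min_value, max_value];
        # mirrors the old i1/i2 search shown above, extended through i8.
        for dtype in ["i1", "i2", "i4", "i8"]:
            info = np.iinfo(dtype)
            if info.min <= min_value and max_value <= info.max:
                return dtype
        raise OverflowError("range does not fit in int64")

    assert min_int_dtype(0, 100) == "i1"
    assert min_int_dtype(-40_000, 40_000) == "i4"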
@@ -147,25 +144,41 @@ class VcfPartition:
      num_records: int = -1


- ICF_METADATA_FORMAT_VERSION = "0.2"
+ ICF_METADATA_FORMAT_VERSION = "0.3"
  ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
      cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
  )


+ @dataclasses.dataclass
+ class Contig:
+     id: str
+     length: int = None
+
+
+ @dataclasses.dataclass
+ class Sample:
+     id: str
+
+
+ @dataclasses.dataclass
+ class Filter:
+     id: str
+     description: str = ""
+
+
  @dataclasses.dataclass
  class IcfMetadata:
      samples: list
-     contig_names: list
-     contig_record_counts: dict
+     contigs: list
      filters: list
      fields: list
      partitions: list = None
-     contig_lengths: list = None
      format_version: str = None
      compressor: dict = None
      column_chunk_size: int = None
      provenance: dict = None
+     num_records: int = -1

      @property
      def info_fields(self):
@@ -184,8 +197,12 @@ class IcfMetadata:
          return fields

      @property
-     def num_records(self):
-         return sum(self.contig_record_counts.values())
+     def num_contigs(self):
+         return len(self.contigs)
+
+     @property
+     def num_filters(self):
+         return len(self.filters)

      @staticmethod
      def fromdict(d):
@@ -194,18 +211,23 @@ class IcfMetadata:
                  "Intermediate columnar metadata format version mismatch: "
                  f"{d['format_version']} != {ICF_METADATA_FORMAT_VERSION}"
              )
-         fields = [VcfField.fromdict(fd) for fd in d["fields"]]
          partitions = [VcfPartition(**pd) for pd in d["partitions"]]
          for p in partitions:
              p.region = vcf_utils.Region(**p.region)
          d = d.copy()
-         d["fields"] = fields
          d["partitions"] = partitions
+         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
+         d["samples"] = [Sample(**sd) for sd in d["samples"]]
+         d["filters"] = [Filter(**fd) for fd in d["filters"]]
+         d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
          return IcfMetadata(**d)

      def asdict(self):
          return dataclasses.asdict(self)

+     def asjson(self):
+         return json.dumps(self.asdict(), indent=4)
+

  def fixed_vcf_field_definitions():
@@ -233,15 +255,22 @@ def fixed_vcf_field_definitions():
  def scan_vcf(path, target_num_partitions):
      with vcf_utils.IndexedVcf(path) as indexed_vcf:
          vcf = indexed_vcf.vcf
-         filters = [
-             h["ID"]
-             for h in vcf.header_iter()
-             if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str)
-         ]
+         filters = []
+         pass_index = -1
+         for h in vcf.header_iter():
+             if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str):
+                 try:
+                     description = h["Description"].strip('"')
+                 except KeyError:
+                     description = ""
+                 if h["ID"] == "PASS":
+                     pass_index = len(filters)
+                 filters.append(Filter(h["ID"], description))
+
          # Ensure PASS is the first filter if present
-         if "PASS" in filters:
-             filters.remove("PASS")
-             filters.insert(0, "PASS")
+         if pass_index > 0:
+             pass_filter = filters.pop(pass_index)
+             filters.insert(0, pass_filter)

          fields = fixed_vcf_field_definitions()
          for h in vcf.header_iter():
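The PASS handling above only reorders when PASS was found at a later index; a tiny worked example of that rule (plain strings stand in for the Filter objects):

    filters = ["q10", "PASS", "s50"]
    pass_index = filters.index("PASS") if "PASS" in filters else -1
    if pass_index > 0:
        filters.insert(0, filters.pop(pass_index))
    print(filters)  # ['PASS', 'q10', 's50']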
@@ -252,18 +281,22 @@ def scan_vcf(path, target_num_partitions):
                  field.vcf_number = "."
              fields.append(field)

+         try:
+             contig_lengths = vcf.seqlens
+         except AttributeError:
+             contig_lengths = [None for _ in vcf.seqnames]
+
          metadata = IcfMetadata(
-             samples=vcf.samples,
-             contig_names=vcf.seqnames,
-             contig_record_counts=indexed_vcf.contig_record_counts(),
+             samples=[Sample(sample_id) for sample_id in vcf.samples],
+             contigs=[
+                 Contig(contig_id, length)
+                 for contig_id, length in zip(vcf.seqnames, contig_lengths)
+             ],
              filters=filters,
              fields=fields,
              partitions=[],
+             num_records=sum(indexed_vcf.contig_record_counts().values()),
          )
-         try:
-             metadata.contig_lengths = vcf.seqlens
-         except AttributeError:
-             pass

          regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
          logger.info(
@@ -282,21 +315,6 @@ def scan_vcf(path, target_num_partitions):
      return metadata, vcf.raw_header


- def check_overlap(partitions):
-     for i in range(1, len(partitions)):
-         prev_partition = partitions[i - 1]
-         current_partition = partitions[i]
-         if (
-             prev_partition.region.contig == current_partition.region.contig
-             and prev_partition.region.end > current_partition.region.start
-         ):
-             raise ValueError(
-                 f"Multiple VCFs have the region "
-                 f"{prev_partition.region.contig}:{prev_partition.region.start}-"
-                 f"{current_partition.region.end}"
-             )
-
-
  def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
      logger.info(
          f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
@@ -325,27 +343,30 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
      # We just take the first header, assuming the others
      # are compatible.
      all_partitions = []
-     contig_record_counts = collections.Counter()
+     total_records = 0
      for metadata, _ in results:
-         all_partitions.extend(metadata.partitions)
-         metadata.partitions.clear()
-         contig_record_counts += metadata.contig_record_counts
-         metadata.contig_record_counts.clear()
+         for partition in metadata.partitions:
+             logger.debug(f"Scanned partition {partition}")
+             all_partitions.append(partition)
+         total_records += metadata.num_records
+         metadata.num_records = 0
+         metadata.partitions = []

      icf_metadata, header = results[0]
      for metadata, _ in results[1:]:
          if metadata != icf_metadata:
              raise ValueError("Incompatible VCF chunks")

-     icf_metadata.contig_record_counts = dict(contig_record_counts)
+     # Note: this will be infinity here if any of the chunks has an index
+     # that doesn't keep track of the number of records per-contig
+     icf_metadata.num_records = total_records

      # Sort by contig (in the order they appear in the header) first,
      # then by start coordinate
-     contig_index_map = {contig: j for j, contig in enumerate(metadata.contig_names)}
+     contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
      all_partitions.sort(
          key=lambda x: (contig_index_map[x.region.contig], x.region.start)
      )
-     check_overlap(all_partitions)
      icf_metadata.partitions = all_partitions
      logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
      return icf_metadata, header
@@ -443,7 +464,7 @@ def sanitise_value_float_2d(buff, j, value):

  def sanitise_int_array(value, ndmin, dtype):
      if isinstance(value, tuple):
-         value = [VCF_INT_MISSING if x is None else x for x in value] # NEEDS TEST
+         value = [VCF_INT_MISSING if x is None else x for x in value]  # NEEDS TEST
      value = np.array(value, ndmin=ndmin, copy=False)
      value[value == VCF_INT_MISSING] = -1
      value[value == VCF_INT_FILL] = -2
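For reference, the sentinel mapping applied here converts cyvcf2's missing and fill integers to the vcf-zarr values -1 and -2; a standalone sketch with stand-in sentinel constants (the real VCF_INT_MISSING/VCF_INT_FILL values come from the module):

    import numpy as np

    MISSING, FILL = -2147483648, -2147483647  # stand-ins, purely illustrative
    value = np.array([10, MISSING, FILL], ndmin=1)
    value[value == MISSING] = -1
    value[value == FILL] = -2
    print(value)  # [10 -1 -2]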
@@ -736,9 +757,9 @@ class IcfFieldWriter:
      transformer: VcfValueTransformer
      compressor: Any
      max_buffered_bytes: int
-     buff: List[Any] = dataclasses.field(default_factory=list)
+     buff: list[Any] = dataclasses.field(default_factory=list)
      buffered_bytes: int = 0
-     chunk_index: List[int] = dataclasses.field(default_factory=lambda: [0])
+     chunk_index: list[int] = dataclasses.field(default_factory=lambda: [0])
      num_records: int = 0

      def append(self, val):
@@ -842,19 +863,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
              self.metadata = IcfMetadata.fromdict(json.load(f))
          with open(self.path / "header.txt") as f:
              self.vcf_header = f.read()
-
          self.compressor = numcodecs.get_codec(self.metadata.compressor)
-         self.columns = {}
+         self.fields = {}
          partition_num_records = [
              partition.num_records for partition in self.metadata.partitions
          ]
          # Allow us to find which partition a given record is in
          self.partition_record_index = np.cumsum([0, *partition_num_records])
          for field in self.metadata.fields:
-             self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
+             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
          logger.info(
              f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
-             f"records={self.num_records}, columns={self.num_columns})"
+             f"records={self.num_records}, fields={self.num_fields})"
          )

      def __repr__(self):
@@ -865,17 +885,17 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
          )

      def __getitem__(self, key):
-         return self.columns[key]
+         return self.fields[key]

      def __iter__(self):
-         return iter(self.columns)
+         return iter(self.fields)

      def __len__(self):
-         return len(self.columns)
+         return len(self.fields)

      def summary_table(self):
          data = []
-         for name, col in self.columns.items():
+         for name, col in self.fields.items():
              summary = col.vcf_field.summary
              d = {
                  "name": name,
@@ -891,9 +911,9 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
              data.append(d)
          return data

-     @functools.cached_property
+     @property
      def num_records(self):
-         return sum(self.metadata.contig_record_counts.values())
+         return self.metadata.num_records

      @property
      def num_partitions(self):
@@ -904,8 +924,42 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
          return len(self.metadata.samples)

      @property
-     def num_columns(self):
-         return len(self.columns)
+     def num_fields(self):
+         return len(self.fields)
+
+
+ @dataclasses.dataclass
+ class IcfPartitionMetadata:
+     num_records: int
+     last_position: int
+     field_summaries: dict
+
+     def asdict(self):
+         return dataclasses.asdict(self)
+
+     def asjson(self):
+         return json.dumps(self.asdict(), indent=4)
+
+     @staticmethod
+     def fromdict(d):
+         md = IcfPartitionMetadata(**d)
+         for k, v in md.field_summaries.items():
+             md.field_summaries[k] = VcfFieldSummary.fromdict(v)
+         return md
+
+
+ def check_overlapping_partitions(partitions):
+     for i in range(1, len(partitions)):
+         prev_region = partitions[i - 1].region
+         current_region = partitions[i].region
+         if prev_region.contig == current_region.contig:
+             assert prev_region.end is not None
+             # Regions are *inclusive*
+             if prev_region.end >= current_region.start:
+                 raise ValueError(
+                     f"Overlapping VCF regions in partitions {i - 1} and {i}: "
+                     f"{prev_region} and {current_region}"
+                 )


  class IntermediateColumnarFormatWriter:
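check_overlapping_partitions relies on regions being inclusive of both endpoints, so two partitions that merely touch are treated as overlapping. A standalone illustration with a stand-in Region:

    from dataclasses import dataclass

    @dataclass
    class Region:
        contig: str
        start: int
        end: int

    # Inclusive coordinates: these two partitions share position 1000.
    prev, cur = Region("chr1", 1, 1000), Region("chr1", 1000, 2000)
    print(prev.contig == cur.contig and prev.end >= cur.start)  # True -> would raise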
@@ -979,11 +1033,8 @@ class IntermediateColumnarFormatWriter:
          not_found = []
          for j in range(self.num_partitions):
              try:
-                 with open(self.wip_path / f"p{j}_summary.json") as f:
-                     summary = json.load(f)
-                     for k, v in summary["field_summaries"].items():
-                         summary["field_summaries"][k] = VcfFieldSummary.fromdict(v)
-                     summaries.append(summary)
+                 with open(self.wip_path / f"p{j}.json") as f:
+                     summaries.append(IcfPartitionMetadata.fromdict(json.load(f)))
              except FileNotFoundError:
                  not_found.append(j)
          if len(not_found) > 0:
@@ -1000,7 +1051,7 @@ class IntermediateColumnarFormatWriter:

      def process_partition(self, partition_index):
          self.load_metadata()
-         summary_path = self.wip_path / f"p{partition_index}_summary.json"
+         summary_path = self.wip_path / f"p{partition_index}.json"
          # If someone is rewriting a summary path (for whatever reason), make sure it
          # doesn't look like it's already been completed.
          # NOTE to do this properly we probably need to take a lock on this file - but
@@ -1021,6 +1072,7 @@ class IntermediateColumnarFormatWriter:
              else:
                  format_fields.append(field)

+         last_position = None
          with IcfPartitionWriter(
              self.metadata,
              self.path,
@@ -1030,6 +1082,7 @@ class IntermediateColumnarFormatWriter:
              num_records = 0
              for variant in ivcf.variants(partition.region):
                  num_records += 1
+                 last_position = variant.POS
                  tcw.append("CHROM", variant.CHROM)
                  tcw.append("POS", variant.POS)
                  tcw.append("QUAL", variant.QUAL)
@@ -1054,37 +1107,32 @@ class IntermediateColumnarFormatWriter:
                  f"flushing buffers"
              )

-         partition_metadata = {
-             "num_records": num_records,
-             "field_summaries": {k: v.asdict() for k, v in tcw.field_summaries.items()},
-         }
+         partition_metadata = IcfPartitionMetadata(
+             num_records=num_records,
+             last_position=last_position,
+             field_summaries=tcw.field_summaries,
+         )
          with open(summary_path, "w") as f:
-             json.dump(partition_metadata, f, indent=4)
+             f.write(partition_metadata.asjson())
          logger.info(
-             f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
-             f"{num_records} records"
+             f"Finish p{partition_index} {partition.vcf_path}__{partition.region} "
+             f"{num_records} records last_pos={last_position}"
          )

-     def process_partition_slice(
-         self,
-         start,
-         stop,
-         *,
-         worker_processes=1,
-         show_progress=False,
-     ):
+     def explode(self, *, worker_processes=1, show_progress=False):
          self.load_metadata()
-         if start == 0 and stop == self.num_partitions:
-             num_records = self.metadata.num_records
-         else:
-             # We only know the number of records if all partitions are done at once,
-             # and we signal this to tqdm by passing None as the total.
+         num_records = self.metadata.num_records
+         if np.isinf(num_records):
+             logger.warning(
+                 "Total records unknown, cannot show progress; "
+                 "reindex VCFs with bcftools index to fix"
+             )
              num_records = None
-         num_columns = len(self.metadata.fields)
+         num_fields = len(self.metadata.fields)
          num_samples = len(self.metadata.samples)
          logger.info(
-             f"Exploding columns={num_columns} samples={num_samples}; "
-             f"partitions={stop - start} "
+             f"Exploding fields={num_fields} samples={num_samples}; "
+             f"partitions={self.num_partitions} "
              f"variants={'unknown' if num_records is None else num_records}"
          )
          progress_config = core.ProgressConfig(
@@ -1094,48 +1142,43 @@ class IntermediateColumnarFormatWriter:
              show=show_progress,
          )
          with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-             for j in range(start, stop):
+             for j in range(self.num_partitions):
                  pwm.submit(self.process_partition, j)

-     def explode(self, *, worker_processes=1, show_progress=False):
-         self.load_metadata()
-         return self.process_partition_slice(
-             0,
-             self.num_partitions,
-             worker_processes=worker_processes,
-             show_progress=show_progress,
-         )
-
-     def explode_partition(self, partition, *, show_progress=False, worker_processes=1):
+     def explode_partition(self, partition):
          self.load_metadata()
          if partition < 0 or partition >= self.num_partitions:
              raise ValueError(
                  "Partition index must be in the range 0 <= index < num_partitions"
              )
-         return self.process_partition_slice(
-             partition,
-             partition + 1,
-             worker_processes=worker_processes,
-             show_progress=show_progress,
-         )
+         self.process_partition(partition)

      def finalise(self):
          self.load_metadata()
          partition_summaries = self.load_partition_summaries()
          total_records = 0
          for index, summary in enumerate(partition_summaries):
-             partition_records = summary["num_records"]
+             partition_records = summary.num_records
              self.metadata.partitions[index].num_records = partition_records
+             self.metadata.partitions[index].region.end = summary.last_position
              total_records += partition_records
-         assert total_records == self.metadata.num_records
+         if not np.isinf(self.metadata.num_records):
+             # Note: this is just telling us that there's a bug in the
+             # index based record counting code, but it doesn't actually
+             # matter much. We may want to just make this a warning if
+             # we hit regular problems.
+             assert total_records == self.metadata.num_records
+         self.metadata.num_records = total_records
+
+         check_overlapping_partitions(self.metadata.partitions)

          for field in self.metadata.fields:
              for summary in partition_summaries:
-                 field.summary.update(summary["field_summaries"][field.full_name])
+                 field.summary.update(summary.field_summaries[field.full_name])

          logger.info("Finalising metadata")
          with open(self.path / "metadata.json", "w") as f:
-             json.dump(self.metadata.asdict(), f, indent=4)
+             f.write(self.metadata.asjson())

          logger.debug("Removing WIP directory")
          shutil.rmtree(self.wip_path)
@@ -1186,14 +1229,9 @@ def explode_init(
      )


- # NOTE only including worker_processes here so we can use the 0 option to get the
- # work done syncronously and so we can get test coverage on it. Should find a
- # better way to do this.
- def explode_partition(icf_path, partition, *, show_progress=False, worker_processes=1):
+ def explode_partition(icf_path, partition):
      writer = IntermediateColumnarFormatWriter(icf_path)
-     writer.explode_partition(
-         partition, show_progress=show_progress, worker_processes=worker_processes
-     )
+     writer.explode_partition(partition)


  def explode_finalise(icf_path):
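With the per-partition API reduced to explode_partition(icf_path, partition) and explode_finalise(icf_path), a distributed explode can be driven roughly as follows (a sketch; the path and partition count are made up and would come from the earlier explode_init step, whose signature is not shown in this diff):

    from bio2zarr import vcf

    icf_path = "sample.icf"   # hypothetical output directory
    num_partitions = 8        # hypothetical value reported by the init step
    for j in range(num_partitions):
        vcf.explode_partition(icf_path, j)   # each call can run as a separate job
    vcf.explode_finalise(icf_path)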
@@ -1242,6 +1280,50 @@ class ZarrColumnSpec:
          spec._choose_compressor_settings()
          return spec

+     @staticmethod
+     def from_field(
+         vcf_field,
+         *,
+         num_variants,
+         num_samples,
+         variants_chunk_size,
+         samples_chunk_size,
+         variable_name=None,
+     ):
+         shape = [num_variants]
+         prefix = "variant_"
+         dimensions = ["variants"]
+         chunks = [variants_chunk_size]
+         if vcf_field.category == "FORMAT":
+             prefix = "call_"
+             shape.append(num_samples)
+             chunks.append(samples_chunk_size)
+             dimensions.append("samples")
+         if variable_name is None:
+             variable_name = prefix + vcf_field.name
+         # TODO make an option to add in the empty extra dimension
+         if vcf_field.summary.max_number > 1:
+             shape.append(vcf_field.summary.max_number)
+             # TODO we should really be checking this to see if the named dimensions
+             # are actually correct.
+             if vcf_field.vcf_number == "R":
+                 dimensions.append("alleles")
+             elif vcf_field.vcf_number == "A":
+                 dimensions.append("alt_alleles")
+             elif vcf_field.vcf_number == "G":
+                 dimensions.append("genotypes")
+             else:
+                 dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
+         return ZarrColumnSpec.new(
+             vcf_field=vcf_field.full_name,
+             name=variable_name,
+             dtype=vcf_field.smallest_dtype(),
+             shape=shape,
+             chunks=chunks,
+             dimensions=dimensions,
+             description=vcf_field.description,
+         )
+
      def _choose_compressor_settings(self):
          """
          Choose compressor and filter settings based on the size and
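The dimension naming that from_field applies can be summarised in a few lines; a standalone sketch of the rule, for a hypothetical FORMAT/AD field with Number=R and an observed max_number of 3:

    def field_dimensions(category, name, vcf_number, max_number):
        dims = ["variants"]
        if category == "FORMAT":
            dims.append("samples")
        if max_number > 1:
            dims.append({"R": "alleles", "A": "alt_alleles", "G": "genotypes"}
                        .get(vcf_number, f"{category}_{name}_dim"))
        return dims

    print(field_dimensions("FORMAT", "AD", "R", 3))  # ['variants', 'samples', 'alleles']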
@@ -1250,19 +1332,34 @@ class ZarrColumnSpec:

          See https://github.com/pystatgen/bio2zarr/discussions/74
          """
-         dt = np.dtype(self.dtype)
          # Default is to not shuffle, because autoshuffle isn't recognised
          # by many Zarr implementations, and shuffling can lead to worse
          # performance in some cases anyway. Turning on shuffle should be a
          # deliberate choice.
          shuffle = numcodecs.Blosc.NOSHUFFLE
-         if dt.itemsize == 1:
-             # Any 1 byte field gets BITSHUFFLE by default
+         if self.name == "call_genotype" and self.dtype == "i1":
+             # call_genotype gets BITSHUFFLE by default as it gets
+             # significantly better compression (at a cost of slower
+             # decoding)
+             shuffle = numcodecs.Blosc.BITSHUFFLE
+         elif self.dtype == "bool":
              shuffle = numcodecs.Blosc.BITSHUFFLE
+
          self.compressor["shuffle"] = shuffle

+     @property
+     def variant_chunk_nbytes(self):
+         """
+         Returns the nbytes for a single variant chunk of this array.
+         """
+         chunk_items = self.chunks[0]
+         for size in self.shape[1:]:
+             chunk_items *= size
+         dt = np.dtype(self.dtype)
+         return chunk_items * dt.itemsize
+

- ZARR_SCHEMA_FORMAT_VERSION = "0.2"
+ ZARR_SCHEMA_FORMAT_VERSION = "0.3"


  @dataclasses.dataclass
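variant_chunk_nbytes is simply the element count of one variant chunk times the itemsize; a worked example with made-up numbers (10,000-variant chunks of a (variants, samples, alleles) int16 array with 5,000 samples and 4 alleles):

    import numpy as np

    chunks, shape, dtype = [10_000, 5_000], [1_000_000, 5_000, 4], "i2"
    chunk_items = chunks[0]
    for size in shape[1:]:
        chunk_items *= size
    print(chunk_items * np.dtype(dtype).itemsize)  # 400000000 bytes, ~400 MB per chunk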
@@ -1271,11 +1368,10 @@ class VcfZarrSchema:
      samples_chunk_size: int
      variants_chunk_size: int
      dimensions: list
-     sample_id: list
-     contig_id: list
-     contig_length: list
-     filter_id: list
-     columns: dict
+     samples: list
+     contigs: list
+     filters: list
+     fields: dict

      def asdict(self):
          return dataclasses.asdict(self)
@@ -1291,8 +1387,11 @@ class VcfZarrSchema:
                  f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
              )
          ret = VcfZarrSchema(**d)
-         ret.columns = {
-             key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
+         ret.samples = [Sample(**sd) for sd in d["samples"]]
+         ret.contigs = [Contig(**sd) for sd in d["contigs"]]
+         ret.filters = [Filter(**sd) for sd in d["filters"]]
+         ret.fields = {
+             key: ZarrColumnSpec(**value) for key, value in d["fields"].items()
          }
          return ret

@@ -1313,6 +1412,16 @@ class VcfZarrSchema:
              f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
          )

+         def spec_from_field(field, variable_name=None):
+             return ZarrColumnSpec.from_field(
+                 field,
+                 num_samples=n,
+                 num_variants=m,
+                 samples_chunk_size=samples_chunk_size,
+                 variants_chunk_size=variants_chunk_size,
+                 variable_name=variable_name,
+             )
+
          def fixed_field_spec(
              name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
          ):
@@ -1326,97 +1435,58 @@ class VcfZarrSchema:
                  chunks=[variants_chunk_size],
              )

-         alt_col = icf.columns["ALT"]
+         alt_col = icf.fields["ALT"]
          max_alleles = alt_col.vcf_field.summary.max_number + 1
-         num_filters = len(icf.metadata.filters)

-         # # FIXME get dtype from lookup table
          colspecs = [
              fixed_field_spec(
                  name="variant_contig",
-                 dtype="i2", # FIXME
+                 dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
              ),
              fixed_field_spec(
                  name="variant_filter",
                  dtype="bool",
-                 shape=(m, num_filters),
+                 shape=(m, icf.metadata.num_filters),
                  dimensions=["variants", "filters"],
              ),
              fixed_field_spec(
                  name="variant_allele",
                  dtype="str",
-                 shape=[m, max_alleles],
+                 shape=(m, max_alleles),
                  dimensions=["variants", "alleles"],
              ),
              fixed_field_spec(
-                 vcf_field="POS",
-                 name="variant_position",
-                 dtype="i4",
-             ),
-             fixed_field_spec(
-                 vcf_field=None,
                  name="variant_id",
                  dtype="str",
              ),
              fixed_field_spec(
-                 vcf_field=None,
                  name="variant_id_mask",
                  dtype="bool",
              ),
-             fixed_field_spec(
-                 vcf_field="QUAL",
-                 name="variant_quality",
-                 dtype="f4",
-             ),
          ]
+         name_map = {field.full_name: field for field in icf.metadata.fields}
+
+         # Only two of the fixed fields have a direct one-to-one mapping.
+         colspecs.extend(
+             [
+                 spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
+                 spec_from_field(name_map["POS"], variable_name="variant_position"),
+             ]
+         )
+         colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])

          gt_field = None
-         for field in icf.metadata.fields:
-             if field.category == "fixed":
-                 continue
+         for field in icf.metadata.format_fields:
              if field.name == "GT":
                  gt_field = field
                  continue
-             shape = [m]
-             prefix = "variant_"
-             dimensions = ["variants"]
-             chunks = [variants_chunk_size]
-             if field.category == "FORMAT":
-                 prefix = "call_"
-                 shape.append(n)
-                 chunks.append(samples_chunk_size)
-                 dimensions.append("samples")
-             # TODO make an option to add in the empty extra dimension
-             if field.summary.max_number > 1:
-                 shape.append(field.summary.max_number)
-                 # TODO we should really be checking this to see if the named dimensions
-                 # are actually correct.
-                 if field.vcf_number == "R":
-                     dimensions.append("alleles")
-                 elif field.vcf_number == "A":
-                     dimensions.append("alt_alleles")
-                 elif field.vcf_number == "G":
-                     dimensions.append("genotypes")
-                 else:
-                     dimensions.append(f"{field.category}_{field.name}_dim")
-             variable_name = prefix + field.name
-             colspec = ZarrColumnSpec.new(
-                 vcf_field=field.full_name,
-                 name=variable_name,
-                 dtype=field.smallest_dtype(),
-                 shape=shape,
-                 chunks=chunks,
-                 dimensions=dimensions,
-                 description=field.description,
-             )
-             colspecs.append(colspec)
+             colspecs.append(spec_from_field(field))

          if gt_field is not None:
              ploidy = gt_field.summary.max_number - 1
              shape = [m, n]
              chunks = [variants_chunk_size, samples_chunk_size]
              dimensions = ["variants", "samples"]
-
              colspecs.append(
                  ZarrColumnSpec.new(
                      vcf_field=None,
@@ -1457,12 +1527,11 @@ class VcfZarrSchema:
              format_version=ZARR_SCHEMA_FORMAT_VERSION,
              samples_chunk_size=samples_chunk_size,
              variants_chunk_size=variants_chunk_size,
-             columns={col.name: col for col in colspecs},
+             fields={col.name: col for col in colspecs},
              dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
-             sample_id=icf.metadata.samples,
-             contig_id=icf.metadata.contig_names,
-             contig_length=icf.metadata.contig_lengths,
-             filter_id=icf.metadata.filters,
+             samples=icf.metadata.samples,
+             contigs=icf.metadata.contigs,
+             filters=icf.metadata.filters,
          )


@@ -1470,14 +1539,12 @@ class VcfZarr:
      def __init__(self, path):
          if not (path / ".zmetadata").exists():
              raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+         self.path = path
          self.root = zarr.open(path, mode="r")

-     def __repr__(self):
-         return repr(self.root)  # NEEDS TEST
-
      def summary_table(self):
          data = []
-         arrays = [(a.nbytes_stored, a) for _, a in self.root.arrays()]
+         arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
          arrays.sort(key=lambda x: x[0])
          for stored, array in reversed(arrays):
              d = {
@@ -1498,15 +1565,6 @@ class VcfZarr:
          return data


- @dataclasses.dataclass
- class EncodingWork:
-     func: callable = dataclasses.field(repr=False)
-     start: int
-     stop: int
-     columns: list[str]
-     memory: int = 0
-
-
  def parse_max_memory(max_memory):
      if max_memory is None:
          # Effectively unbounded
@@ -1517,67 +1575,299 @@ def parse_max_memory(max_memory):
      return max_memory


+ @dataclasses.dataclass
+ class VcfZarrPartition:
+     start: int
+     stop: int
+
+     @staticmethod
+     def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
+         num_chunks = int(np.ceil(num_records / chunk_size))
+         if max_chunks is not None:
+             num_chunks = min(num_chunks, max_chunks)
+         partitions = []
+         splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
+         for chunk_slice in splits:
+             start_chunk = int(chunk_slice[0])
+             stop_chunk = int(chunk_slice[-1]) + 1
+             start_index = start_chunk * chunk_size
+             stop_index = min(stop_chunk * chunk_size, num_records)
+             partitions.append(VcfZarrPartition(start_index, stop_index))
+         return partitions
+
+
+ VZW_METADATA_FORMAT_VERSION = "0.1"
+
+
+ @dataclasses.dataclass
+ class VcfZarrWriterMetadata:
+     format_version: str
+     icf_path: str
+     schema: VcfZarrSchema
+     dimension_separator: str
+     partitions: list
+     provenance: dict
+
+     def asdict(self):
+         return dataclasses.asdict(self)
+
+     @staticmethod
+     def fromdict(d):
+         if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
+             raise ValueError(
+                 "VcfZarrWriter format version mismatch: "
+                 f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
+             )
+         ret = VcfZarrWriterMetadata(**d)
+         ret.schema = VcfZarrSchema.fromdict(ret.schema)
+         ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
+         return ret
+
+
  class VcfZarrWriter:
-     def __init__(self, path, icf, schema, dimension_separator=None):
+     def __init__(self, path):
          self.path = pathlib.Path(path)
+         self.wip_path = self.path / "wip"
+         self.arrays_path = self.wip_path / "arrays"
+         self.partitions_path = self.wip_path / "partitions"
+         self.metadata = None
+         self.icf = None
+
+     @property
+     def schema(self):
+         return self.metadata.schema
+
+     @property
+     def num_partitions(self):
+         return len(self.metadata.partitions)
+
+     #######################
+     # init
+     #######################
+
+     def init(
+         self,
+         icf,
+         *,
+         target_num_partitions,
+         schema,
+         dimension_separator=None,
+         max_variant_chunks=None,
+     ):
          self.icf = icf
-         self.schema = schema
+         if self.path.exists():
+             raise ValueError("Zarr path already exists")  # NEEDS TEST
+         partitions = VcfZarrPartition.generate_partitions(
+             self.icf.num_records,
+             schema.variants_chunk_size,
+             target_num_partitions,
+             max_chunks=max_variant_chunks,
+         )
          # Default to using nested directories following the Zarr v3 default.
          # This seems to require version 2.17+ to work properly
-         self.dimension_separator = (
+         dimension_separator = (
              "/" if dimension_separator is None else dimension_separator
          )
+         self.metadata = VcfZarrWriterMetadata(
+             format_version=VZW_METADATA_FORMAT_VERSION,
+             icf_path=str(self.icf.path),
+             schema=schema,
+             dimension_separator=dimension_separator,
+             partitions=partitions,
+             # Bare minimum here for provenance - see comments above
+             provenance={"source": f"bio2zarr-{provenance.__version__}"},
+         )
+
+         self.path.mkdir()
          store = zarr.DirectoryStore(self.path)
-         self.root = zarr.group(store=store)
+         root = zarr.group(store=store)
+         root.attrs.update(
+             {
+                 "vcf_zarr_version": "0.2",
+                 "vcf_header": self.icf.vcf_header,
+                 "source": f"bio2zarr-{provenance.__version__}",
+             }
+         )
+         # Doing this syncronously - this is fine surely
+         self.encode_samples(root)
+         self.encode_filter_id(root)
+         self.encode_contig_id(root)

-     def init_array(self, variable):
+         self.wip_path.mkdir()
+         self.arrays_path.mkdir()
+         self.partitions_path.mkdir()
+         store = zarr.DirectoryStore(self.arrays_path)
+         root = zarr.group(store=store)
+
+         for column in self.schema.fields.values():
+             self.init_array(root, column, partitions[-1].stop)
+
+         logger.info("Writing WIP metadata")
+         with open(self.wip_path / "metadata.json", "w") as f:
+             json.dump(self.metadata.asdict(), f, indent=4)
+         return len(partitions)
+
+     def encode_samples(self, root):
+         if self.schema.samples != self.icf.metadata.samples:
+             raise ValueError(
+                 "Subsetting or reordering samples not supported currently"
+             )  # NEEDS TEST
+         array = root.array(
+             "sample_id",
+             [sample.id for sample in self.schema.samples],
+             dtype="str",
+             compressor=DEFAULT_ZARR_COMPRESSOR,
+             chunks=(self.schema.samples_chunk_size,),
+         )
+         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
+         logger.debug("Samples done")
+
+     def encode_contig_id(self, root):
+         array = root.array(
+             "contig_id",
+             [contig.id for contig in self.schema.contigs],
+             dtype="str",
+             compressor=DEFAULT_ZARR_COMPRESSOR,
+         )
+         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+         if all(contig.length is not None for contig in self.schema.contigs):
+             array = root.array(
+                 "contig_length",
+                 [contig.length for contig in self.schema.contigs],
+                 dtype=np.int64,
+                 compressor=DEFAULT_ZARR_COMPRESSOR,
+             )
+             array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+
+     def encode_filter_id(self, root):
+         # TODO need a way to store description also
+         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
+         array = root.array(
+             "filter_id",
+             [filt.id for filt in self.schema.filters],
+             dtype="str",
+             compressor=DEFAULT_ZARR_COMPRESSOR,
+         )
+         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
+
+     def init_array(self, root, variable, variants_dim_size):
          object_codec = None
          if variable.dtype == "O":
              object_codec = numcodecs.VLenUTF8()
-         a = self.root.empty(
-             "wip_" + variable.name,
-             shape=variable.shape,
+         shape = list(variable.shape)
+         # Truncate the variants dimension is max_variant_chunks was specified
+         shape[0] = variants_dim_size
+         a = root.empty(
+             variable.name,
+             shape=shape,
              chunks=variable.chunks,
              dtype=variable.dtype,
              compressor=numcodecs.get_codec(variable.compressor),
              filters=[numcodecs.get_codec(filt) for filt in variable.filters],
              object_codec=object_codec,
-             dimension_separator=self.dimension_separator,
+             dimension_separator=self.metadata.dimension_separator,
          )
-         # Dimension names are part of the spec in Zarr v3
-         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
-
-     def get_array(self, name):
-         return self.root["wip_" + name]
-
-     def finalise_array(self, variable_name):
-         source = self.path / ("wip_" + variable_name)
-         dest = self.path / variable_name
-         # Atomic swap
-         os.rename(source, dest)
-         logger.info(f"Finalised {variable_name}")
-
-     def encode_array_slice(self, column, start, stop):
-         source_col = self.icf.columns[column.vcf_field]
-         array = self.get_array(column.name)
-         ba = core.BufferedArray(array, start)
+         a.attrs.update(
+             {
+                 "description": variable.description,
+                 # Dimension names are part of the spec in Zarr v3
+                 "_ARRAY_DIMENSIONS": variable.dimensions,
+             }
+         )
+         logger.debug(f"Initialised {a}")
+
+     #######################
+     # encode_partition
+     #######################
+
+     def load_metadata(self):
+         if self.metadata is None:
+             with open(self.wip_path / "metadata.json") as f:
+                 self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
+             self.icf = IntermediateColumnarFormat(self.metadata.icf_path)
+
+     def partition_path(self, partition_index):
+         return self.partitions_path / f"p{partition_index}"
+
+     def wip_partition_path(self, partition_index):
+         return self.partitions_path / f"wip_p{partition_index}"
+
+     def wip_partition_array_path(self, partition_index, name):
+         return self.wip_partition_path(partition_index) / name
+
+     def partition_array_path(self, partition_index, name):
+         return self.partition_path(partition_index) / name
+
+     def encode_partition(self, partition_index):
+         self.load_metadata()
+         if partition_index < 0 or partition_index >= self.num_partitions:
+             raise ValueError(
+                 "Partition index must be in the range 0 <= index < num_partitions"
+             )
+         partition_path = self.wip_partition_path(partition_index)
+         partition_path.mkdir(exist_ok=True)
+         logger.info(f"Encoding partition {partition_index} to {partition_path}")
+
+         self.encode_id_partition(partition_index)
+         self.encode_filters_partition(partition_index)
+         self.encode_contig_partition(partition_index)
+         self.encode_alleles_partition(partition_index)
+         for col in self.schema.fields.values():
+             if col.vcf_field is not None:
+                 self.encode_array_partition(col, partition_index)
+         if "call_genotype" in self.schema.fields:
+             self.encode_genotypes_partition(partition_index)
+
+         final_path = self.partition_path(partition_index)
+         logger.info(f"Finalising {partition_index} at {final_path}")
+         if final_path.exists():
+             logger.warning(f"Removing existing partition at {final_path}")
+             shutil.rmtree(final_path)
+         os.rename(partition_path, final_path)
+
+     def init_partition_array(self, partition_index, name):
+         wip_path = self.wip_partition_array_path(partition_index, name)
+         # Create an empty array like the definition
+         src = self.arrays_path / name
+         # Overwrite any existing WIP files
+         shutil.copytree(src, wip_path, dirs_exist_ok=True)
+         array = zarr.open(wip_path)
+         logger.debug(f"Opened empty array {array} @ {wip_path}")
+         return array
+
+     def finalise_partition_array(self, partition_index, name):
+         logger.debug(f"Encoded {name} partition {partition_index}")
+
+     def encode_array_partition(self, column, partition_index):
+         array = self.init_partition_array(partition_index, column.name)
+
+         partition = self.metadata.partitions[partition_index]
+         ba = core.BufferedArray(array, partition.start)
+         source_col = self.icf.fields[column.vcf_field]
          sanitiser = source_col.sanitiser_factory(ba.buff.shape)

-         for value in source_col.iter_values(start, stop):
+         for value in source_col.iter_values(partition.start, partition.stop):
              # We write directly into the buffer in the sanitiser function
              # to make it easier to reason about dimension padding
              j = ba.next_buffer_row()
              sanitiser(ba.buff, j, value)
          ba.flush()
-         logger.debug(f"Encoded {column.name} slice {start}:{stop}")
+         self.finalise_partition_array(partition_index, column.name)
+
+     def encode_genotypes_partition(self, partition_index):
+         gt_array = self.init_partition_array(partition_index, "call_genotype")
+         gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
+         gt_phased_array = self.init_partition_array(
+             partition_index, "call_genotype_phased"
+         )

-     def encode_genotypes_slice(self, start, stop):
-         source_col = self.icf.columns["FORMAT/GT"]
-         gt = core.BufferedArray(self.get_array("call_genotype"), start)
-         gt_mask = core.BufferedArray(self.get_array("call_genotype_mask"), start)
-         gt_phased = core.BufferedArray(self.get_array("call_genotype_phased"), start)
+         partition = self.metadata.partitions[partition_index]
+         gt = core.BufferedArray(gt_array, partition.start)
+         gt_mask = core.BufferedArray(gt_mask_array, partition.start)
+         gt_phased = core.BufferedArray(gt_phased_array, partition.start)

-         for value in source_col.iter_values(start, stop):
+         source_col = self.icf.fields["FORMAT/GT"]
+         for value in source_col.iter_values(partition.start, partition.stop):
              j = gt.next_buffer_row()
              sanitise_value_int_2d(gt.buff, j, value[:, :-1])
              j = gt_phased.next_buffer_row()
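VcfZarrPartition.generate_partitions splits work on variant-chunk boundaries, so every partition except possibly the last covers a whole number of chunks. A worked example with made-up numbers (10,500 records, 1,000-record chunks, 4 requested partitions):

    import numpy as np

    num_records, chunk_size, num_partitions = 10_500, 1_000, 4
    num_chunks = int(np.ceil(num_records / chunk_size))  # 11
    splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
    bounds = [(int(s[0]) * chunk_size, min((int(s[-1]) + 1) * chunk_size, num_records))
              for s in splits]
    print(bounds)  # [(0, 3000), (3000, 6000), (6000, 9000), (9000, 10500)]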
@@ -1589,29 +1879,40 @@ class VcfZarrWriter:
1589
1879
  gt.flush()
1590
1880
  gt_phased.flush()
1591
1881
  gt_mask.flush()
1592
- logger.debug(f"Encoded GT slice {start}:{stop}")
1593
1882
 
1594
- def encode_alleles_slice(self, start, stop):
1595
- ref_col = self.icf.columns["REF"]
1596
- alt_col = self.icf.columns["ALT"]
1597
- alleles = core.BufferedArray(self.get_array("variant_allele"), start)
1883
+ self.finalise_partition_array(partition_index, "call_genotype")
1884
+ self.finalise_partition_array(partition_index, "call_genotype_mask")
1885
+ self.finalise_partition_array(partition_index, "call_genotype_phased")
1886
+
1887
+ def encode_alleles_partition(self, partition_index):
1888
+ array_name = "variant_allele"
1889
+ alleles_array = self.init_partition_array(partition_index, array_name)
1890
+ partition = self.metadata.partitions[partition_index]
1891
+ alleles = core.BufferedArray(alleles_array, partition.start)
1892
+ ref_col = self.icf.fields["REF"]
1893
+ alt_col = self.icf.fields["ALT"]
1598
1894
 
1599
1895
  for ref, alt in zip(
1600
- ref_col.iter_values(start, stop), alt_col.iter_values(start, stop)
1896
+ ref_col.iter_values(partition.start, partition.stop),
1897
+ alt_col.iter_values(partition.start, partition.stop),
1601
1898
  ):
1602
1899
  j = alleles.next_buffer_row()
1603
1900
  alleles.buff[j, :] = STR_FILL
1604
1901
  alleles.buff[j, 0] = ref[0]
1605
1902
  alleles.buff[j, 1 : 1 + len(alt)] = alt
1606
1903
  alleles.flush()
1607
- logger.debug(f"Encoded alleles slice {start}:{stop}")
1608
1904
 
1609
- def encode_id_slice(self, start, stop):
1610
- col = self.icf.columns["ID"]
1611
- vid = core.BufferedArray(self.get_array("variant_id"), start)
1612
- vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)
1905
+ self.finalise_partition_array(partition_index, array_name)
1613
1906
 
1614
- for value in col.iter_values(start, stop):
1907
+ def encode_id_partition(self, partition_index):
1908
+ vid_array = self.init_partition_array(partition_index, "variant_id")
1909
+ vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
1910
+ partition = self.metadata.partitions[partition_index]
1911
+ vid = core.BufferedArray(vid_array, partition.start)
1912
+ vid_mask = core.BufferedArray(vid_mask_array, partition.start)
1913
+ col = self.icf.fields["ID"]
1914
+
1915
+ for value in col.iter_values(partition.start, partition.stop):
1615
1916
  j = vid.next_buffer_row()
1616
1917
  k = vid_mask.next_buffer_row()
1617
1918
  assert j == k
@@ -1623,13 +1924,19 @@ class VcfZarrWriter:
1623
1924
  vid_mask.buff[j] = True
1624
1925
  vid.flush()
1625
1926
  vid_mask.flush()
1626
- logger.debug(f"Encoded ID slice {start}:{stop}")
1627
1927
 
1628
- def encode_filters_slice(self, lookup, start, stop):
1629
- col = self.icf.columns["FILTERS"]
1630
- var_filter = core.BufferedArray(self.get_array("variant_filter"), start)
1928
+ self.finalise_partition_array(partition_index, "variant_id")
1929
+ self.finalise_partition_array(partition_index, "variant_id_mask")
1930
+
1931
+ def encode_filters_partition(self, partition_index):
1932
+ lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
1933
+ array_name = "variant_filter"
1934
+ array = self.init_partition_array(partition_index, array_name)
1935
+ partition = self.metadata.partitions[partition_index]
1936
+ var_filter = core.BufferedArray(array, partition.start)
1631
1937
 
1632
- for value in col.iter_values(start, stop):
1938
+ col = self.icf.fields["FILTERS"]
1939
+ for value in col.iter_values(partition.start, partition.stop):
1633
1940
  j = var_filter.next_buffer_row()
1634
1941
  var_filter.buff[j] = False
1635
1942
  for f in value:
@@ -1637,16 +1944,21 @@ class VcfZarrWriter:
1637
1944
  var_filter.buff[j, lookup[f]] = True
1638
1945
  except KeyError:
1639
1946
  raise ValueError(
1640
- f"Filter '{f}' was not defined " f"in the header."
1947
+ f"Filter '{f}' was not defined in the header."
1641
1948
  ) from None
1642
1949
  var_filter.flush()
1643
- logger.debug(f"Encoded FILTERS slice {start}:{stop}")
1644
1950
 
1645
- def encode_contig_slice(self, lookup, start, stop):
1646
- col = self.icf.columns["CHROM"]
1647
- contig = core.BufferedArray(self.get_array("variant_contig"), start)
1951
+ self.finalise_partition_array(partition_index, array_name)
1952
+
1953
+ def encode_contig_partition(self, partition_index):
1954
+ lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
1955
+ array_name = "variant_contig"
1956
+ array = self.init_partition_array(partition_index, array_name)
1957
+ partition = self.metadata.partitions[partition_index]
1958
+ contig = core.BufferedArray(array, partition.start)
1959
+ col = self.icf.fields["CHROM"]
1648
1960
 
1649
- for value in col.iter_values(start, stop):
1961
+ for value in col.iter_values(partition.start, partition.stop):
1650
1962
  j = contig.next_buffer_row()
1651
1963
  # Note: because we are using the indexes to define the lookups
1652
1964
  # and we always have an index, it seems that we the contig lookup
@@ -1654,161 +1966,131 @@ class VcfZarrWriter:
1654
1966
  # here, please do open an issue with a reproducible example!
1655
1967
  contig.buff[j] = lookup[value[0]]
1656
1968
  contig.flush()
1657
- logger.debug(f"Encoded CHROM slice {start}:{stop}")
1658
1969
 
1659
- def encode_samples(self):
1660
- if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
1661
- raise ValueError(
1662
- "Subsetting or reordering samples not supported currently"
1663
- ) # NEEDS TEST
1664
- array = self.root.array(
1665
- "sample_id",
1666
- self.schema.sample_id,
1667
- dtype="str",
1668
- compressor=DEFAULT_ZARR_COMPRESSOR,
1669
- chunks=(self.schema.samples_chunk_size,),
1670
- )
1671
- array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
1672
- logger.debug("Samples done")
1673
-
1674
- def encode_contig_id(self):
1675
- array = self.root.array(
1676
- "contig_id",
1677
- self.schema.contig_id,
1678
- dtype="str",
1679
- compressor=DEFAULT_ZARR_COMPRESSOR,
1680
- )
1681
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1682
- if self.schema.contig_length is not None:
1683
- array = self.root.array(
1684
- "contig_length",
1685
- self.schema.contig_length,
1686
- dtype=np.int64,
1687
- compressor=DEFAULT_ZARR_COMPRESSOR,
1970
+ self.finalise_partition_array(partition_index, array_name)
1971
+
1972
+ #######################
1973
+ # finalise
1974
+ #######################
1975
+
1976
+ def finalise_array(self, name):
1977
+ logger.info(f"Finalising {name}")
1978
+ final_path = self.path / name
1979
+ if final_path.exists():
1980
+ # NEEDS TEST
1981
+ raise ValueError(f"Array {name} already exists")
1982
+ for partition in range(self.num_partitions):
1983
+ # Move all the files in partition dir to dest dir
1984
+ src = self.partition_array_path(partition, name)
1985
+ if not src.exists():
1986
+ # Needs test
1987
+ raise ValueError(f"Partition {partition} of {name} does not exist")
1988
+ dest = self.arrays_path / name
1989
+ # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
1990
+ chunk_files = [
1991
+ path for path in src.iterdir() if not path.name.startswith(".")
1992
+ ]
1993
+ # TODO check for a count of then number of files. If we require a
1994
+ # dimension_separator of "/" then we could make stronger assertions
1995
+ # here, as we'd always have num_variant_chunks
1996
+ logger.debug(
1997
+ f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
1688
1998
  )
1689
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1690
- return {v: j for j, v in enumerate(self.schema.contig_id)}
1999
+ for chunk_file in chunk_files:
2000
+ os.rename(chunk_file, dest / chunk_file.name)
2001
+ # Finally, once all the chunks have moved into the arrays dir,
2002
+ # we move it out of wip
2003
+ os.rename(self.arrays_path / name, self.path / name)
2004
+ core.update_progress(1)
1691
2005
 
1692
- def encode_filter_id(self):
1693
- array = self.root.array(
1694
- "filter_id",
1695
- self.schema.filter_id,
1696
- dtype="str",
1697
- compressor=DEFAULT_ZARR_COMPRESSOR,
1698
- )
1699
- array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
1700
- return {v: j for j, v in enumerate(self.schema.filter_id)}
2006
+ def finalise(self, show_progress=False):
2007
+ self.load_metadata()
1701
2008
 
1702
- def init(self):
1703
- self.root.attrs["vcf_zarr_version"] = "0.2"
1704
- self.root.attrs["vcf_header"] = self.icf.vcf_header
1705
- self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
1706
- for column in self.schema.columns.values():
1707
- self.init_array(column)
2009
+ logger.info("Scanning {self.num_partitions} partitions")
2010
+ missing = []
2011
+ # TODO may need a progress bar here
2012
+ for partition_id in range(self.num_partitions):
2013
+ if not self.partition_path(partition_id).exists():
2014
+ missing.append(partition_id)
2015
+ if len(missing) > 0:
2016
+ raise FileNotFoundError(f"Partitions not encoded: {missing}")
1708
2017
 
1709
- def finalise(self):
2018
+ progress_config = core.ProgressConfig(
2019
+ total=len(self.schema.fields),
2020
+ title="Finalise",
2021
+ units="array",
2022
+ show=show_progress,
2023
+ )
2024
+ # NOTE: it's not clear that adding more workers will make this quicker,
2025
+ # as it's just going to be causing contention on the file system.
2026
+ # Something to check empirically in some deployments.
2027
+ # FIXME we're just using worker_processes=0 here to hook into the
2028
+ # SynchronousExecutor which is intended for testing purposes so
2029
+ # that we get test coverage. Should fix this either by allowing
2030
+ # for multiple workers, or making a standard wrapper for tqdm
2031
+ # that allows us to have a consistent look and feel.
2032
+ with core.ParallelWorkManager(0, progress_config) as pwm:
2033
+ for name in self.schema.fields:
2034
+ pwm.submit(self.finalise_array, name)
2035
+ logger.debug(f"Removing {self.wip_path}")
2036
+ shutil.rmtree(self.wip_path)
2037
+ logger.info("Consolidating Zarr metadata")
1710
2038
  zarr.consolidate_metadata(self.path)
1711
2039
 
1712
- def encode(
1713
- self,
1714
- worker_processes=1,
1715
- max_v_chunks=None,
1716
- show_progress=False,
1717
- max_memory=None,
1718
- ):
1719
- max_memory = parse_max_memory(max_memory)
2040
+ ######################
2041
+ # encode_all_partitions
2042
+ ######################
1720
2043
 
1721
- # TODO this will move into the setup logic later when we're making it possible
1722
- # to split the work by slice
1723
- num_slices = max(1, worker_processes * 4)
1724
- # Using POS arbitrarily to get the array slices
1725
- slices = core.chunk_aligned_slices(
1726
- self.get_array("variant_position"), num_slices, max_chunks=max_v_chunks
2044
+ def get_max_encoding_memory(self):
2045
+ """
2046
+ Return the approximate maximum memory used to encode a variant chunk.
2047
+ """
2048
+ max_encoding_mem = max(
2049
+ col.variant_chunk_nbytes for col in self.schema.fields.values()
1727
2050
  )
1728
- truncated = slices[-1][-1]
1729
- for array in self.root.values():
1730
- if array.attrs["_ARRAY_DIMENSIONS"][0] == "variants":
1731
- shape = list(array.shape)
1732
- shape[0] = truncated
1733
- array.resize(shape)
1734
-
1735
- total_bytes = 0
1736
- encoding_memory_requirements = {}
1737
- for col in self.schema.columns.values():
1738
- array = self.get_array(col.name)
1739
- # NOTE!! this is bad, we're potentially creating quite a large
1740
- # numpy array for basically nothing. We can compute this.
1741
- variant_chunk_size = array.blocks[0].nbytes
1742
- encoding_memory_requirements[col.name] = variant_chunk_size
1743
- logger.debug(
1744
- f"{col.name} requires at least {display_size(variant_chunk_size)} "
1745
- f"per worker"
2051
+ gt_mem = 0
2052
+ if "call_genotype" in self.schema.fields:
2053
+ encoded_together = [
2054
+ "call_genotype",
2055
+ "call_genotype_phased",
2056
+ "call_genotype_mask",
2057
+ ]
2058
+ gt_mem = sum(
2059
+ self.schema.fields[col].variant_chunk_nbytes for col in encoded_together
1746
2060
  )
1747
- total_bytes += array.nbytes
1748
-
1749
- filter_id_map = self.encode_filter_id()
1750
- contig_id_map = self.encode_contig_id()
1751
-
1752
- work = []
1753
- for start, stop in slices:
1754
- for col in self.schema.columns.values():
1755
- if col.vcf_field is not None:
1756
- f = functools.partial(self.encode_array_slice, col)
1757
- work.append(
1758
- EncodingWork(
1759
- f,
1760
- start,
1761
- stop,
1762
- [col.name],
1763
- encoding_memory_requirements[col.name],
1764
- )
1765
- )
1766
- work.append(
1767
- EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
1768
- )
1769
- work.append(
1770
- EncodingWork(
1771
- self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
1772
- )
1773
- )
1774
- work.append(
1775
- EncodingWork(
1776
- functools.partial(self.encode_filters_slice, filter_id_map),
1777
- start,
1778
- stop,
1779
- ["variant_filter"],
1780
- )
2061
+ return max(max_encoding_mem, gt_mem)
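The bound is the largest single encoding task: either the biggest per-field variant chunk, or the three genotype arrays taken together, since those are written in one pass. A rough sketch of the same arithmetic with made-up chunk sizes:

    # Hypothetical per-field variant-chunk sizes, in bytes.
    chunk_nbytes = {
        "variant_position": 40_000,
        "call_genotype": 20_000_000,
        "call_genotype_phased": 2_500_000,
        "call_genotype_mask": 2_500_000,
    }
    gt_fields = ("call_genotype", "call_genotype_phased", "call_genotype_mask")
    per_worker = max(
        max(chunk_nbytes.values()),               # largest single field: 20 MB
        sum(chunk_nbytes[f] for f in gt_fields),  # genotype trio together: 25 MB
    )
    # per_worker == 25_000_000; the genotype arrays dominate here.
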
2062
+
2063
+ def encode_all_partitions(
2064
+ self, *, worker_processes=1, show_progress=False, max_memory=None
2065
+ ):
2066
+ max_memory = parse_max_memory(max_memory)
2067
+ self.load_metadata()
2068
+ num_partitions = self.num_partitions
2069
+ per_worker_memory = self.get_max_encoding_memory()
2070
+ logger.info(
2071
+ f"Encoding Zarr over {num_partitions} partitions with "
2072
+ f"{worker_processes} workers and {display_size(per_worker_memory)} "
2073
+ "per worker"
2074
+ )
2075
+ # Each partition requires per_worker_memory bytes, so to prevent more than
2076
+ # max_memory being used, we clamp the number of workers
2077
+ max_num_workers = max_memory // per_worker_memory
2078
+ if max_num_workers < worker_processes:
2079
+ logger.warning(
2080
+ f"Limiting number of workers to {max_num_workers} to "
2081
+ f"keep within specified memory budget of {display_size(max_memory)}"
1781
2082
  )
1782
- work.append(
1783
- EncodingWork(
1784
- functools.partial(self.encode_contig_slice, contig_id_map),
1785
- start,
1786
- stop,
1787
- ["variant_contig"],
1788
- )
2083
+ if max_num_workers <= 0:
2084
+ raise ValueError(
2085
+ f"Insufficient memory to encode a partition:"
2086
+ f"{display_size(per_worker_memory)} > {display_size(max_memory)}"
1789
2087
  )
1790
- if "call_genotype" in self.schema.columns:
1791
- variables = [
1792
- "call_genotype",
1793
- "call_genotype_phased",
1794
- "call_genotype_mask",
1795
- ]
1796
- gt_memory = sum(
1797
- encoding_memory_requirements[name] for name in variables
1798
- )
1799
- work.append(
1800
- EncodingWork(
1801
- self.encode_genotypes_slice, start, stop, variables, gt_memory
1802
- )
1803
- )
2088
+ num_workers = min(max_num_workers, worker_processes)
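Concretely, the budget divides into whole per-worker shares, so a generous worker count is clamped rather than overshooting max_memory. A small worked example with illustrative numbers:

    per_worker_memory = 25_000_000    # from get_max_encoding_memory()
    max_memory = 100_000_000          # roughly a 100 MB budget
    worker_processes = 8              # requested workers

    max_num_workers = max_memory // per_worker_memory      # 4
    num_workers = min(max_num_workers, worker_processes)   # clamped from 8 to 4
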
1804
2089
 
1805
- # Fail early if we can't fit a particular column into memory
1806
- for wp in work:
1807
- if wp.memory > max_memory:
1808
- raise ValueError(
1809
- f"Insufficient memory for {wp.columns}: "
1810
- f"{display_size(wp.memory)} > {display_size(max_memory)}"
1811
- )
2090
+ total_bytes = 0
2091
+ for col in self.schema.fields.values():
2092
+ # Open the array definition to get the total size
2093
+ total_bytes += zarr.open(self.arrays_path / col.name).nbytes
1812
2094
 
1813
2095
  progress_config = core.ProgressConfig(
1814
2096
  total=total_bytes,
@@ -1816,54 +2098,9 @@ class VcfZarrWriter:
1816
2098
  units="B",
1817
2099
  show=show_progress,
1818
2100
  )
1819
-
1820
- used_memory = 0
1821
- # We need to keep some bounds on the queue size or the memory bounds algorithm
1822
- # below doesn't really work.
1823
- max_queued = 4 * max(1, worker_processes)
1824
- encoded_slices = collections.Counter()
1825
-
1826
- with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
1827
- future = pwm.submit(self.encode_samples)
1828
- future_to_work = {future: EncodingWork(None, 0, 0, [])}
1829
-
1830
- def service_completed_futures():
1831
- nonlocal used_memory
1832
-
1833
- completed = pwm.wait_for_completed()
1834
- for future in completed:
1835
- wp_done = future_to_work.pop(future)
1836
- used_memory -= wp_done.memory
1837
- logger.debug(
1838
- f"Complete {wp_done}: used mem={display_size(used_memory)}"
1839
- )
1840
- for column in wp_done.columns:
1841
- encoded_slices[column] += 1
1842
- if encoded_slices[column] == len(slices):
1843
- # Do this synchronously for simplicity. Should be
1844
- # fine as the workers will probably be busy with
1845
- # large encode tasks most of the time.
1846
- self.finalise_array(column)
1847
-
1848
- for wp in work:
1849
- while (
1850
- used_memory + wp.memory > max_memory
1851
- or len(future_to_work) > max_queued
1852
- ):
1853
- logger.debug(
1854
- f"Wait: mem_required={used_memory + wp.memory} "
1855
- f"max_mem={max_memory} queued={len(future_to_work)} "
1856
- f"max_queued={max_queued}"
1857
- )
1858
- service_completed_futures()
1859
- future = pwm.submit(wp.func, wp.start, wp.stop)
1860
- used_memory += wp.memory
1861
- logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
1862
- future_to_work[future] = wp
1863
-
1864
- logger.debug("All work submitted")
1865
- while len(future_to_work) > 0:
1866
- service_completed_futures()
2101
+ with core.ParallelWorkManager(num_workers, progress_config) as pwm:
2102
+ for partition_index in range(num_partitions):
2103
+ pwm.submit(self.encode_partition, partition_index)
1867
2104
 
1868
2105
 
1869
2106
  def mkschema(if_path, out):
@@ -1878,13 +2115,48 @@ def encode(
1878
2115
  schema_path=None,
1879
2116
  variants_chunk_size=None,
1880
2117
  samples_chunk_size=None,
1881
- max_v_chunks=None,
2118
+ max_variant_chunks=None,
1882
2119
  dimension_separator=None,
1883
2120
  max_memory=None,
1884
2121
  worker_processes=1,
1885
2122
  show_progress=False,
1886
2123
  ):
1887
- icf = IntermediateColumnarFormat(if_path)
2124
+ # Rough heuristic to split work up enough to keep utilisation high
2125
+ target_num_partitions = max(1, worker_processes * 4)
2126
+ encode_init(
2127
+ if_path,
2128
+ zarr_path,
2129
+ target_num_partitions,
2130
+ schema_path=schema_path,
2131
+ variants_chunk_size=variants_chunk_size,
2132
+ samples_chunk_size=samples_chunk_size,
2133
+ max_variant_chunks=max_variant_chunks,
2134
+ dimension_separator=dimension_separator,
2135
+ )
2136
+ vzw = VcfZarrWriter(zarr_path)
2137
+ vzw.encode_all_partitions(
2138
+ worker_processes=worker_processes,
2139
+ show_progress=show_progress,
2140
+ max_memory=max_memory,
2141
+ )
2142
+ vzw.finalise(show_progress)
2143
+
2144
+
2145
+ def encode_init(
2146
+ icf_path,
2147
+ zarr_path,
2148
+ target_num_partitions,
2149
+ *,
2150
+ schema_path=None,
2151
+ variants_chunk_size=None,
2152
+ samples_chunk_size=None,
2153
+ max_variant_chunks=None,
2154
+ dimension_separator=None,
2155
+ max_memory=None,
2156
+ worker_processes=1,
2157
+ show_progress=False,
2158
+ ):
2159
+ icf = IntermediateColumnarFormat(icf_path)
1888
2160
  if schema_path is None:
1889
2161
  schema = VcfZarrSchema.generate(
1890
2162
  icf,
@@ -1900,18 +2172,25 @@ def encode(
1900
2172
  with open(schema_path) as f:
1901
2173
  schema = VcfZarrSchema.fromjson(f.read())
1902
2174
  zarr_path = pathlib.Path(zarr_path)
1903
- if zarr_path.exists():
1904
- logger.warning(f"Deleting existing {zarr_path}")
1905
- shutil.rmtree(zarr_path)
1906
- vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
1907
- vzw.init()
1908
- vzw.encode(
1909
- max_v_chunks=max_v_chunks,
1910
- worker_processes=worker_processes,
1911
- max_memory=max_memory,
1912
- show_progress=show_progress,
2175
+ vzw = VcfZarrWriter(zarr_path)
2176
+ vzw.init(
2177
+ icf,
2178
+ target_num_partitions=target_num_partitions,
2179
+ schema=schema,
2180
+ dimension_separator=dimension_separator,
2181
+ max_variant_chunks=max_variant_chunks,
1913
2182
  )
1914
- vzw.finalise()
2183
+ return vzw.num_partitions, vzw.get_max_encoding_memory()
2184
+
2185
+
2186
+ def encode_partition(zarr_path, partition):
2187
+ writer = VcfZarrWriter(zarr_path)
2188
+ writer.encode_partition(partition)
2189
+
2190
+
2191
+ def encode_finalise(zarr_path, show_progress=False):
2192
+ writer = VcfZarrWriter(zarr_path)
2193
+ writer.finalise(show_progress=show_progress)
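Splitting the pipeline into encode_init, encode_partition, and encode_finalise means the per-partition work can be scheduled externally, one task per partition, instead of inside a single process. A sketch of driving the three steps by hand, with illustrative paths and an illustrative partition target:

    from bio2zarr.vcf import encode_init, encode_partition, encode_finalise

    # Set up the Zarr layout once; returns the number of partitions and an
    # estimate of the memory needed to encode one partition.
    num_partitions, per_partition_mem = encode_init(
        "sample.icf", "sample.vcf.zarr", target_num_partitions=16
    )

    # Each partition is independent, so these calls could run on separate nodes.
    for partition in range(num_partitions):
        encode_partition("sample.vcf.zarr", partition)

    # Single final step: finalise each array and consolidate the Zarr metadata.
    encode_finalise("sample.vcf.zarr", show_progress=True)
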
1915
2194
 
1916
2195
 
1917
2196
  def convert(
@@ -2121,7 +2400,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
2121
2400
  assert pos[start_index] == first_pos
2122
2401
  vcf = cyvcf2.VCF(vcf_path)
2123
2402
  if show_progress:
2124
- iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2403
+ iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2125
2404
  else:
2126
2405
  iterator = vcf
2127
2406
  for j, row in enumerate(iterator, start_index):