bio2zarr 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +126 -25
- bio2zarr/core.py +31 -3
- bio2zarr/vcf.py +754 -475
- bio2zarr/vcf_utils.py +25 -16
- bio2zarr-0.0.9.dist-info/METADATA +363 -0
- bio2zarr-0.0.9.dist-info/RECORD +16 -0
- bio2zarr-0.0.5.dist-info/METADATA +0 -33
- bio2zarr-0.0.5.dist-info/RECORD +0 -16
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/top_level.txt +0 -0
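Most of the change is in bio2zarr/vcf.py, which replaces the slice-based encoding path with explicitly partitioned steps on both the intermediate columnar format (ICF) and Zarr sides. As orientation only, here is a minimal sketch of the VcfZarrPartition.generate_partitions helper that the diff below adds; the record counts are invented and the import path is inferred from the diff, not taken from the package documentation:

    # Hypothetical usage of the partition helper introduced in vcf.py (see diff below).
    from bio2zarr.vcf import VcfZarrPartition

    # Split 1,000,000 records into chunk-aligned partitions of 10,000-record chunks,
    # spread over (at most) 8 partitions.
    partitions = VcfZarrPartition.generate_partitions(
        num_records=1_000_000, chunk_size=10_000, num_partitions=8
    )
    # Each VcfZarrPartition carries chunk-aligned start/stop record indexes.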
bio2zarr/vcf.py
CHANGED
@@ -1,17 +1,17 @@
 import collections
 import contextlib
 import dataclasses
-import functools
 import json
 import logging
 import math
 import os
+import os.path
 import pathlib
 import pickle
 import shutil
 import sys
 import tempfile
-from typing import Any
+from typing import Any
 
 import cyvcf2
 import humanfriendly
@@ -111,9 +111,6 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"
 
-    # TODO add method here to choose a good set compressor and
-    # filters default here for this field.
-
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ class VcfField:
         if self.vcf_type == "Float":
             ret = "f4"
         elif self.vcf_type == "Integer":
-
-
-
-
-
-
-
+            if not math.isfinite(s.max_value):
+                # All missing values; use i1. Note we should have some API to
+                # check more explicitly for missingness:
+                # https://github.com/sgkit-dev/bio2zarr/issues/131
+                ret = "i1"
+            else:
+                ret = core.min_int_dtype(s.min_value, s.max_value)
         elif self.vcf_type == "Flag":
             ret = "bool"
         elif self.vcf_type == "Character":
@@ -147,25 +144,41 @@ class VcfPartition:
     num_records: int = -1
 
 
-ICF_METADATA_FORMAT_VERSION = "0.
+ICF_METADATA_FORMAT_VERSION = "0.3"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
 )
 
 
+@dataclasses.dataclass
+class Contig:
+    id: str
+    length: int = None
+
+
+@dataclasses.dataclass
+class Sample:
+    id: str
+
+
+@dataclasses.dataclass
+class Filter:
+    id: str
+    description: str = ""
+
+
 @dataclasses.dataclass
 class IcfMetadata:
     samples: list
-
-    contig_record_counts: dict
+    contigs: list
     filters: list
     fields: list
     partitions: list = None
-    contig_lengths: list = None
     format_version: str = None
    compressor: dict = None
     column_chunk_size: int = None
     provenance: dict = None
+    num_records: int = -1
 
     @property
     def info_fields(self):
@@ -184,8 +197,12 @@ class IcfMetadata:
         return fields
 
     @property
-    def
-        return
+    def num_contigs(self):
+        return len(self.contigs)
+
+    @property
+    def num_filters(self):
+        return len(self.filters)
 
     @staticmethod
     def fromdict(d):
@@ -194,18 +211,23 @@ class IcfMetadata:
                 "Intermediate columnar metadata format version mismatch: "
                 f"{d['format_version']} != {ICF_METADATA_FORMAT_VERSION}"
             )
-        fields = [VcfField.fromdict(fd) for fd in d["fields"]]
         partitions = [VcfPartition(**pd) for pd in d["partitions"]]
         for p in partitions:
             p.region = vcf_utils.Region(**p.region)
         d = d.copy()
-        d["fields"] = fields
         d["partitions"] = partitions
+        d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
+        d["samples"] = [Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
     def asdict(self):
         return dataclasses.asdict(self)
 
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
+
 
 def fixed_vcf_field_definitions():
     def make_field_def(name, vcf_type, vcf_number):
@@ -233,15 +255,22 @@ def fixed_vcf_field_definitions():
 def scan_vcf(path, target_num_partitions):
     with vcf_utils.IndexedVcf(path) as indexed_vcf:
         vcf = indexed_vcf.vcf
-        filters = [
-
-
-            if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str)
-
+        filters = []
+        pass_index = -1
+        for h in vcf.header_iter():
+            if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str):
+                try:
+                    description = h["Description"].strip('"')
+                except KeyError:
+                    description = ""
+                if h["ID"] == "PASS":
+                    pass_index = len(filters)
+                filters.append(Filter(h["ID"], description))
+
         # Ensure PASS is the first filter if present
-        if
-        filters.
-        filters.insert(0,
+        if pass_index > 0:
+            pass_filter = filters.pop(pass_index)
+            filters.insert(0, pass_filter)
 
         fields = fixed_vcf_field_definitions()
         for h in vcf.header_iter():
@@ -252,18 +281,22 @@ def scan_vcf(path, target_num_partitions):
                 field.vcf_number = "."
             fields.append(field)
 
+        try:
+            contig_lengths = vcf.seqlens
+        except AttributeError:
+            contig_lengths = [None for _ in vcf.seqnames]
+
         metadata = IcfMetadata(
-            samples=vcf.samples,
-
-
+            samples=[Sample(sample_id) for sample_id in vcf.samples],
+            contigs=[
+                Contig(contig_id, length)
+                for contig_id, length in zip(vcf.seqnames, contig_lengths)
+            ],
             filters=filters,
             fields=fields,
             partitions=[],
+            num_records=sum(indexed_vcf.contig_record_counts().values()),
         )
-        try:
-            metadata.contig_lengths = vcf.seqlens
-        except AttributeError:
-            pass
 
         regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
         logger.info(
@@ -282,21 +315,6 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def check_overlap(partitions):
-    for i in range(1, len(partitions)):
-        prev_partition = partitions[i - 1]
-        current_partition = partitions[i]
-        if (
-            prev_partition.region.contig == current_partition.region.contig
-            and prev_partition.region.end > current_partition.region.start
-        ):
-            raise ValueError(
-                f"Multiple VCFs have the region "
-                f"{prev_partition.region.contig}:{prev_partition.region.start}-"
-                f"{current_partition.region.end}"
-            )
-
-
 def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
@@ -325,27 +343,30 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # We just take the first header, assuming the others
     # are compatible.
     all_partitions = []
-
+    total_records = 0
     for metadata, _ in results:
-
-
-
-        metadata.
+        for partition in metadata.partitions:
+            logger.debug(f"Scanned partition {partition}")
+            all_partitions.append(partition)
+        total_records += metadata.num_records
+        metadata.num_records = 0
+        metadata.partitions = []
 
     icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
         if metadata != icf_metadata:
             raise ValueError("Incompatible VCF chunks")
 
-
+    # Note: this will be infinity here if any of the chunks has an index
+    # that doesn't keep track of the number of records per-contig
+    icf_metadata.num_records = total_records
 
     # Sort by contig (in the order they appear in the header) first,
     # then by start coordinate
-    contig_index_map = {contig: j for j, contig in enumerate(metadata.
+    contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
-    check_overlap(all_partitions)
     icf_metadata.partitions = all_partitions
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
@@ -443,7 +464,7 @@ def sanitise_value_float_2d(buff, j, value):
 
 def sanitise_int_array(value, ndmin, dtype):
     if isinstance(value, tuple):
-        value = [VCF_INT_MISSING if x is None else x for x in value]  #
+        value = [VCF_INT_MISSING if x is None else x for x in value]  # NEEDS TEST
     value = np.array(value, ndmin=ndmin, copy=False)
     value[value == VCF_INT_MISSING] = -1
     value[value == VCF_INT_FILL] = -2
@@ -736,9 +757,9 @@ class IcfFieldWriter:
     transformer: VcfValueTransformer
     compressor: Any
     max_buffered_bytes: int
-    buff:
+    buff: list[Any] = dataclasses.field(default_factory=list)
     buffered_bytes: int = 0
-    chunk_index:
+    chunk_index: list[int] = dataclasses.field(default_factory=lambda: [0])
     num_records: int = 0
 
     def append(self, val):
@@ -842,19 +863,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             self.metadata = IcfMetadata.fromdict(json.load(f))
         with open(self.path / "header.txt") as f:
             self.vcf_header = f.read()
-
         self.compressor = numcodecs.get_codec(self.metadata.compressor)
-        self.
+        self.fields = {}
         partition_num_records = [
             partition.num_records for partition in self.metadata.partitions
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
         for field in self.metadata.fields:
-            self.
+            self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
-            f"records={self.num_records},
+            f"records={self.num_records}, fields={self.num_fields})"
         )
 
     def __repr__(self):
@@ -865,17 +885,17 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         )
 
     def __getitem__(self, key):
-        return self.
+        return self.fields[key]
 
     def __iter__(self):
-        return iter(self.
+        return iter(self.fields)
 
     def __len__(self):
-        return len(self.
+        return len(self.fields)
 
     def summary_table(self):
         data = []
-        for name, col in self.
+        for name, col in self.fields.items():
             summary = col.vcf_field.summary
             d = {
                 "name": name,
@@ -891,9 +911,9 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data
 
-    @
+    @property
     def num_records(self):
-        return
+        return self.metadata.num_records
 
     @property
     def num_partitions(self):
@@ -904,8 +924,42 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         return len(self.metadata.samples)
 
     @property
-    def
-        return len(self.
+    def num_fields(self):
+        return len(self.fields)
+
+
+@dataclasses.dataclass
+class IcfPartitionMetadata:
+    num_records: int
+    last_position: int
+    field_summaries: dict
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
+
+    @staticmethod
+    def fromdict(d):
+        md = IcfPartitionMetadata(**d)
+        for k, v in md.field_summaries.items():
+            md.field_summaries[k] = VcfFieldSummary.fromdict(v)
+        return md
+
+
+def check_overlapping_partitions(partitions):
+    for i in range(1, len(partitions)):
+        prev_region = partitions[i - 1].region
+        current_region = partitions[i].region
+        if prev_region.contig == current_region.contig:
+            assert prev_region.end is not None
+            # Regions are *inclusive*
+            if prev_region.end >= current_region.start:
+                raise ValueError(
+                    f"Overlapping VCF regions in partitions {i - 1} and {i}: "
+                    f"{prev_region} and {current_region}"
+                )
 
 
 class IntermediateColumnarFormatWriter:
@@ -979,11 +1033,8 @@ class IntermediateColumnarFormatWriter:
         not_found = []
         for j in range(self.num_partitions):
             try:
-                with open(self.wip_path / f"p{j}
-
-                    for k, v in summary["field_summaries"].items():
-                        summary["field_summaries"][k] = VcfFieldSummary.fromdict(v)
-                    summaries.append(summary)
+                with open(self.wip_path / f"p{j}.json") as f:
+                    summaries.append(IcfPartitionMetadata.fromdict(json.load(f)))
             except FileNotFoundError:
                 not_found.append(j)
         if len(not_found) > 0:
@@ -1000,7 +1051,7 @@ class IntermediateColumnarFormatWriter:
 
     def process_partition(self, partition_index):
         self.load_metadata()
-        summary_path = self.wip_path / f"p{partition_index}
+        summary_path = self.wip_path / f"p{partition_index}.json"
         # If someone is rewriting a summary path (for whatever reason), make sure it
         # doesn't look like it's already been completed.
         # NOTE to do this properly we probably need to take a lock on this file - but
@@ -1021,6 +1072,7 @@ class IntermediateColumnarFormatWriter:
             else:
                 format_fields.append(field)
 
+        last_position = None
         with IcfPartitionWriter(
             self.metadata,
             self.path,
@@ -1030,6 +1082,7 @@ class IntermediateColumnarFormatWriter:
             num_records = 0
             for variant in ivcf.variants(partition.region):
                 num_records += 1
+                last_position = variant.POS
                 tcw.append("CHROM", variant.CHROM)
                 tcw.append("POS", variant.POS)
                 tcw.append("QUAL", variant.QUAL)
@@ -1054,37 +1107,32 @@ class IntermediateColumnarFormatWriter:
                     f"flushing buffers"
                 )
 
-        partition_metadata =
-
-
-
+        partition_metadata = IcfPartitionMetadata(
+            num_records=num_records,
+            last_position=last_position,
+            field_summaries=tcw.field_summaries,
+        )
         with open(summary_path, "w") as f:
-
+            f.write(partition_metadata.asjson())
         logger.info(
-            f"Finish p{partition_index} {partition.vcf_path}__{partition.region}
-            f"{num_records} records"
+            f"Finish p{partition_index} {partition.vcf_path}__{partition.region} "
+            f"{num_records} records last_pos={last_position}"
         )
 
-    def
-        self,
-        start,
-        stop,
-        *,
-        worker_processes=1,
-        show_progress=False,
-    ):
+    def explode(self, *, worker_processes=1, show_progress=False):
         self.load_metadata()
-
-
-
-
-
+        num_records = self.metadata.num_records
+        if np.isinf(num_records):
+            logger.warning(
+                "Total records unknown, cannot show progress; "
+                "reindex VCFs with bcftools index to fix"
+            )
             num_records = None
-
+        num_fields = len(self.metadata.fields)
         num_samples = len(self.metadata.samples)
         logger.info(
-            f"Exploding
-            f"partitions={
+            f"Exploding fields={num_fields} samples={num_samples}; "
+            f"partitions={self.num_partitions} "
             f"variants={'unknown' if num_records is None else num_records}"
         )
         progress_config = core.ProgressConfig(
@@ -1094,48 +1142,43 @@ class IntermediateColumnarFormatWriter:
             show=show_progress,
         )
         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            for j in range(
+            for j in range(self.num_partitions):
                 pwm.submit(self.process_partition, j)
 
-    def
-        self.load_metadata()
-        return self.process_partition_slice(
-            0,
-            self.num_partitions,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-        )
-
-    def explode_partition(self, partition, *, show_progress=False, worker_processes=1):
+    def explode_partition(self, partition):
         self.load_metadata()
         if partition < 0 or partition >= self.num_partitions:
             raise ValueError(
                 "Partition index must be in the range 0 <= index < num_partitions"
             )
-
-            partition,
-            partition + 1,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-        )
+        self.process_partition(partition)
 
     def finalise(self):
         self.load_metadata()
         partition_summaries = self.load_partition_summaries()
         total_records = 0
         for index, summary in enumerate(partition_summaries):
-            partition_records = summary
+            partition_records = summary.num_records
             self.metadata.partitions[index].num_records = partition_records
+            self.metadata.partitions[index].region.end = summary.last_position
             total_records += partition_records
-
+        if not np.isinf(self.metadata.num_records):
+            # Note: this is just telling us that there's a bug in the
+            # index based record counting code, but it doesn't actually
+            # matter much. We may want to just make this a warning if
+            # we hit regular problems.
+            assert total_records == self.metadata.num_records
+        self.metadata.num_records = total_records
+
+        check_overlapping_partitions(self.metadata.partitions)
 
         for field in self.metadata.fields:
             for summary in partition_summaries:
-                field.summary.update(summary
+                field.summary.update(summary.field_summaries[field.full_name])
 
         logger.info("Finalising metadata")
         with open(self.path / "metadata.json", "w") as f:
-
+            f.write(self.metadata.asjson())
 
         logger.debug("Removing WIP directory")
         shutil.rmtree(self.wip_path)
@@ -1186,14 +1229,9 @@ def explode_init(
     )
 
 
-
-# work done syncronously and so we can get test coverage on it. Should find a
-# better way to do this.
-def explode_partition(icf_path, partition, *, show_progress=False, worker_processes=1):
+def explode_partition(icf_path, partition):
     writer = IntermediateColumnarFormatWriter(icf_path)
-    writer.explode_partition(
-        partition, show_progress=show_progress, worker_processes=worker_processes
-    )
+    writer.explode_partition(partition)
 
 
 def explode_finalise(icf_path):
@@ -1242,6 +1280,50 @@ class ZarrColumnSpec:
         spec._choose_compressor_settings()
         return spec
 
+    @staticmethod
+    def from_field(
+        vcf_field,
+        *,
+        num_variants,
+        num_samples,
+        variants_chunk_size,
+        samples_chunk_size,
+        variable_name=None,
+    ):
+        shape = [num_variants]
+        prefix = "variant_"
+        dimensions = ["variants"]
+        chunks = [variants_chunk_size]
+        if vcf_field.category == "FORMAT":
+            prefix = "call_"
+            shape.append(num_samples)
+            chunks.append(samples_chunk_size)
+            dimensions.append("samples")
+        if variable_name is None:
+            variable_name = prefix + vcf_field.name
+        # TODO make an option to add in the empty extra dimension
+        if vcf_field.summary.max_number > 1:
+            shape.append(vcf_field.summary.max_number)
+            # TODO we should really be checking this to see if the named dimensions
+            # are actually correct.
+            if vcf_field.vcf_number == "R":
+                dimensions.append("alleles")
+            elif vcf_field.vcf_number == "A":
+                dimensions.append("alt_alleles")
+            elif vcf_field.vcf_number == "G":
+                dimensions.append("genotypes")
+            else:
+                dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
+        return ZarrColumnSpec.new(
+            vcf_field=vcf_field.full_name,
+            name=variable_name,
+            dtype=vcf_field.smallest_dtype(),
+            shape=shape,
+            chunks=chunks,
+            dimensions=dimensions,
+            description=vcf_field.description,
+        )
+
     def _choose_compressor_settings(self):
         """
         Choose compressor and filter settings based on the size and
@@ -1250,19 +1332,34 @@ class ZarrColumnSpec:
 
         See https://github.com/pystatgen/bio2zarr/discussions/74
         """
-        dt = np.dtype(self.dtype)
         # Default is to not shuffle, because autoshuffle isn't recognised
         # by many Zarr implementations, and shuffling can lead to worse
         # performance in some cases anyway. Turning on shuffle should be a
         # deliberate choice.
         shuffle = numcodecs.Blosc.NOSHUFFLE
-        if
-            #
+        if self.name == "call_genotype" and self.dtype == "i1":
+            # call_genotype gets BITSHUFFLE by default as it gets
+            # significantly better compression (at a cost of slower
+            # decoding)
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+        elif self.dtype == "bool":
             shuffle = numcodecs.Blosc.BITSHUFFLE
+
         self.compressor["shuffle"] = shuffle
 
+    @property
+    def variant_chunk_nbytes(self):
+        """
+        Returns the nbytes for a single variant chunk of this array.
+        """
+        chunk_items = self.chunks[0]
+        for size in self.shape[1:]:
+            chunk_items *= size
+        dt = np.dtype(self.dtype)
+        return chunk_items * dt.itemsize
+
 
-ZARR_SCHEMA_FORMAT_VERSION = "0.
+ZARR_SCHEMA_FORMAT_VERSION = "0.3"
 
 
 @dataclasses.dataclass
@@ -1271,11 +1368,10 @@ class VcfZarrSchema:
     samples_chunk_size: int
     variants_chunk_size: int
     dimensions: list
-
-
-
-
-    columns: dict
+    samples: list
+    contigs: list
+    filters: list
+    fields: dict
 
     def asdict(self):
         return dataclasses.asdict(self)
@@ -1291,8 +1387,11 @@ class VcfZarrSchema:
                 f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
             )
         ret = VcfZarrSchema(**d)
-        ret.
-
+        ret.samples = [Sample(**sd) for sd in d["samples"]]
+        ret.contigs = [Contig(**sd) for sd in d["contigs"]]
+        ret.filters = [Filter(**sd) for sd in d["filters"]]
+        ret.fields = {
+            key: ZarrColumnSpec(**value) for key, value in d["fields"].items()
         }
         return ret
 
@@ -1313,6 +1412,16 @@ class VcfZarrSchema:
             f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
         )
 
+        def spec_from_field(field, variable_name=None):
+            return ZarrColumnSpec.from_field(
+                field,
+                num_samples=n,
+                num_variants=m,
+                samples_chunk_size=samples_chunk_size,
+                variants_chunk_size=variants_chunk_size,
+                variable_name=variable_name,
+            )
+
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
         ):
@@ -1326,97 +1435,58 @@ class VcfZarrSchema:
                 chunks=[variants_chunk_size],
             )
 
-        alt_col = icf.
+        alt_col = icf.fields["ALT"]
         max_alleles = alt_col.vcf_field.summary.max_number + 1
-        num_filters = len(icf.metadata.filters)
 
-        # # FIXME get dtype from lookup table
         colspecs = [
             fixed_field_spec(
                 name="variant_contig",
-                dtype=
+                dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
             ),
             fixed_field_spec(
                 name="variant_filter",
                 dtype="bool",
-                shape=(m, num_filters),
+                shape=(m, icf.metadata.num_filters),
                 dimensions=["variants", "filters"],
             ),
             fixed_field_spec(
                 name="variant_allele",
                 dtype="str",
-                shape=
+                shape=(m, max_alleles),
                 dimensions=["variants", "alleles"],
             ),
             fixed_field_spec(
-                vcf_field="POS",
-                name="variant_position",
-                dtype="i4",
-            ),
-            fixed_field_spec(
-                vcf_field=None,
                 name="variant_id",
                 dtype="str",
             ),
             fixed_field_spec(
-                vcf_field=None,
                 name="variant_id_mask",
                 dtype="bool",
             ),
-            fixed_field_spec(
-                vcf_field="QUAL",
-                name="variant_quality",
-                dtype="f4",
-            ),
         ]
+        name_map = {field.full_name: field for field in icf.metadata.fields}
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        colspecs.extend(
+            [
+                spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
+                spec_from_field(name_map["POS"], variable_name="variant_position"),
+            ]
+        )
+        colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])
 
         gt_field = None
-        for field in icf.metadata.
-            if field.category == "fixed":
-                continue
+        for field in icf.metadata.format_fields:
             if field.name == "GT":
                 gt_field = field
                 continue
-
-            prefix = "variant_"
-            dimensions = ["variants"]
-            chunks = [variants_chunk_size]
-            if field.category == "FORMAT":
-                prefix = "call_"
-                shape.append(n)
-                chunks.append(samples_chunk_size)
-                dimensions.append("samples")
-            # TODO make an option to add in the empty extra dimension
-            if field.summary.max_number > 1:
-                shape.append(field.summary.max_number)
-                # TODO we should really be checking this to see if the named dimensions
-                # are actually correct.
-                if field.vcf_number == "R":
-                    dimensions.append("alleles")
-                elif field.vcf_number == "A":
-                    dimensions.append("alt_alleles")
-                elif field.vcf_number == "G":
-                    dimensions.append("genotypes")
-                else:
-                    dimensions.append(f"{field.category}_{field.name}_dim")
-            variable_name = prefix + field.name
-            colspec = ZarrColumnSpec.new(
-                vcf_field=field.full_name,
-                name=variable_name,
-                dtype=field.smallest_dtype(),
-                shape=shape,
-                chunks=chunks,
-                dimensions=dimensions,
-                description=field.description,
-            )
-            colspecs.append(colspec)
+            colspecs.append(spec_from_field(field))
 
         if gt_field is not None:
             ploidy = gt_field.summary.max_number - 1
             shape = [m, n]
             chunks = [variants_chunk_size, samples_chunk_size]
             dimensions = ["variants", "samples"]
-
             colspecs.append(
                 ZarrColumnSpec.new(
                     vcf_field=None,
@@ -1457,12 +1527,11 @@ class VcfZarrSchema:
             format_version=ZARR_SCHEMA_FORMAT_VERSION,
             samples_chunk_size=samples_chunk_size,
             variants_chunk_size=variants_chunk_size,
-
+            fields={col.name: col for col in colspecs},
             dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
-
-
-
-            filter_id=icf.metadata.filters,
+            samples=icf.metadata.samples,
+            contigs=icf.metadata.contigs,
+            filters=icf.metadata.filters,
         )
 
 
@@ -1470,14 +1539,12 @@ class VcfZarr:
     def __init__(self, path):
         if not (path / ".zmetadata").exists():
             raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+        self.path = path
         self.root = zarr.open(path, mode="r")
 
-    def __repr__(self):
-        return repr(self.root)  # NEEDS TEST
-
     def summary_table(self):
         data = []
-        arrays = [(a.
+        arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
         arrays.sort(key=lambda x: x[0])
         for stored, array in reversed(arrays):
             d = {
@@ -1498,15 +1565,6 @@ class VcfZarr:
         return data
 
 
-@dataclasses.dataclass
-class EncodingWork:
-    func: callable = dataclasses.field(repr=False)
-    start: int
-    stop: int
-    columns: list[str]
-    memory: int = 0
-
-
 def parse_max_memory(max_memory):
     if max_memory is None:
         # Effectively unbounded
@@ -1517,67 +1575,299 @@ def parse_max_memory(max_memory):
     return max_memory
 
 
+@dataclasses.dataclass
+class VcfZarrPartition:
+    start: int
+    stop: int
+
+    @staticmethod
+    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
+        num_chunks = int(np.ceil(num_records / chunk_size))
+        if max_chunks is not None:
+            num_chunks = min(num_chunks, max_chunks)
+        partitions = []
+        splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
+        for chunk_slice in splits:
+            start_chunk = int(chunk_slice[0])
+            stop_chunk = int(chunk_slice[-1]) + 1
+            start_index = start_chunk * chunk_size
+            stop_index = min(stop_chunk * chunk_size, num_records)
+            partitions.append(VcfZarrPartition(start_index, stop_index))
+        return partitions
+
+
+VZW_METADATA_FORMAT_VERSION = "0.1"
+
+
+@dataclasses.dataclass
+class VcfZarrWriterMetadata:
+    format_version: str
+    icf_path: str
+    schema: VcfZarrSchema
+    dimension_separator: str
+    partitions: list
+    provenance: dict
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @staticmethod
+    def fromdict(d):
+        if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
+            raise ValueError(
+                "VcfZarrWriter format version mismatch: "
+                f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
+            )
+        ret = VcfZarrWriterMetadata(**d)
+        ret.schema = VcfZarrSchema.fromdict(ret.schema)
+        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
+        return ret
+
+
 class VcfZarrWriter:
-    def __init__(self, path
+    def __init__(self, path):
         self.path = pathlib.Path(path)
+        self.wip_path = self.path / "wip"
+        self.arrays_path = self.wip_path / "arrays"
+        self.partitions_path = self.wip_path / "partitions"
+        self.metadata = None
+        self.icf = None
+
+    @property
+    def schema(self):
+        return self.metadata.schema
+
+    @property
+    def num_partitions(self):
+        return len(self.metadata.partitions)
+
+    #######################
+    # init
+    #######################
+
+    def init(
+        self,
+        icf,
+        *,
+        target_num_partitions,
+        schema,
+        dimension_separator=None,
+        max_variant_chunks=None,
+    ):
         self.icf = icf
-        self.
+        if self.path.exists():
+            raise ValueError("Zarr path already exists")  # NEEDS TEST
+        partitions = VcfZarrPartition.generate_partitions(
+            self.icf.num_records,
+            schema.variants_chunk_size,
+            target_num_partitions,
+            max_chunks=max_variant_chunks,
+        )
         # Default to using nested directories following the Zarr v3 default.
         # This seems to require version 2.17+ to work properly
-
+        dimension_separator = (
             "/" if dimension_separator is None else dimension_separator
         )
+        self.metadata = VcfZarrWriterMetadata(
+            format_version=VZW_METADATA_FORMAT_VERSION,
+            icf_path=str(self.icf.path),
+            schema=schema,
+            dimension_separator=dimension_separator,
+            partitions=partitions,
+            # Bare minimum here for provenance - see comments above
+            provenance={"source": f"bio2zarr-{provenance.__version__}"},
+        )
+
+        self.path.mkdir()
         store = zarr.DirectoryStore(self.path)
-
+        root = zarr.group(store=store)
+        root.attrs.update(
+            {
+                "vcf_zarr_version": "0.2",
+                "vcf_header": self.icf.vcf_header,
+                "source": f"bio2zarr-{provenance.__version__}",
+            }
+        )
+        # Doing this syncronously - this is fine surely
+        self.encode_samples(root)
+        self.encode_filter_id(root)
+        self.encode_contig_id(root)
 
-
+        self.wip_path.mkdir()
+        self.arrays_path.mkdir()
+        self.partitions_path.mkdir()
+        store = zarr.DirectoryStore(self.arrays_path)
+        root = zarr.group(store=store)
+
+        for column in self.schema.fields.values():
+            self.init_array(root, column, partitions[-1].stop)
+
+        logger.info("Writing WIP metadata")
+        with open(self.wip_path / "metadata.json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+        return len(partitions)
+
+    def encode_samples(self, root):
+        if self.schema.samples != self.icf.metadata.samples:
+            raise ValueError(
+                "Subsetting or reordering samples not supported currently"
+            )  # NEEDS TEST
+        array = root.array(
+            "sample_id",
+            [sample.id for sample in self.schema.samples],
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+            chunks=(self.schema.samples_chunk_size,),
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
+        logger.debug("Samples done")
+
+    def encode_contig_id(self, root):
+        array = root.array(
+            "contig_id",
+            [contig.id for contig in self.schema.contigs],
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+        if all(contig.length is not None for contig in self.schema.contigs):
+            array = root.array(
+                "contig_length",
+                [contig.length for contig in self.schema.contigs],
+                dtype=np.int64,
+                compressor=DEFAULT_ZARR_COMPRESSOR,
+            )
+            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+
+    def encode_filter_id(self, root):
+        # TODO need a way to store description also
+        # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
+        array = root.array(
+            "filter_id",
+            [filt.id for filt in self.schema.filters],
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
+
+    def init_array(self, root, variable, variants_dim_size):
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
-
-
-
+        shape = list(variable.shape)
+        # Truncate the variants dimension is max_variant_chunks was specified
+        shape[0] = variants_dim_size
+        a = root.empty(
+            variable.name,
+            shape=shape,
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
-            dimension_separator=self.dimension_separator,
+            dimension_separator=self.metadata.dimension_separator,
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def
-
-
-
+        a.attrs.update(
+            {
+                "description": variable.description,
+                # Dimension names are part of the spec in Zarr v3
+                "_ARRAY_DIMENSIONS": variable.dimensions,
+            }
+        )
+        logger.debug(f"Initialised {a}")
+
+    #######################
+    # encode_partition
+    #######################
+
+    def load_metadata(self):
+        if self.metadata is None:
+            with open(self.wip_path / "metadata.json") as f:
+                self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
+            self.icf = IntermediateColumnarFormat(self.metadata.icf_path)
+
+    def partition_path(self, partition_index):
+        return self.partitions_path / f"p{partition_index}"
+
+    def wip_partition_path(self, partition_index):
+        return self.partitions_path / f"wip_p{partition_index}"
+
+    def wip_partition_array_path(self, partition_index, name):
+        return self.wip_partition_path(partition_index) / name
+
+    def partition_array_path(self, partition_index, name):
+        return self.partition_path(partition_index) / name
+
+    def encode_partition(self, partition_index):
+        self.load_metadata()
+        if partition_index < 0 or partition_index >= self.num_partitions:
+            raise ValueError(
+                "Partition index must be in the range 0 <= index < num_partitions"
+            )
+        partition_path = self.wip_partition_path(partition_index)
+        partition_path.mkdir(exist_ok=True)
+        logger.info(f"Encoding partition {partition_index} to {partition_path}")
+
+        self.encode_id_partition(partition_index)
+        self.encode_filters_partition(partition_index)
+        self.encode_contig_partition(partition_index)
+        self.encode_alleles_partition(partition_index)
+        for col in self.schema.fields.values():
+            if col.vcf_field is not None:
+                self.encode_array_partition(col, partition_index)
+        if "call_genotype" in self.schema.fields:
+            self.encode_genotypes_partition(partition_index)
+
+        final_path = self.partition_path(partition_index)
+        logger.info(f"Finalising {partition_index} at {final_path}")
+        if final_path.exists():
+            logger.warning(f"Removing existing partition at {final_path}")
+            shutil.rmtree(final_path)
+        os.rename(partition_path, final_path)
+
+    def init_partition_array(self, partition_index, name):
+        wip_path = self.wip_partition_array_path(partition_index, name)
+        # Create an empty array like the definition
+        src = self.arrays_path / name
+        # Overwrite any existing WIP files
+        shutil.copytree(src, wip_path, dirs_exist_ok=True)
+        array = zarr.open(wip_path)
+        logger.debug(f"Opened empty array {array} @ {wip_path}")
+        return array
+
+    def finalise_partition_array(self, partition_index, name):
+        logger.debug(f"Encoded {name} partition {partition_index}")
+
+    def encode_array_partition(self, column, partition_index):
+        array = self.init_partition_array(partition_index, column.name)
+
+        partition = self.metadata.partitions[partition_index]
+        ba = core.BufferedArray(array, partition.start)
+        source_col = self.icf.fields[column.vcf_field]
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
 
-        for value in source_col.iter_values(start, stop):
+        for value in source_col.iter_values(partition.start, partition.stop):
             # We write directly into the buffer in the sanitiser function
             # to make it easier to reason about dimension padding
             j = ba.next_buffer_row()
             sanitiser(ba.buff, j, value)
         ba.flush()
-
+        self.finalise_partition_array(partition_index, column.name)
+
+    def encode_genotypes_partition(self, partition_index):
+        gt_array = self.init_partition_array(partition_index, "call_genotype")
+        gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
+        gt_phased_array = self.init_partition_array(
+            partition_index, "call_genotype_phased"
+        )
 
-
-
-
-
-        gt_phased = core.BufferedArray(self.get_array("call_genotype_phased"), start)
+        partition = self.metadata.partitions[partition_index]
+        gt = core.BufferedArray(gt_array, partition.start)
+        gt_mask = core.BufferedArray(gt_mask_array, partition.start)
+        gt_phased = core.BufferedArray(gt_phased_array, partition.start)
 
-
+        source_col = self.icf.fields["FORMAT/GT"]
+        for value in source_col.iter_values(partition.start, partition.stop):
             j = gt.next_buffer_row()
             sanitise_value_int_2d(gt.buff, j, value[:, :-1])
             j = gt_phased.next_buffer_row()
@@ -1589,29 +1879,40 @@ class VcfZarrWriter:
         gt.flush()
         gt_phased.flush()
         gt_mask.flush()
-        logger.debug(f"Encoded GT slice {start}:{stop}")
 
-
-
-
-
+        self.finalise_partition_array(partition_index, "call_genotype")
+        self.finalise_partition_array(partition_index, "call_genotype_mask")
+        self.finalise_partition_array(partition_index, "call_genotype_phased")
+
+    def encode_alleles_partition(self, partition_index):
+        array_name = "variant_allele"
+        alleles_array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        alleles = core.BufferedArray(alleles_array, partition.start)
+        ref_col = self.icf.fields["REF"]
+        alt_col = self.icf.fields["ALT"]
 
         for ref, alt in zip(
-            ref_col.iter_values(start, stop),
+            ref_col.iter_values(partition.start, partition.stop),
+            alt_col.iter_values(partition.start, partition.stop),
         ):
             j = alleles.next_buffer_row()
             alleles.buff[j, :] = STR_FILL
             alleles.buff[j, 0] = ref[0]
             alleles.buff[j, 1 : 1 + len(alt)] = alt
         alleles.flush()
-        logger.debug(f"Encoded alleles slice {start}:{stop}")
 
-
-        col = self.icf.columns["ID"]
-        vid = core.BufferedArray(self.get_array("variant_id"), start)
-        vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)
+        self.finalise_partition_array(partition_index, array_name)
 
-
+    def encode_id_partition(self, partition_index):
+        vid_array = self.init_partition_array(partition_index, "variant_id")
+        vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
+        partition = self.metadata.partitions[partition_index]
+        vid = core.BufferedArray(vid_array, partition.start)
+        vid_mask = core.BufferedArray(vid_mask_array, partition.start)
+        col = self.icf.fields["ID"]
+
+        for value in col.iter_values(partition.start, partition.stop):
             j = vid.next_buffer_row()
             k = vid_mask.next_buffer_row()
             assert j == k
@@ -1623,13 +1924,19 @@ class VcfZarrWriter:
             vid_mask.buff[j] = True
         vid.flush()
         vid_mask.flush()
-        logger.debug(f"Encoded ID slice {start}:{stop}")
 
-
-
-
+        self.finalise_partition_array(partition_index, "variant_id")
+        self.finalise_partition_array(partition_index, "variant_id_mask")
+
+    def encode_filters_partition(self, partition_index):
+        lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
+        array_name = "variant_filter"
+        array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        var_filter = core.BufferedArray(array, partition.start)
 
-
+        col = self.icf.fields["FILTERS"]
+        for value in col.iter_values(partition.start, partition.stop):
             j = var_filter.next_buffer_row()
             var_filter.buff[j] = False
             for f in value:
@@ -1637,16 +1944,21 @@ class VcfZarrWriter:
                     var_filter.buff[j, lookup[f]] = True
                 except KeyError:
                     raise ValueError(
-                        f"Filter '{f}' was not defined
+                        f"Filter '{f}' was not defined in the header."
                     ) from None
         var_filter.flush()
-        logger.debug(f"Encoded FILTERS slice {start}:{stop}")
 
-
-
-
+        self.finalise_partition_array(partition_index, array_name)
+
+    def encode_contig_partition(self, partition_index):
+        lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
+        array_name = "variant_contig"
+        array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        contig = core.BufferedArray(array, partition.start)
+        col = self.icf.fields["CHROM"]
 
-        for value in col.iter_values(start, stop):
+        for value in col.iter_values(partition.start, partition.stop):
             j = contig.next_buffer_row()
             # Note: because we are using the indexes to define the lookups
             # and we always have an index, it seems that we the contig lookup
@@ -1654,161 +1966,131 @@ class VcfZarrWriter:
             # here, please do open an issue with a reproducible example!
             contig.buff[j] = lookup[value[0]]
         contig.flush()
-        logger.debug(f"Encoded CHROM slice {start}:{stop}")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
-
-
-
-
-
-            compressor=DEFAULT_ZARR_COMPRESSOR,
+        self.finalise_partition_array(partition_index, array_name)
+
+    #######################
+    # finalise
+    #######################
+
+    def finalise_array(self, name):
+        logger.info(f"Finalising {name}")
+        final_path = self.path / name
+        if final_path.exists():
+            # NEEDS TEST
+            raise ValueError(f"Array {name} already exists")
+        for partition in range(self.num_partitions):
+            # Move all the files in partition dir to dest dir
+            src = self.partition_array_path(partition, name)
+            if not src.exists():
+                # Needs test
+                raise ValueError(f"Partition {partition} of {name} does not exist")
+            dest = self.arrays_path / name
+            # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
+            chunk_files = [
+                path for path in src.iterdir() if not path.name.startswith(".")
+            ]
+            # TODO check for a count of then number of files. If we require a
+            # dimension_separator of "/" then we could make stronger assertions
+            # here, as we'd always have num_variant_chunks
+            logger.debug(
+                f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
             )
-
-
+            for chunk_file in chunk_files:
+                os.rename(chunk_file, dest / chunk_file.name)
+        # Finally, once all the chunks have moved into the arrays dir,
+        # we move it out of wip
+        os.rename(self.arrays_path / name, self.path / name)
+        core.update_progress(1)
 
-    def
-
-            "filter_id",
-            self.schema.filter_id,
-            dtype="str",
-            compressor=DEFAULT_ZARR_COMPRESSOR,
-        )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
-        return {v: j for j, v in enumerate(self.schema.filter_id)}
+    def finalise(self, show_progress=False):
+        self.load_metadata()
 
-
-
-
-
-
-
+        logger.info("Scanning {self.num_partitions} partitions")
+        missing = []
+        # TODO may need a progress bar here
+        for partition_id in range(self.num_partitions):
+            if not self.partition_path(partition_id).exists():
+                missing.append(partition_id)
+        if len(missing) > 0:
+            raise FileNotFoundError(f"Partitions not encoded: {missing}")
 
-
+        progress_config = core.ProgressConfig(
+            total=len(self.schema.fields),
+            title="Finalise",
+            units="array",
+            show=show_progress,
+        )
+        # NOTE: it's not clear that adding more workers will make this quicker,
+        # as it's just going to be causing contention on the file system.
+        # Something to check empirically in some deployments.
+        # FIXME we're just using worker_processes=0 here to hook into the
+        # SynchronousExecutor which is intended for testing purposes so
+        # that we get test coverage. Should fix this either by allowing
+        # for multiple workers, or making a standard wrapper for tqdm
+        # that allows us to have a consistent look and feel.
+        with core.ParallelWorkManager(0, progress_config) as pwm:
+            for name in self.schema.fields:
+                pwm.submit(self.finalise_array, name)
+        logger.debug(f"Removing {self.wip_path}")
+        shutil.rmtree(self.wip_path)
+        logger.info("Consolidating Zarr metadata")
         zarr.consolidate_metadata(self.path)
 
-
-
-
-        max_v_chunks=None,
-        show_progress=False,
-        max_memory=None,
-    ):
-        max_memory = parse_max_memory(max_memory)
+    ######################
+    # encode_all_partitions
+    ######################
 
-
-
-
-
-
-        self.
+    def get_max_encoding_memory(self):
+        """
+        Return the approximate maximum memory used to encode a variant chunk.
+        """
+        max_encoding_mem = max(
+            col.variant_chunk_nbytes for col in self.schema.fields.values()
        )
-
-
-
-
-
-
-
-
-
-        for col in self.schema.columns.values():
-            array = self.get_array(col.name)
-            # NOTE!! this is bad, we're potentially creating quite a large
-            # numpy array for basically nothing. We can compute this.
-            variant_chunk_size = array.blocks[0].nbytes
-            encoding_memory_requirements[col.name] = variant_chunk_size
-            logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)} "
-                f"per worker"
+        gt_mem = 0
+        if "call_genotype" in self.schema.fields:
+            encoded_together = [
+                "call_genotype",
+                "call_genotype_phased",
+                "call_genotype_mask",
+            ]
+            gt_mem = sum(
+                self.schema.fields[col].variant_chunk_nbytes for col in encoded_together
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-        work.append(
-            EncodingWork(
-                self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
-            )
-        )
-        work.append(
-            EncodingWork(
-                functools.partial(self.encode_filters_slice, filter_id_map),
-                start,
-                stop,
-                ["variant_filter"],
-            )
+        return max(max_encoding_mem, gt_mem)
+
+    def encode_all_partitions(
+        self, *, worker_processes=1, show_progress=False, max_memory=None
+    ):
+        max_memory = parse_max_memory(max_memory)
+        self.load_metadata()
+        num_partitions = self.num_partitions
+        per_worker_memory = self.get_max_encoding_memory()
+        logger.info(
+            f"Encoding Zarr over {num_partitions} partitions with "
+            f"{worker_processes} workers and {display_size(per_worker_memory)} "
+            "per worker"
+        )
+        # Each partition requires per_worker_memory bytes, so to prevent more that
+        # max_memory being used, we clamp the number of workers
|
|
2077
|
+
max_num_workers = max_memory // per_worker_memory
|
|
2078
|
+
if max_num_workers < worker_processes:
|
|
2079
|
+
logger.warning(
|
|
2080
|
+
f"Limiting number of workers to {max_num_workers} to "
|
|
2081
|
+
f"keep within specified memory budget of {display_size(max_memory)}"
|
|
1781
2082
|
)
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
stop,
|
|
1787
|
-
["variant_contig"],
|
|
1788
|
-
)
|
|
2083
|
+
if max_num_workers <= 0:
|
|
2084
|
+
raise ValueError(
|
|
2085
|
+
f"Insufficient memory to encode a partition:"
|
|
2086
|
+
f"{display_size(per_worker_memory)} > {display_size(max_memory)}"
|
|
1789
2087
|
)
|
|
1790
|
-
|
|
1791
|
-
variables = [
|
|
1792
|
-
"call_genotype",
|
|
1793
|
-
"call_genotype_phased",
|
|
1794
|
-
"call_genotype_mask",
|
|
1795
|
-
]
|
|
1796
|
-
gt_memory = sum(
|
|
1797
|
-
encoding_memory_requirements[name] for name in variables
|
|
1798
|
-
)
|
|
1799
|
-
work.append(
|
|
1800
|
-
EncodingWork(
|
|
1801
|
-
self.encode_genotypes_slice, start, stop, variables, gt_memory
|
|
1802
|
-
)
|
|
1803
|
-
)
|
|
2088
|
+
num_workers = min(max_num_workers, worker_processes)
|
|
1804
2089
|
|
|
1805
|
-
|
|
1806
|
-
for
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
f"Insufficient memory for {wp.columns}: "
|
|
1810
|
-
f"{display_size(wp.memory)} > {display_size(max_memory)}"
|
|
1811
|
-
)
|
|
2090
|
+
total_bytes = 0
|
|
2091
|
+
for col in self.schema.fields.values():
|
|
2092
|
+
# Open the array definition to get the total size
|
|
2093
|
+
total_bytes += zarr.open(self.arrays_path / col.name).nbytes
|
|
1812
2094
|
|
|
1813
2095
|
progress_config = core.ProgressConfig(
|
|
1814
2096
|
total=total_bytes,
|
|
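The new get_max_encoding_memory / encode_all_partitions pair above budgets memory per worker rather than per column: a worker needs roughly one variant chunk's worth of buffer (with the three call_genotype arrays counted together), and the worker count is clamped to max_memory // per_worker_memory. A minimal standalone sketch of that arithmetic, with hypothetical sizes rather than values read from a real schema (humanfriendly is used here only because the module already depends on it):

    import humanfriendly

    def clamp_workers(per_worker_memory, max_memory, worker_processes):
        # Same clamping rule as encode_all_partitions, reproduced in isolation:
        # never start more workers than the memory budget allows.
        max_num_workers = max_memory // per_worker_memory
        if max_num_workers <= 0:
            raise ValueError("Insufficient memory to encode a partition")
        return min(max_num_workers, worker_processes)

    # A 2 GiB variant chunk and a 7 GiB budget cap 8 requested workers at 3.
    per_worker = humanfriendly.parse_size("2GiB")
    budget = humanfriendly.parse_size("7GiB")
    assert clamp_workers(per_worker, budget, worker_processes=8) == 3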
@@ -1816,54 +2098,9 @@ class VcfZarrWriter:
             units="B",
             show=show_progress,
         )
-
-
-
-        # below doesn't really work.
-        max_queued = 4 * max(1, worker_processes)
-        encoded_slices = collections.Counter()
-
-        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            future = pwm.submit(self.encode_samples)
-            future_to_work = {future: EncodingWork(None, 0, 0, [])}
-
-            def service_completed_futures():
-                nonlocal used_memory
-
-                completed = pwm.wait_for_completed()
-                for future in completed:
-                    wp_done = future_to_work.pop(future)
-                    used_memory -= wp_done.memory
-                    logger.debug(
-                        f"Complete {wp_done}: used mem={display_size(used_memory)}"
-                    )
-                    for column in wp_done.columns:
-                        encoded_slices[column] += 1
-                        if encoded_slices[column] == len(slices):
-                            # Do this syncronously for simplicity. Should be
-                            # fine as the workers will probably be busy with
-                            # large encode tasks most of the time.
-                            self.finalise_array(column)
-
-            for wp in work:
-                while (
-                    used_memory + wp.memory > max_memory
-                    or len(future_to_work) > max_queued
-                ):
-                    logger.debug(
-                        f"Wait: mem_required={used_memory + wp.memory} "
-                        f"max_mem={max_memory} queued={len(future_to_work)} "
-                        f"max_queued={max_queued}"
-                    )
-                    service_completed_futures()
-                future = pwm.submit(wp.func, wp.start, wp.stop)
-                used_memory += wp.memory
-                logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
-                future_to_work[future] = wp
-
-            logger.debug("All work submitted")
-            while len(future_to_work) > 0:
-                service_completed_futures()
+        with core.ParallelWorkManager(num_workers, progress_config) as pwm:
+            for partition_index in range(num_partitions):
+                pwm.submit(self.encode_partition, partition_index)


 def mkschema(if_path, out):
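This hunk drops the slice-based scheduler that tracked per-task memory (EncodingWork, service_completed_futures) in favour of one task per partition: every partition costs roughly the same per_worker_memory, so capping the pool size replaces the in-flight memory accounting. A rough stand-in for the new shape of the work loop using only the standard library (core.ParallelWorkManager is the package's own wrapper; this is not its API):

    from concurrent.futures import ProcessPoolExecutor, as_completed

    def encode_one_partition(zarr_path, partition_index):
        # Placeholder for VcfZarrWriter.encode_partition: each task reopens
        # the on-disk state it needs, so tasks share nothing in memory.
        return partition_index

    def encode_partitions(zarr_path, num_partitions, num_workers):
        with ProcessPoolExecutor(max_workers=num_workers) as pool:
            futures = [
                pool.submit(encode_one_partition, zarr_path, index)
                for index in range(num_partitions)
            ]
            for future in as_completed(futures):
                future.result()  # re-raise any exception from a worker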
@@ -1878,13 +2115,48 @@ def encode(
     schema_path=None,
     variants_chunk_size=None,
     samples_chunk_size=None,
-
+    max_variant_chunks=None,
     dimension_separator=None,
     max_memory=None,
     worker_processes=1,
     show_progress=False,
 ):
-
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        if_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = VcfZarrWriter(zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=1,
+    show_progress=False,
+):
+    icf = IntermediateColumnarFormat(icf_path)
     if schema_path is None:
         schema = VcfZarrSchema.generate(
             icf,
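With these changes the one-shot encode entry point is a thin wrapper: it calls encode_init, then VcfZarrWriter.encode_all_partitions, then finalise. A usage sketch for the single-machine case; the paths are hypothetical, and passing a size string for max_memory assumes parse_max_memory accepts humanfriendly-style strings:

    from bio2zarr import vcf

    # Convert a previously exploded intermediate columnar format (ICF)
    # directory into a VCF Zarr store, capping workers and memory use.
    vcf.encode(
        "chr22.icf",   # hypothetical ICF path from the explode step
        "chr22.vcz",   # hypothetical output Zarr path
        worker_processes=4,
        max_memory="8GiB",
        show_progress=True,
    )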
@@ -1900,18 +2172,25 @@ def encode(
         with open(schema_path) as f:
             schema = VcfZarrSchema.fromjson(f.read())
     zarr_path = pathlib.Path(zarr_path)
-
-
-
-
-
-
-
-        worker_processes=worker_processes,
-        max_memory=max_memory,
-        show_progress=show_progress,
+    vzw = VcfZarrWriter(zarr_path)
+    vzw.init(
+        icf,
+        target_num_partitions=target_num_partitions,
+        schema=schema,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
     )
-    vzw.
+    return vzw.num_partitions, vzw.get_max_encoding_memory()
+
+
+def encode_partition(zarr_path, partition):
+    writer = VcfZarrWriter(zarr_path)
+    writer.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer = VcfZarrWriter(zarr_path)
+    writer.finalise(show_progress=show_progress)


 def convert(
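The module-level encode_init, encode_partition and encode_finalise functions split the same pipeline into independent steps, so individual partitions can be encoded by separate processes or cluster jobs against the shared on-disk state. A sketch of driving the three stages sequentially; the paths are hypothetical, and encode_init returns the actual partition count plus the per-worker memory estimate, as shown above:

    from bio2zarr import vcf

    num_partitions, per_worker_memory = vcf.encode_init(
        "chr22.icf", "chr22.vcz", target_num_partitions=16
    )
    # per_worker_memory can be used to size the jobs; each of these calls is
    # independent and could be submitted as its own job.
    for index in range(num_partitions):
        vcf.encode_partition("chr22.vcz", index)
    # Once every partition exists on disk, merge chunks and consolidate metadata.
    vcf.encode_finalise("chr22.vcz", show_progress=True)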
@@ -2121,7 +2400,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
     assert pos[start_index] == first_pos
     vcf = cyvcf2.VCF(vcf_path)
     if show_progress:
-        iterator = tqdm.tqdm(vcf, desc="
+        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
     else:
         iterator = vcf
     for j, row in enumerate(iterator, start_index):