bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of bio2zarr might be problematic.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +245 -68
- bio2zarr/core.py +36 -19
- bio2zarr/plink.py +25 -19
- bio2zarr/vcf.py +704 -389
- bio2zarr/vcf_utils.py +0 -1
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/METADATA +1 -1
- bio2zarr-0.0.3.dist-info/RECORD +16 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/WHEEL +1 -1
- bio2zarr-0.0.1.dist-info/RECORD +0 -16
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/top_level.txt +0 -0
bio2zarr/vcf.py
CHANGED
@@ -40,6 +40,17 @@ FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
 )


+def display_number(x):
+    ret = "n/a"
+    if math.isfinite(x):
+        ret = f"{x: 0.2g}"
+    return ret
+
+
+def display_size(n):
+    return humanfriendly.format_size(n, binary=True)
+
+
 @dataclasses.dataclass
 class VcfFieldSummary:
     num_chunks: int = 0
@@ -53,11 +64,18 @@ class VcfFieldSummary:
     def update(self, other):
         self.num_chunks += other.num_chunks
         self.compressed_size += other.compressed_size
-        self.uncompressed_size
+        self.uncompressed_size += other.uncompressed_size
         self.max_number = max(self.max_number, other.max_number)
         self.min_value = min(self.min_value, other.min_value)
         self.max_value = max(self.max_value, other.max_value)

+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @staticmethod
+    def fromdict(d):
+        return VcfFieldSummary(**d)
+

 @dataclasses.dataclass
 class VcfField:
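Note: the new asdict/fromdict pair is what lets per-partition field summaries round-trip through the JSON summary files introduced later in this diff. A minimal sketch of the same pattern, using a toy dataclass standing in for VcfFieldSummary:

```python
import dataclasses

@dataclasses.dataclass
class Summary:
    num_chunks: int = 0
    compressed_size: int = 0

s = Summary(num_chunks=3, compressed_size=1024)
d = dataclasses.asdict(s)  # {'num_chunks': 3, 'compressed_size': 1024} - JSON-safe
assert Summary(**d) == s   # fromdict is just **-unpacking the dict again
```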
@@ -131,9 +149,14 @@ class VcfPartition:
     num_records: int = -1


+ICF_METADATA_FORMAT_VERSION = "0.2"
+ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
+    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+).get_config()
+
+
 @dataclasses.dataclass
-class VcfMetadata:
-    format_version: str
+class IcfMetadata:
     samples: list
     contig_names: list
     contig_record_counts: dict
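Note: storing the compressor as a `get_config()` dict rather than a codec object makes it JSON-serialisable alongside the rest of the metadata; `numcodecs.get_codec` rebuilds the codec on the reading side. A sketch of that round-trip:

```python
import numcodecs

config = numcodecs.Blosc(
    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
).get_config()
# config is a plain dict, e.g. {'id': 'blosc', 'cname': 'lz4', 'clevel': 7, ...}
codec = numcodecs.get_codec(config)
data = b"some bytes to compress" * 100
assert codec.decode(codec.encode(data)) == data
```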
@@ -141,6 +164,10 @@ class VcfMetadata:
     fields: list
     partitions: list = None
     contig_lengths: list = None
+    format_version: str = None
+    compressor: dict = None
+    column_chunk_size: int = None
+    provenance: dict = None

     @property
     def info_fields(self):
@@ -164,12 +191,19 @@ class VcfMetadata:

     @staticmethod
     def fromdict(d):
+        if d["format_version"] != ICF_METADATA_FORMAT_VERSION:
+            raise ValueError(
+                "Intermediate columnar metadata format version mismatch: "
+                f"{d['format_version']} != {ICF_METADATA_FORMAT_VERSION}"
+            )
         fields = [VcfField.fromdict(fd) for fd in d["fields"]]
         partitions = [VcfPartition(**pd) for pd in d["partitions"]]
+        for p in partitions:
+            p.region = vcf_utils.Region(**p.region)
         d = d.copy()
         d["fields"] = fields
         d["partitions"] = partitions
-        return VcfMetadata(**d)
+        return IcfMetadata(**d)

     def asdict(self):
         return dataclasses.asdict(self)
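Note: `dataclasses.asdict` recurses into nested dataclasses, so partition regions come back from JSON as plain dicts and must be rebuilt by hand, which is why fromdict now reconstructs each `Region`. A toy sketch of the problem (Region/Partition here stand in for vcf_utils.Region and VcfPartition):

```python
import dataclasses

@dataclasses.dataclass
class Region:
    contig: str
    start: int

@dataclasses.dataclass
class Partition:
    vcf_path: str
    region: Region

p = Partition("x.vcf.gz", Region("chr1", 1))
d = dataclasses.asdict(p)      # d["region"] is now {'contig': 'chr1', 'start': 1}
q = Partition(**d)             # q.region is still a plain dict here...
q.region = Region(**q.region)  # ...so it has to be rehydrated explicitly
assert q == p
```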
@@ -220,16 +254,13 @@ def scan_vcf(path, target_num_partitions):
             field.vcf_number = "."
             fields.append(field)

-    metadata = VcfMetadata(
+    metadata = IcfMetadata(
         samples=vcf.samples,
         contig_names=vcf.seqnames,
         contig_record_counts=indexed_vcf.contig_record_counts(),
         filters=filters,
-        # TODO use the mapping dictionary
         fields=fields,
         partitions=[],
-        # FIXME do something systematic with this
-        format_version="0.1",
     )
     try:
         metadata.contig_lengths = vcf.seqlens
@@ -243,6 +274,8 @@ def scan_vcf(path, target_num_partitions):
     for region in regions:
         metadata.partitions.append(
             VcfPartition(
+                # TODO should this be fully resolving the path? Otherwise it's all
+                # relative to the original WD
                 vcf_path=str(path),
                 region=region,
             )
@@ -251,8 +284,19 @@ def scan_vcf(path, target_num_partitions):
     return metadata, vcf.raw_header


-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
-
+def scan_vcfs(
+    paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
+):
+    logger.info(
+        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
+    )
+    # An easy mistake to make is to pass the same file twice. Check this early on.
+    for path, count in collections.Counter(paths).items():
+        if not path.exists():  # NEEDS TEST
+            raise FileNotFoundError(path)
+        if count > 1:
+            raise ValueError(f"Duplicate path provided: {path}")
+
     progress_config = core.ProgressConfig(
         total=len(paths),
         units="files",
@@ -261,7 +305,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     )
     with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
         for path in paths:
-            pwm.submit(scan_vcf, path, target_num_partitions)
+            pwm.submit(scan_vcf, path, max(1, target_num_partitions // len(paths)))
         results = list(pwm.results_as_completed())

     # Sort to make the ordering deterministic
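Note: the partition budget is now divided across the input files rather than applied per file, with a floor of one partition each. The arithmetic, as a standalone sketch:

```python
def partitions_per_file(target_num_partitions, num_files):
    # Mirrors max(1, target_num_partitions // len(paths)) above.
    return max(1, target_num_partitions // num_files)

assert partitions_per_file(40, 10) == 4
assert partitions_per_file(3, 10) == 1  # more files than partitions: one each
```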
@@ -276,12 +320,12 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
         contig_record_counts += metadata.contig_record_counts
         metadata.contig_record_counts.clear()

-    vcf_metadata, header = results[0]
+    icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
-        if metadata != vcf_metadata:
+        if metadata != icf_metadata:
             raise ValueError("Incompatible VCF chunks")

-    vcf_metadata.contig_record_counts = dict(contig_record_counts)
+    icf_metadata.contig_record_counts = dict(contig_record_counts)

     # Sort by contig (in the order they appear in the header) first,
     # then by start coordinate
@@ -289,8 +333,15 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
-    vcf_metadata.partitions = all_partitions
-    return vcf_metadata, header
+    icf_metadata.partitions = all_partitions
+    icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
+    icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
+    icf_metadata.column_chunk_size = column_chunk_size
+    # Bare minimum here for provenance - would be nice to include versions of key
+    # dependencies as well.
+    icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
+    logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
+    return icf_metadata, header


 def sanitise_value_bool(buff, j, value):
@@ -385,7 +436,7 @@ def sanitise_value_float_2d(buff, j, value):

 def sanitise_int_array(value, ndmin, dtype):
     if isinstance(value, tuple):
-        value = [VCF_INT_MISSING if x is None else x for x in value]
+        value = [VCF_INT_MISSING if x is None else x for x in value]  # NEEDS TEST
     value = np.array(value, ndmin=ndmin, copy=False)
     value[value == VCF_INT_MISSING] = -1
     value[value == VCF_INT_FILL] = -2
@@ -428,7 +479,7 @@ missing_value_map = {
 class VcfValueTransformer:
     """
     Transform VCF values into the stored intermediate format used
-    in the PickleChunkedVcf, and update field summaries.
+    in the IntermediateColumnarFormat, and update field summaries.
     """

     def __init__(self, field, num_samples):
@@ -516,29 +567,29 @@ class StringValueTransformer(VcfValueTransformer):
 class SplitStringValueTransformer(StringValueTransformer):
     def transform(self, vcf_value):
         if vcf_value is None:
-            return self.missing_value
+            return self.missing_value  # NEEDS TEST
         assert self.dimension == 1
         return np.array(vcf_value, ndmin=1, dtype="str")


-class PickleChunkedVcfField:
-    def __init__(self, pcvcf, vcf_field):
+def get_vcf_field_path(base_path, vcf_field):
+    if vcf_field.category == "fixed":
+        return base_path / vcf_field.name
+    return base_path / vcf_field.category / vcf_field.name
+
+
+class IntermediateColumnarFormatField:
+    def __init__(self, icf, vcf_field):
         self.vcf_field = vcf_field
-        self.path =
-        self.compressor =
-        self.num_partitions =
-        self.num_records =
-        self.partition_record_index =
+        self.path = get_vcf_field_path(icf.path, vcf_field)
+        self.compressor = icf.compressor
+        self.num_partitions = icf.num_partitions
+        self.num_records = icf.num_records
+        self.partition_record_index = icf.partition_record_index
         # A map of partition id to the cumulative number of records
         # in chunks within that partition
         self._chunk_record_index = {}

-    @staticmethod
-    def get_path(base_path, vcf_field):
-        if vcf_field.category == "fixed":
-            return base_path / vcf_field.name
-        return base_path / vcf_field.category / vcf_field.name
-
     @property
     def name(self):
         return self.vcf_field.full_name
@@ -549,17 +600,17 @@ class PickleChunkedVcfField:
     def __repr__(self):
         partition_chunks = [self.num_chunks(j) for j in range(self.num_partitions)]
         return (
-            f"PickleChunkedVcfField(name={self.name}, "
+            f"IntermediateColumnarFormatField(name={self.name}, "
             f"partition_chunks={partition_chunks}, "
             f"path={self.path})"
         )

     def num_chunks(self, partition_id):
-        return len(self.chunk_cumulative_records(partition_id))
+        return len(self.chunk_record_index(partition_id)) - 1

     def chunk_record_index(self, partition_id):
         if partition_id not in self._chunk_record_index:
-            index_path = self.partition_path(partition_id) / "chunk_index.pkl"
+            index_path = self.partition_path(partition_id) / "chunk_index"
             with open(index_path, "rb") as f:
                 a = pickle.load(f)
                 assert len(a) > 1
@@ -567,22 +618,27 @@ class PickleChunkedVcfField:
             self._chunk_record_index[partition_id] = a
         return self._chunk_record_index[partition_id]

-    def chunk_cumulative_records(self, partition_id):
-        return self.chunk_record_index(partition_id)[1:]
-
-    def chunk_num_records(self, partition_id):
-        return np.diff(self.chunk_cumulative_records(partition_id))
-
-    def chunk_files(self, partition_id, start=0):
-        partition_path = self.partition_path(partition_id)
-        for n in self.chunk_cumulative_records(partition_id)[start:]:
-            yield partition_path / f"{n}.pkl"
-
     def read_chunk(self, path):
         with open(path, "rb") as f:
             pkl = self.compressor.decode(f.read())
         return pickle.loads(pkl)

+    def chunk_num_records(self, partition_id):
+        return np.diff(self.chunk_record_index(partition_id))
+
+    def chunks(self, partition_id, start_chunk=0):
+        partition_path = self.partition_path(partition_id)
+        chunk_cumulative_records = self.chunk_record_index(partition_id)
+        chunk_num_records = np.diff(chunk_cumulative_records)
+        for count, cumulative in zip(
+            chunk_num_records[start_chunk:], chunk_cumulative_records[start_chunk + 1 :]
+        ):
+            path = partition_path / f"{cumulative}"
+            chunk = self.read_chunk(path)
+            if len(chunk) != count:
+                raise ValueError(f"Corruption detected in chunk: {path}")
+            yield chunk
+
     def iter_values(self, start=None, stop=None):
         start = 0 if start is None else start
         stop = self.num_records if stop is None else stop
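Note: the per-partition chunk index is a cumulative record count starting at zero; per-chunk counts fall out of `np.diff`, and each chunk file is named by its cumulative count, which is what lets `chunks()` above validate lengths as it reads. A sketch of the arithmetic:

```python
import numpy as np

chunk_record_index = np.array([0, 1000, 2000, 2500])  # as loaded from "chunk_index"
counts = np.diff(chunk_record_index)                  # [1000, 1000, 500]
for count, cumulative in zip(counts, chunk_record_index[1:]):
    # Chunk file "1000" should hold 1000 records, "2000" the next 1000, etc.
    print(f"chunk file {cumulative}: expect {count} records")
```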
@@ -603,9 +659,7 @@ class PickleChunkedVcfField:
             f"Read {self.vcf_field.full_name} slice [{start}:{stop}]:"
             f"p_start={start_partition}, c_start={start_chunk}, r_start={record_id}"
         )
-
-        for chunk_path in self.chunk_files(start_partition, start_chunk):
-            chunk = self.read_chunk(chunk_path)
+        for chunk in self.chunks(start_partition, start_chunk):
            for record in chunk:
                if record_id == stop:
                    return
@@ -614,8 +668,7 @@ class PickleChunkedVcfField:
                     record_id += 1
         assert record_id > start
         for partition_id in range(start_partition + 1, self.num_partitions):
-            for chunk_path in self.chunk_files(partition_id):
-                chunk = self.read_chunk(chunk_path)
+            for chunk in self.chunks(partition_id):
                 for record in chunk:
                     if record_id == stop:
                         return
@@ -629,15 +682,11 @@ class PickleChunkedVcfField:
         ret = [None] * self.num_records
         j = 0
         for partition_id in range(self.num_partitions):
-            for chunk_path in self.chunk_files(partition_id):
-                chunk = self.read_chunk(chunk_path)
+            for chunk in self.chunks(partition_id):
                 for record in chunk:
                     ret[j] = record
                     j += 1
-
-        raise ValueError(
-            f"Corruption detected: incorrect number of records in {str(self.path)}."
-        )
+        assert j == self.num_records
         return ret

     def sanitiser_factory(self, shape):
@@ -674,7 +723,7 @@ class PickleChunkedVcfField:


 @dataclasses.dataclass
-class PcvcfFieldWriter:
+class IcfFieldWriter:
     vcf_field: VcfField
     path: pathlib.Path
     transformer: VcfValueTransformer
@@ -704,7 +753,7 @@ class PcvcfFieldWriter:
     def write_chunk(self):
         # Update index
         self.chunk_index.append(self.num_records)
-        path = self.path / f"{self.num_records}.pkl"
+        path = self.path / f"{self.num_records}"
         logger.debug(f"Start write: {path}")
         pkl = pickle.dumps(self.buff)
         compressed = self.compressor.encode(pkl)
@@ -723,37 +772,35 @@ class PcvcfFieldWriter:
         )
         if len(self.buff) > 0:
             self.write_chunk()
-        with open(self.path / "chunk_index.pkl", "wb") as f:
+        with open(self.path / "chunk_index", "wb") as f:
             a = np.array(self.chunk_index, dtype=int)
             pickle.dump(a, f)


-class PcvcfPartitionWriter(contextlib.AbstractContextManager):
+class IcfPartitionWriter(contextlib.AbstractContextManager):
     """
-    Writes the data for a PickleChunkedVcf partition.
+    Writes the data for a IntermediateColumnarFormat partition.
     """

     def __init__(
         self,
-        vcf_metadata,
+        icf_metadata,
         out_path,
         partition_index,
-        compressor,
-        *,
-        chunk_size=1,
     ):
         self.partition_index = partition_index
         # chunk_size is in megabytes
-        max_buffered_bytes = chunk_size * 2**20
+        max_buffered_bytes = icf_metadata.column_chunk_size * 2**20
         assert max_buffered_bytes > 0
+        compressor = numcodecs.get_codec(icf_metadata.compressor)

         self.field_writers = {}
-        num_samples = len(vcf_metadata.samples)
-        for vcf_field in vcf_metadata.fields:
-            field_path = PickleChunkedVcfField.get_path(out_path, vcf_field)
+        num_samples = len(icf_metadata.samples)
+        for vcf_field in icf_metadata.fields:
+            field_path = get_vcf_field_path(out_path, vcf_field)
             field_partition_path = field_path / f"p{partition_index}"
             transformer = VcfValueTransformer.factory(vcf_field, num_samples)
-            self.field_writers[vcf_field.full_name] = PcvcfFieldWriter(
+            self.field_writers[vcf_field.full_name] = IcfFieldWriter(
                 vcf_field,
                 field_partition_path,
                 transformer,
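Note: the writer no longer takes a compressor or chunk size directly; both now travel in the ICF metadata, and `column_chunk_size` is interpreted in MiB when sizing the flush threshold:

```python
column_chunk_size = 16                          # MiB, from icf_metadata
max_buffered_bytes = column_chunk_size * 2**20  # = 16_777_216 bytes
assert max_buffered_bytes == 16 * 1024 * 1024
```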
@@ -777,16 +824,23 @@ class PcvcfPartitionWriter(contextlib.AbstractContextManager):
         return False


-class PickleChunkedVcf(collections.abc.Mapping):
+# TODO rename to IntermediateColumnarFormat and move to icf.py
+
+
+class IntermediateColumnarFormat(collections.abc.Mapping):
     # TODO Check if other compressors would give reasonable compression
     # with significantly faster times
-    DEFAULT_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)

-    def __init__(self, path, metadata, vcf_header):
-        self.path = path
-        self.metadata = metadata
-        self.vcf_header = vcf_header
-        self.compressor = self.DEFAULT_COMPRESSOR
+    def __init__(self, path):
+        self.path = pathlib.Path(path)
+        # TODO raise a more informative error here telling people this
+        # directory is either a WIP or the wrong format.
+        with open(self.path / "metadata.json") as f:
+            self.metadata = IcfMetadata.fromdict(json.load(f))
+        with open(self.path / "header.txt") as f:
+            self.vcf_header = f.read()
+
+        self.compressor = numcodecs.get_codec(self.metadata.compressor)
         self.columns = {}
         partition_num_records = [
             partition.num_records for partition in self.metadata.partitions
@@ -794,11 +848,15 @@ class PickleChunkedVcf(collections.abc.Mapping):
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0] + partition_num_records)
         for field in self.metadata.fields:
-            self.columns[field.full_name] = PickleChunkedVcfField(self, field)
+            self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
+        logger.info(
+            f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
+            f"records={self.num_records}, columns={self.num_columns})"
+        )

     def __repr__(self):
         return (
-            f"PickleChunkedVcf(fields={len(self)}, partitions={self.num_partitions}, "
+            f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )

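Note: a usage sketch for the reworked class, assuming a hypothetical `sample.icf` directory already produced by explode() and the bio2zarr.vcf module from this diff:

```python
icf = IntermediateColumnarFormat("sample.icf")
print(icf.num_records, icf.num_partitions)
pos = icf.columns["POS"]              # an IntermediateColumnarFormatField
for value in pos.iter_values(0, 10):  # values for the first ten records
    print(value)
```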
@@ -812,15 +870,6 @@ class PickleChunkedVcf(collections.abc.Mapping):
         return len(self.columns)

     def summary_table(self):
-        def display_number(x):
-            ret = "n/a"
-            if math.isfinite(x):
-                ret = f"{x: 0.2g}"
-            return ret
-
-        def display_size(n):
-            return humanfriendly.format_size(n)
-
         data = []
         for name, col in self.columns.items():
             summary = col.vcf_field.summary
@@ -838,14 +887,6 @@ class PickleChunkedVcf(collections.abc.Mapping):
             data.append(d)
         return data

-    @functools.cached_property
-    def total_uncompressed_bytes(self):
-        total = 0
-        for col in self.columns.values():
-            summary = col.vcf_field.summary
-            total += summary.uncompressed_size
-        return total
-
     @functools.cached_property
     def num_records(self):
         return sum(self.metadata.contig_record_counts.values())
@@ -862,57 +903,121 @@ class PickleChunkedVcf(collections.abc.Mapping):
     def num_columns(self):
         return len(self.columns)

-    def mkdirs(self):
-        self.path.mkdir()
-        for col in self.columns.values():
-            col.path.mkdir(parents=True)
-            for j in range(self.num_partitions):
-                part_path = col.path / f"p{j}"
-                part_path.mkdir()

-    @staticmethod
-    def load(path):
-        path = pathlib.Path(path)
-
-
-        with open(path / "header.txt") as f:
-            header = f.read()
-        pcvcf = PickleChunkedVcf(path, metadata, header)
-        logger.info(
-            f"Loaded PickleChunkedVcf(partitions={pcvcf.num_partitions}, "
-            f"records={pcvcf.num_records}, columns={pcvcf.num_columns})"
-        )
-        return pcvcf
+class IntermediateColumnarFormatWriter:
+    def __init__(self, path):
+        self.path = pathlib.Path(path)
+        self.wip_path = self.path / "wip"
+        self.metadata = None

-    @staticmethod
-    def convert(
-        vcfs,
-        out_path,
-
+    @property
+    def num_partitions(self):
+        return len(self.metadata.partitions)
+
+    def init(
+        self,
+        vcfs,
         *,
         column_chunk_size=16,
+        worker_processes=1,
+        target_num_partitions=None,
+        show_progress=False,
     ):
-
+        if self.path.exists():
+            shutil.rmtree(self.path)
+        vcfs = [pathlib.Path(vcf) for vcf in vcfs]
+        target_num_partitions = max(target_num_partitions, len(vcfs))
+
+        # TODO move scan_vcfs into this class
+        icf_metadata, header = scan_vcfs(
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            target_num_partitions=target_num_partitions,
+            column_chunk_size=column_chunk_size,
+        )
+        self.metadata = icf_metadata
+
+        self.mkdirs()
+
+        # Note: this is needed for the current version of the vcfzarr spec, but it's
+        # probably goint to be dropped.
+        # https://github.com/pystatgen/vcf-zarr-spec/issues/15
+        # May be useful to keep lying around still though?
+        logger.info(f"Writing VCF header")
+        with open(self.path / "header.txt", "w") as f:
+            f.write(header)
+
+        logger.info(f"Writing WIP metadata")
+        with open(self.wip_path / "metadata.json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+        return self.num_partitions
+
+    def mkdirs(self):
+        # TODO add worker_processes here and do this with the ParallelWorkManager
+        logger.info(
+            f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
+        )
+        self.path.mkdir()
+        self.wip_path.mkdir()
+        for field in self.metadata.fields:
+            col_path = get_vcf_field_path(self.path, field)
+            logger.debug(f"Make directories for {field.full_name} at {col_path}")
+            col_path.mkdir(parents=True)
+            for j in range(self.num_partitions):
+                part_path = col_path / f"p{j}"
+                part_path.mkdir()
+
+    def load_partition_summaries(self):
+        summaries = []
+        not_found = []
+        for j in range(self.num_partitions):
+            try:
+                with open(self.wip_path / f"p{j}_summary.json") as f:
+                    summary = json.load(f)
+                    for k, v in summary["field_summaries"].items():
+                        summary["field_summaries"][k] = VcfFieldSummary.fromdict(v)
+                    summaries.append(summary)
+            except FileNotFoundError:
+                not_found.append(j)
+        if len(not_found) > 0:
+            raise FileNotFoundError(
+                f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+            )
+        return summaries
+
+    def load_metadata(self):
+        if self.metadata is None:
+            with open(self.wip_path / f"metadata.json") as f:
+                self.metadata = IcfMetadata.fromdict(json.load(f))
+
+    def process_partition(self, partition_index):
+        self.load_metadata()
+        summary_path = self.wip_path / f"p{partition_index}_summary.json"
+        # If someone is rewriting a summary path (for whatever reason), make sure it
+        # doesn't look like it's already been completed.
+        # NOTE to do this properly we probably need to take a lock on this file - but
+        # this simple approach will catch the vast majority of problems.
+        if summary_path.exists():
+            summary_path.unlink()
+
+        partition = self.metadata.partitions[partition_index]
         logger.info(
             f"Start p{partition_index} {partition.vcf_path}__{partition.region}"
         )
-        info_fields = vcf_metadata.info_fields
+        info_fields = self.metadata.info_fields
         format_fields = []
         has_gt = False
-        for field in vcf_metadata.format_fields:
+        for field in self.metadata.format_fields:
             if field.name == "GT":
                 has_gt = True
             else:
                 format_fields.append(field)

-
-
-        with PcvcfPartitionWriter(
-            vcf_metadata,
-            out_path,
+        with IcfPartitionWriter(
+            self.metadata,
+            self.path,
             partition_index,
-            compressor,
-            chunk_size=column_chunk_size,
         ) as tcw:
             with vcf_utils.IndexedVcf(partition.vcf_path) as ivcf:
                 num_records = 0
@@ -930,108 +1035,172 @@ class PickleChunkedVcf(collections.abc.Mapping):
                         if has_gt:
                             tcw.append("FORMAT/GT", variant.genotype.array())
                         for field in format_fields:
-                            val = None
-                            try:
-                                val = variant.format(field.name)
-                            except KeyError:
-                                pass
+                            val = variant.format(field.name)
                             tcw.append(field.full_name, val)
                         # Note: an issue with updating the progress per variant here like this
                         # is that we get a significant pause at the end of the counter while
                         # all the "small" fields get flushed. Possibly not much to be done about it.
                         core.update_progress(1)
+                logger.info(
+                    f"Finished reading VCF for partition {partition_index}, flushing buffers"
+                )

+        partition_metadata = {
+            "num_records": num_records,
+            "field_summaries": {k: v.asdict() for k, v in tcw.field_summaries.items()},
+        }
+        with open(summary_path, "w") as f:
+            json.dump(partition_metadata, f, indent=4)
         logger.info(
             f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
             f"{num_records} records"
         )
-        return partition_index, tcw.field_summaries, num_records

-
-
-
+    def process_partition_slice(
+        self,
+        start,
+        stop,
+        *,
+        worker_processes=1,
+        show_progress=False,
     ):
-
-
-
-
-
-
-
-
-        )
-        pcvcf = PickleChunkedVcf(out_path, vcf_metadata, header)
-        pcvcf.mkdirs()
-
+        self.load_metadata()
+        if start == 0 and stop == self.num_partitions:
+            num_records = self.metadata.num_records
+        else:
+            # We only know the number of records if all partitions are done at once,
+            # and we signal this to tqdm by passing None as the total.
+            num_records = None
+        num_columns = len(self.metadata.fields)
+        num_samples = len(self.metadata.samples)
         logger.info(
-            f"Exploding {
-            f"{
+            f"Exploding columns={num_columns} samples={num_samples}; "
+            f"partitions={stop - start} "
+            f"variants={'unknown' if num_records is None else num_records}"
         )
         progress_config = core.ProgressConfig(
-            total=
+            total=num_records,
             units="vars",
             title="Explode",
             show=show_progress,
         )
         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            for j
-            pwm.submit(
-
-
-
-
-
-
-
-
-            for index, summary, num_records in pwm.results_as_completed():
-                partition_summaries.append(summary)
-                vcf_metadata.partitions[index].num_records = num_records
-
-        total_records = sum(
-            partition.num_records for partition in vcf_metadata.partitions
+            for j in range(start, stop):
+                pwm.submit(self.process_partition, j)
+
+    def explode(self, *, worker_processes=1, show_progress=False):
+        self.load_metadata()
+        return self.process_partition_slice(
+            0,
+            self.num_partitions,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
         )
-        assert total_records == pcvcf.num_records

-
-
-
-
+    def explode_partition(self, partition, *, show_progress=False, worker_processes=1):
+        self.load_metadata()
+        if partition < 0 or partition >= self.num_partitions:
+            raise ValueError(
+                "Partition index must be in the range 0 <= index < num_partitions"
+            )
+        return self.process_partition_slice(
+            partition,
+            partition + 1,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+
+    def finalise(self):
+        self.load_metadata()
+        partition_summaries = self.load_partition_summaries()
+        total_records = 0
+        for index, summary in enumerate(partition_summaries):
+            partition_records = summary["num_records"]
+            self.metadata.partitions[index].num_records = partition_records
+            total_records += partition_records
+        assert total_records == self.metadata.num_records
+
+        for field in self.metadata.fields:
             for summary in partition_summaries:
-                field.summary.update(summary[field.full_name])
+                field.summary.update(summary["field_summaries"][field.full_name])

-
-
-
-
+        logger.info(f"Finalising metadata")
+        with open(self.path / "metadata.json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+
+        logger.debug(f"Removing WIP directory")
+        shutil.rmtree(self.wip_path)


 def explode(
     vcfs,
-    out_path,
+    icf_path,
     *,
     column_chunk_size=16,
     worker_processes=1,
     show_progress=False,
 ):
-
-    if out_path.exists():
-        shutil.rmtree(out_path)
-
-    PickleChunkedVcf.convert(
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    num_partitions = writer.init(
         vcfs,
-        out_path,
+        # Heuristic to get reasonable worker utilisation with lumpy partition sizing
+        target_num_partitions=max(1, worker_processes * 4),
+        worker_processes=worker_processes,
+        show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+    )
+    writer.explode(worker_processes=worker_processes, show_progress=show_progress)
+    writer.finalise()
+    return IntermediateColumnarFormat(icf_path)
+
+
+def explode_init(
+    icf_path,
+    vcfs,
+    *,
+    column_chunk_size=16,
+    target_num_partitions=1,
+    worker_processes=1,
+    show_progress=False,
+):
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    return writer.init(
+        vcfs,
+        target_num_partitions=target_num_partitions,
         worker_processes=worker_processes,
         show_progress=show_progress,
+        column_chunk_size=column_chunk_size,
     )
-    return PickleChunkedVcf.load(out_path)


-def inspect(path):
+# NOTE only including worker_processes here so we can use the 0 option to get the
+# work done syncronously and so we can get test coverage on it. Should find a
+# better way to do this.
+def explode_partition(icf_path, partition, *, show_progress=False, worker_processes=1):
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    writer.explode_partition(
+        partition, show_progress=show_progress, worker_processes=worker_processes
+    )
+
+
+def explode_finalise(icf_path):
+    writer = IntermediateColumnarFormatWriter(icf_path)
+    writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
     # TODO add support for the Zarr format also
-
-
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    elif (path / ".zmetadata").exists():
+        obj = VcfZarr(path)
+    else:
+        raise ValueError("Format not recognised")  # NEEDS TEST
+    return obj.summary_table()
+
+
+DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)


 @dataclasses.dataclass
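Note: the init/partition/finalise split means explode can now be distributed: one coordinator call, one independent job per partition (for example on a cluster), then a merge step. A sketch of the intended workflow using the three functions above (paths hypothetical):

```python
num_partitions = explode_init(
    "out.icf", ["a.vcf.gz", "b.vcf.gz"], target_num_partitions=100
)
for j in range(num_partitions):      # each iteration can run as a separate job
    explode_partition("out.icf", j)  # writes out.icf/wip/p{j}_summary.json
explode_finalise("out.icf")          # merges summaries, removes the wip directory
```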
@@ -1043,20 +1212,46 @@ class ZarrColumnSpec:
     dimensions: list
     description: str
     vcf_field: str
-    compressor: dict
+    compressor: dict = None
+    filters: list = None
     # TODO add filters

     def __post_init__(self):
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
+        self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
+        self.filters = []
+        self._choose_compressor_settings()
+
+    def _choose_compressor_settings(self):
+        """
+        Choose compressor and filter settings based on the size and
+        type of the array, plus some hueristics from observed properties
+        of VCFs.
+
+        See https://github.com/pystatgen/bio2zarr/discussions/74
+        """
+        dt = np.dtype(self.dtype)
+        # Default is to not shuffle, because autoshuffle isn't recognised
+        # by many Zarr implementations, and shuffling can lead to worse
+        # performance in some cases anyway. Turning on shuffle should be a
+        # deliberate choice.
+        shuffle = numcodecs.Blosc.NOSHUFFLE
+        if dt.itemsize == 1:
+            # Any 1 byte field gets BITSHUFFLE by default
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+        self.compressor["shuffle"] = shuffle
+
+
+ZARR_SCHEMA_FORMAT_VERSION = "0.2"


 @dataclasses.dataclass
-class ZarrConversionSpec:
+class VcfZarrSchema:
     format_version: str
-
-
+    samples_chunk_size: int
+    variants_chunk_size: int
     dimensions: list
     sample_id: list
     contig_id: list
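Note: the shuffle heuristic in `_choose_compressor_settings` keys off the element size alone. A standalone sketch of the same rule:

```python
import numcodecs
import numpy as np

def choose_shuffle(dtype):
    # 1-byte dtypes (int8, uint8, bool) get BITSHUFFLE; everything else
    # defaults to NOSHUFFLE so turning shuffling on is an explicit choice.
    if np.dtype(dtype).itemsize == 1:
        return numcodecs.Blosc.BITSHUFFLE
    return numcodecs.Blosc.NOSHUFFLE

assert choose_shuffle("i1") == numcodecs.Blosc.BITSHUFFLE
assert choose_shuffle(bool) == numcodecs.Blosc.BITSHUFFLE
assert choose_shuffle("i4") == numcodecs.Blosc.NOSHUFFLE
```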
@@ -1072,7 +1267,12 @@ class ZarrConversionSpec:

     @staticmethod
     def fromdict(d):
-        ret = ZarrConversionSpec(**d)
+        if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION:
+            raise ValueError(
+                "Zarr schema format version mismatch: "
+                f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
+            )
+        ret = VcfZarrSchema(**d)
         ret.columns = {
             key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
         }
@@ -1080,19 +1280,20 @@ class ZarrConversionSpec:

     @staticmethod
     def fromjson(s):
-        return ZarrConversionSpec.fromdict(json.loads(s))
+        return VcfZarrSchema.fromdict(json.loads(s))

     @staticmethod
-    def generate(pcvcf, chunk_length=None, chunk_width=None):
-        m = pcvcf.num_records
-        n = pcvcf.num_samples
+    def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
+        m = icf.num_records
+        n = icf.num_samples
         # FIXME
-        if chunk_width is None:
-            chunk_width = 1000
-        if chunk_length is None:
-            chunk_length = 10_000
-        logger.info(
-            f"Generating schema with chunks={chunk_length, chunk_width}"
+        if samples_chunk_size is None:
+            samples_chunk_size = 1000
+        if variants_chunk_size is None:
+            variants_chunk_size = 10_000
+        logger.info(
+            f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
         )

         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1104,13 +1305,12 @@ class ZarrConversionSpec:
                 shape=shape,
                 description="",
                 dimensions=dimensions,
-                chunks=[chunk_length],
-                compressor=compressor,
+                chunks=[variants_chunk_size],
             )

-        alt_col = pcvcf.columns["ALT"]
+        alt_col = icf.columns["ALT"]
         max_alleles = alt_col.vcf_field.summary.max_number + 1
-        num_filters = len(pcvcf.metadata.filters)
+        num_filters = len(icf.metadata.filters)

         # # FIXME get dtype from lookup table
         colspecs = [
@@ -1153,7 +1353,7 @@ class ZarrConversionSpec:
         ]

         gt_field = None
-        for field in pcvcf.metadata.fields:
+        for field in icf.metadata.fields:
             if field.category == "fixed":
                 continue
             if field.name == "GT":
@@ -1162,11 +1362,11 @@ class ZarrConversionSpec:
             shape = [m]
             prefix = "variant_"
             dimensions = ["variants"]
-            chunks = [chunk_length]
+            chunks = [variants_chunk_size]
             if field.category == "FORMAT":
                 prefix = "call_"
                 shape.append(n)
-                chunks.append(chunk_width)
+                chunks.append(samples_chunk_size),
                 dimensions.append("samples")
             # TODO make an option to add in the empty extra dimension
             if field.summary.max_number > 1:
@@ -1181,14 +1381,13 @@ class ZarrConversionSpec:
                 chunks=chunks,
                 dimensions=dimensions,
                 description=field.description,
-                compressor=compressor,
             )
             colspecs.append(colspec)

         if gt_field is not None:
             ploidy = gt_field.summary.max_number - 1
             shape = [m, n]
-            chunks = [chunk_length, chunk_width]
+            chunks = [variants_chunk_size, samples_chunk_size]
             dimensions = ["variants", "samples"]

             colspecs.append(
@@ -1200,7 +1399,6 @@ class ZarrConversionSpec:
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             shape += [ploidy]
@@ -1214,7 +1412,6 @@ class ZarrConversionSpec:
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             colspecs.append(
@@ -1226,47 +1423,100 @@ class ZarrConversionSpec:
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )

-        return ZarrConversionSpec(
-
-            chunk_width=chunk_width,
-            chunk_length=chunk_length,
+        return VcfZarrSchema(
+            format_version=ZARR_SCHEMA_FORMAT_VERSION,
+            samples_chunk_size=samples_chunk_size,
+            variants_chunk_size=variants_chunk_size,
             columns={col.name: col for col in colspecs},
             dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
-            sample_id=pcvcf.metadata.samples,
-            contig_id=pcvcf.metadata.contig_names,
-            contig_length=pcvcf.metadata.contig_lengths,
-            filter_id=pcvcf.metadata.filters,
+            sample_id=icf.metadata.samples,
+            contig_id=icf.metadata.contig_names,
+            contig_length=icf.metadata.contig_lengths,
+            filter_id=icf.metadata.filters,
         )


-class SgvcfZarr:
+class VcfZarr:
     def __init__(self, path):
+        if not (path / ".zmetadata").exists():
+            raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+        self.root = zarr.open(path, mode="r")
+
+    def __repr__(self):
+        return repr(self.root)  # NEEDS TEST
+
+    def summary_table(self):
+        data = []
+        arrays = [(a.nbytes_stored, a) for _, a in self.root.arrays()]
+        arrays.sort(key=lambda x: x[0])
+        for stored, array in reversed(arrays):
+            d = {
+                "name": array.name,
+                "dtype": str(array.dtype),
+                "stored": display_size(stored),
+                "size": display_size(array.nbytes),
+                "ratio": display_number(array.nbytes / stored),
+                "nchunks": str(array.nchunks),
+                "chunk_size": display_size(array.nbytes / array.nchunks),
+                "avg_chunk_stored": display_size(int(stored / array.nchunks)),
+                "shape": str(array.shape),
+                "chunk_shape": str(array.chunks),
+                "compressor": str(array.compressor),
+                "filters": str(array.filters),
+            }
+            data.append(d)
+        return data
+
+
+@dataclasses.dataclass
+class EncodingWork:
+    func: callable = dataclasses.field(repr=False)
+    start: int
+    stop: int
+    columns: list[str]
+    memory: int = 0
+
+
+class VcfZarrWriter:
+    def __init__(self, path, icf, schema):
         self.path = pathlib.Path(path)
-        self.root = None
+        self.icf = icf
+        self.schema = schema
+        store = zarr.DirectoryStore(self.path)
+        self.root = zarr.group(store=store)

-    def create_array(self, variable):
+    def init_array(self, variable):
         # print("CREATE", variable)
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
         a = self.root.empty(
-            variable.name,
+            "wip_" + variable.name,
             shape=variable.shape,
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
+            filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions

-    def encode_column_slice(self, pcvcf, column, start, stop):
-        source_col = pcvcf.columns[column.vcf_field]
-        array = self.root[column.name]
+    def get_array(self, name):
+        return self.root["wip_" + name]
+
+    def finalise_array(self, variable_name):
+        source = self.path / ("wip_" + variable_name)
+        dest = self.path / variable_name
+        # Atomic swap
+        os.rename(source, dest)
+        logger.info(f"Finalised {variable_name}")
+
+    def encode_array_slice(self, column, start, stop):
+        source_col = self.icf.columns[column.vcf_field]
+        array = self.get_array(column.name)
         ba = core.BufferedArray(array, start)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)

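Note: every Zarr array is created under a `wip_` name and only renamed into place once fully written, so readers looking for the final name never observe a partially written array. The pattern, reduced to its core (names hypothetical):

```python
import os
import pathlib

def finalise_array(path, name):
    # os.rename of a directory within one filesystem is atomic on POSIX:
    # "variant_position" either doesn't exist yet or is fully written.
    path = pathlib.Path(path)
    os.rename(path / f"wip_{name}", path / name)
```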
@@ -1278,11 +1528,11 @@ class SgvcfZarr:
             ba.flush()
         logger.debug(f"Encoded {column.name} slice {start}:{stop}")

-    def encode_genotypes_slice(self, pcvcf, start, stop):
-        source_col = pcvcf.columns["FORMAT/GT"]
-        gt = core.BufferedArray(self.root["call_genotype"], start)
-        gt_mask = core.BufferedArray(self.root["call_genotype_mask"], start)
-        gt_phased = core.BufferedArray(self.root["call_genotype_phased"], start)
+    def encode_genotypes_slice(self, start, stop):
+        source_col = self.icf.columns["FORMAT/GT"]
+        gt = core.BufferedArray(self.get_array("call_genotype"), start)
+        gt_mask = core.BufferedArray(self.get_array("call_genotype_mask"), start)
+        gt_phased = core.BufferedArray(self.get_array("call_genotype_phased"), start)

         for value in source_col.iter_values(start, stop):
             j = gt.next_buffer_row()
@@ -1298,10 +1548,10 @@ class SgvcfZarr:
         gt_mask.flush()
         logger.debug(f"Encoded GT slice {start}:{stop}")

-    def encode_alleles_slice(self, pcvcf, start, stop):
-        ref_col = pcvcf.columns["REF"]
-        alt_col = pcvcf.columns["ALT"]
-        alleles = core.BufferedArray(self.root["variant_allele"], start)
+    def encode_alleles_slice(self, start, stop):
+        ref_col = self.icf.columns["REF"]
+        alt_col = self.icf.columns["ALT"]
+        alleles = core.BufferedArray(self.get_array("variant_allele"), start)

         for ref, alt in zip(
             ref_col.iter_values(start, stop), alt_col.iter_values(start, stop)
@@ -1313,10 +1563,10 @@ class SgvcfZarr:
         alleles.flush()
         logger.debug(f"Encoded alleles slice {start}:{stop}")

-    def encode_id_slice(self, pcvcf, start, stop):
-        col = pcvcf.columns["ID"]
-        vid = core.BufferedArray(self.root["variant_id"], start)
-        vid_mask = core.BufferedArray(self.root["variant_id_mask"], start)
+    def encode_id_slice(self, start, stop):
+        col = self.icf.columns["ID"]
+        vid = core.BufferedArray(self.get_array("variant_id"), start)
+        vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)

         for value in col.iter_values(start, stop):
             j = vid.next_buffer_row()
@@ -1332,182 +1582,246 @@ class SgvcfZarr:
         vid_mask.flush()
         logger.debug(f"Encoded ID slice {start}:{stop}")

-    def encode_filters_slice(self, pcvcf, lookup, start, stop):
-        col = pcvcf.columns["FILTERS"]
-        var_filter = core.BufferedArray(self.root["variant_filter"], start)
+    def encode_filters_slice(self, lookup, start, stop):
+        col = self.icf.columns["FILTERS"]
+        var_filter = core.BufferedArray(self.get_array("variant_filter"), start)

         for value in col.iter_values(start, stop):
             j = var_filter.next_buffer_row()
             var_filter.buff[j] = False
-
-
+            for f in value:
+                try:
                     var_filter.buff[j, lookup[f]] = True
-
-
+                except KeyError:
+                    raise ValueError(f"Filter '{f}' was not defined in the header.")
         var_filter.flush()
         logger.debug(f"Encoded FILTERS slice {start}:{stop}")

-    def encode_contig_slice(self, pcvcf, lookup, start, stop):
-        col = pcvcf.columns["CHROM"]
-        contig = core.BufferedArray(self.root["variant_contig"], start)
+    def encode_contig_slice(self, lookup, start, stop):
+        col = self.icf.columns["CHROM"]
+        contig = core.BufferedArray(self.get_array("variant_contig"), start)

         for value in col.iter_values(start, stop):
             j = contig.next_buffer_row()
-
-
-
-
-
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that we the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            contig.buff[j] = lookup[value[0]]
         contig.flush()
         logger.debug(f"Encoded CHROM slice {start}:{stop}")

-    def encode_samples(self
-    if not np.array_equal(sample_id,
-            raise ValueError(
+    def encode_samples(self):
+        if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
+            raise ValueError(
+                "Subsetting or reordering samples not supported currently"
+            )  # NEEDS TEST
         array = self.root.array(
             "sample_id",
-            sample_id,
+            self.schema.sample_id,
             dtype="str",
-            compressor=
-            chunks=(
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+            chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")

-    def encode_contig_id(self
+    def encode_contig_id(self):
         array = self.root.array(
             "contig_id",
-
+            self.schema.contig_id,
             dtype="str",
-            compressor=
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if
+        if self.schema.contig_length is not None:
             array = self.root.array(
                 "contig_length",
-
+                self.schema.contig_length,
                 dtype=np.int64,
             )
             array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        return {v: j for j, v in enumerate(
+        return {v: j for j, v in enumerate(self.schema.contig_id)}

-    def encode_filter_id(self
+    def encode_filter_id(self):
         array = self.root.array(
             "filter_id",
-
+            self.schema.filter_id,
             dtype="str",
-            compressor=
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
-        return {v: j for j, v in enumerate(
+        return {v: j for j, v in enumerate(self.schema.filter_id)}
+
+    def init(self):
+        self.root.attrs["vcf_zarr_version"] = "0.2"
+        self.root.attrs["vcf_header"] = self.icf.vcf_header
+        self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
+        for column in self.schema.columns.values():
+            self.init_array(column)
+
+    def finalise(self):
+        # for column in self.schema.columns.values():
+        #     self.finalise_array(column)
+        zarr.consolidate_metadata(self.path)

-    @staticmethod
     def encode(
-        pcvcf,
-        path,
-        conversion_spec,
-        *,
+        self,
         worker_processes=1,
         max_v_chunks=None,
         show_progress=False,
+        max_memory=None,
     ):
-
-
-
-
-
-
-        store = zarr.DirectoryStore(write_path)
-        # FIXME, duplicating logic about the store
-        logger.info(f"Create zarr at {write_path}")
-        sgvcf = SgvcfZarr(write_path)
-        sgvcf.root = zarr.group(store=store, overwrite=True)
-        for column in conversion_spec.columns.values():
-            sgvcf.create_array(column)
-
-        sgvcf.root.attrs["vcf_zarr_version"] = "0.2"
-        sgvcf.root.attrs["vcf_header"] = pcvcf.vcf_header
-        sgvcf.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
+        if max_memory is None:
+            # Unbounded
+            max_memory = 2**63
+        else:
+            # Value is specified in Mibibytes
+            max_memory *= 2**20  # NEEDS TEST

+        # TODO this will move into the setup logic later when we're making it possible
+        # to split the work by slice
         num_slices = max(1, worker_processes * 4)
         # Using POS arbitrarily to get the array slices
         slices = core.chunk_aligned_slices(
-            sgvcf.root["variant_position"], num_slices, max_chunks=max_v_chunks
+            self.get_array("variant_position"), num_slices, max_chunks=max_v_chunks
         )
         truncated = slices[-1][-1]
-        for array in sgvcf.root.values():
+        for array in self.root.values():
             if array.attrs["_ARRAY_DIMENSIONS"][0] == "variants":
                 shape = list(array.shape)
                 shape[0] = truncated
                 array.resize(shape)

-
-
-
+        total_bytes = 0
+        encoding_memory_requirements = {}
+        for col in self.schema.columns.values():
+            array = self.get_array(col.name)
+            # NOTE!! this is bad, we're potentially creating quite a large
+            # numpy array for basically nothing. We can compute this.
+            variant_chunk_size = array.blocks[0].nbytes
+            encoding_memory_requirements[col.name] = variant_chunk_size
+            logger.debug(
+                f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+            )
+            total_bytes += array.nbytes
+
+        filter_id_map = self.encode_filter_id()
+        contig_id_map = self.encode_contig_id()
+
+        work = []
+        for start, stop in slices:
+            for col in self.schema.columns.values():
+                if col.vcf_field is not None:
+                    f = functools.partial(self.encode_array_slice, col)
+                    work.append(
+                        EncodingWork(
+                            f,
+                            start,
+                            stop,
+                            [col.name],
+                            encoding_memory_requirements[col.name],
+                        )
+                    )
+            work.append(
+                EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
+            )
+            work.append(
+                EncodingWork(
+                    self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
+                )
+            )
+            work.append(
+                EncodingWork(
+                    functools.partial(self.encode_filters_slice, filter_id_map),
+                    start,
+                    stop,
+                    ["variant_filter"],
+                )
+            )
+            work.append(
+                EncodingWork(
+                    functools.partial(self.encode_contig_slice, contig_id_map),
+                    start,
+                    stop,
+                    ["variant_contig"],
+                )
+            )
+            if "call_genotype" in self.schema.columns:
+                variables = [
+                    "call_genotype",
+                    "call_genotype_phased",
+                    "call_genotype_mask",
+                ]
+                gt_memory = sum(
+                    encoding_memory_requirements[name] for name in variables
+                )
+                work.append(
+                    EncodingWork(
+                        self.encode_genotypes_slice, start, stop, variables, gt_memory
+                    )
+                )
+
+        # Fail early if we can't fit a particular column into memory
+        for wp in work:
+            if wp.memory >= max_memory:
+                raise ValueError(  # NEEDS TEST
+                    f"Insufficient memory for {wp.columns}: "
+                    f"{display_size(wp.memory)} > {display_size(max_memory)}"
+                )

         progress_config = core.ProgressConfig(
-            total=
-            title="Encode
-            units="
+            total=total_bytes,
+            title="Encode",
+            units="B",
             show=show_progress,
         )

-
-
-
-            pcvcf, conversion_spec.contig_id, conversion_spec.contig_length
-        )
+        used_memory = 0
+        max_queued = 4 * max(1, worker_processes)
+        encoded_slices = collections.Counter()

         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            pwm.submit(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            logger.info(f"Submit encode {col.name} in {len(slices)} slices")
-            for start, stop in slices:
-                pwm.submit(
-                    sgvcf.encode_column_slice, pcvcf, col, start, stop
-                )
-
-        zarr.consolidate_metadata(write_path)
-        # Atomic swap, now we've completely finished.
-        logger.info(f"Moving to final path {path}")
-        os.rename(write_path, path)
+            future = pwm.submit(self.encode_samples)
+            future_to_work = {future: EncodingWork(None, 0, 0, [])}
+
+            def service_completed_futures():
+                nonlocal used_memory
+
+                completed = pwm.wait_for_completed()
+                for future in completed:
+                    wp_done = future_to_work.pop(future)
+                    used_memory -= wp_done.memory
+                    logger.debug(
+                        f"Complete {wp_done}: used mem={display_size(used_memory)}"
+                    )
+                    for column in wp_done.columns:
+                        encoded_slices[column] += 1
+                        if encoded_slices[column] == len(slices):
+                            # Do this syncronously for simplicity. Should be
|
|
1802
|
+
# fine as the workers will probably be busy with
|
|
1803
|
+
# large encode tasks most of the time.
|
|
1804
|
+
self.finalise_array(column)
|
|
1805
|
+
|
|
1806
|
+
for wp in work:
|
|
1807
|
+
if (
|
|
1808
|
+
used_memory + wp.memory > max_memory
|
|
1809
|
+
or len(future_to_work) > max_queued
|
|
1810
|
+
):
|
|
1811
|
+
service_completed_futures()
|
|
1812
|
+
future = pwm.submit(wp.func, wp.start, wp.stop)
|
|
1813
|
+
used_memory += wp.memory
|
|
1814
|
+
logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
|
|
1815
|
+
future_to_work[future] = wp
|
|
1816
|
+
|
|
1817
|
+
logger.debug("All work submitted")
|
|
1818
|
+
while len(future_to_work) > 0:
|
|
1819
|
+
service_completed_futures()
|
|
1506
1820
|
|
|
1507
1821
|
|
|
1508
1822
|
def mkschema(if_path, out):
|
|
1509
|
-
|
|
1510
|
-
spec =
|
|
1823
|
+
icf = IntermediateColumnarFormat(if_path)
|
|
1824
|
+
spec = VcfZarrSchema.generate(icf)
|
|
1511
1825
|
out.write(spec.asjson())
|
|
1512
1826
|
|
|
1513
1827
|
|
|
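The rewritten encode() above replaces 0.0.1's fire-and-forget pwm.submit() loop with a memory-budgeted scheduler: each EncodingWork item carries an estimated peak memory cost, submission pauses whenever the running total would exceed max_memory (or the queue grows past max_queued), and completed futures hand back their reservation. Below is a minimal, self-contained sketch of the same pattern using only the standard library; Task and run_bounded are illustrative stand-ins, not bio2zarr's EncodingWork/ParallelWorkManager API, and bio2zarr uses worker processes where this sketch uses threads.

    import concurrent.futures
    import dataclasses


    @dataclasses.dataclass
    class Task:
        func: object   # zero-argument callable to run on a worker
        memory: int    # estimated peak bytes while the task runs


    def run_bounded(tasks, max_memory, max_workers=4):
        used = 0
        in_flight = {}  # future -> Task, mirrors future_to_work above
        with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
            for task in tasks:
                # Drain completed futures until the budget has room; this is
                # the role service_completed_futures() plays in the diff.
                while in_flight and used + task.memory > max_memory:
                    done, _ = concurrent.futures.wait(
                        in_flight, return_when=concurrent.futures.FIRST_COMPLETED
                    )
                    for future in done:
                        future.result()  # propagate worker errors
                        used -= in_flight.pop(future).memory
                future = executor.submit(task.func)
                in_flight[future] = task
                used += task.memory
            # Equivalent of the final "while len(future_to_work) > 0" loop.
            for future in concurrent.futures.as_completed(list(in_flight)):
                future.result()

Note that if a single task can never fit the budget this sketch would submit it anyway; the diff instead fails early with the "Insufficient memory" ValueError before any work is queued.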
@@ -1515,42 +1829,49 @@ def encode(
     if_path,
     zarr_path,
     schema_path=None,
-
-
+    variants_chunk_size=None,
+    samples_chunk_size=None,
     max_v_chunks=None,
+    max_memory=None,
     worker_processes=1,
     show_progress=False,
 ):
-
+    icf = IntermediateColumnarFormat(if_path)
     if schema_path is None:
-        schema =
-
-
-
+        schema = VcfZarrSchema.generate(
+            icf,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
         )
     else:
         logger.info(f"Reading schema from {schema_path}")
-        if
-            raise ValueError(
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
         with open(schema_path, "r") as f:
-            schema =
-
-
-
-    zarr_path
-
+            schema = VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    if zarr_path.exists():
+        logger.warning(f"Deleting existing {zarr_path}")
+        shutil.rmtree(zarr_path)
+    vzw = VcfZarrWriter(zarr_path, icf, schema)
+    vzw.init()
+    vzw.encode(
         max_v_chunks=max_v_chunks,
         worker_processes=worker_processes,
+        max_memory=max_memory,
         show_progress=show_progress,
     )
+    vzw.finalise()
 
 
 def convert(
     vcfs,
     out_path,
     *,
-
-
+    variants_chunk_size=None,
+    samples_chunk_size=None,
     worker_processes=1,
     show_progress=False,
     # TODO add arguments to control location of tmpdir
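For reference, a hypothetical invocation of the reworked top-level encode() above; the paths and sizes are illustrative only, and max_memory is interpreted as MiB per the conversion in VcfZarrWriter.encode. Passing schema_path together with either chunk size raises the ValueError shown in the hunk.

    from bio2zarr import vcf

    # Hypothetical paths; the .icf directory is produced by the explode step.
    vcf.encode(
        "sample.icf",
        "sample.vcz",
        variants_chunk_size=10_000,   # mutually exclusive with schema_path
        samples_chunk_size=1_000,
        max_memory=4096,              # MiB budget for in-flight encode work
        worker_processes=8,
        show_progress=True,
    )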
@@ -1565,8 +1886,8 @@ def convert(
     encode(
         if_dir,
         out_path,
-
-
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
         worker_processes=worker_processes,
         show_progress=show_progress,
     )
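convert() now simply threads the chunk sizes through to encode(), which in turn relies on core.chunk_aligned_slices (used in VcfZarrWriter.encode above) to split the variant dimension into work slices. The following is an assumption-based sketch of what such a helper plausibly computes, written for illustration only; the real helper takes a Zarr array and reads its shape and chunking, while this sketch takes the numbers directly.

    def chunk_aligned_slices(num_records, chunk_size, n, max_chunks=None):
        """Split [0, num_records) into ~n slices whose boundaries fall on
        chunk boundaries, optionally capping the total chunks covered."""
        num_chunks = -(-num_records // chunk_size)  # ceiling division
        if max_chunks is not None:
            num_chunks = min(num_chunks, max_chunks)
        per_slice = max(1, num_chunks // n)
        slices = []
        for chunk in range(0, num_chunks, per_slice):
            start = chunk * chunk_size
            stop = min((chunk + per_slice) * chunk_size, num_records)
            slices.append((start, stop))
        return slices


    assert chunk_aligned_slices(100, 10, 4) == [
        (0, 20), (20, 40), (40, 60), (60, 80), (80, 100)
    ]
    # With max_chunks set, the last stop falls short of num_records, which
    # is why encode() resizes every variant-dimension array down to
    # truncated = slices[-1][-1].
    assert chunk_aligned_slices(100, 10, 4, max_chunks=5)[-1] == (40, 50)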
@@ -1744,16 +2065,14 @@ def validate(vcf_path, zarr_path, show_progress=False):
         name = colname.split("_", 1)[1]
         if name.isupper():
             vcf_type = info_headers[name]["Type"]
-            # print(root[colname])
             info_fields[name] = vcf_type, iter(root[colname])
-    # print(info_fields)
 
     first_pos = next(vcf).POS
     start_index = np.searchsorted(pos, first_pos)
     assert pos[start_index] == first_pos
     vcf = cyvcf2.VCF(vcf_path)
     if show_progress:
-        iterator = tqdm.tqdm(vcf, desc="
+        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
     else:
         iterator = vcf
     for j, row in enumerate(iterator, start_index):
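The hunk above aligns the VCF iterator with the Zarr data by locating the first record's POS in the stored positions via np.searchsorted; a toy illustration with made-up values:

    import numpy as np

    pos = np.array([100, 250, 300, 472, 519])  # variant_position from the Zarr store
    first_pos = 300                            # POS of the first record in this VCF
    start_index = np.searchsorted(pos, first_pos)
    assert start_index == 2 and pos[start_index] == first_pos
    # Validation then walks the VCF and Zarr rows in lockstep from index 2.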
@@ -1790,11 +2109,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
         assert_info_val_equal(vcf_val, zarr_val, vcf_type)
 
     for name, (vcf_type, zarr_iter) in format_fields.items():
-        vcf_val = None
-        try:
-            vcf_val = row.format(name)
-        except KeyError:
-            pass
+        vcf_val = row.format(name)
         zarr_val = next(zarr_iter)
         if vcf_val is None:
             assert_format_val_missing(zarr_val, vcf_type)
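The try/except removal works because, as the retained None check implies, row.format(name) in current cyvcf2 returns None when a record lacks the FORMAT field rather than raising KeyError. A small sketch of relying on that behaviour, with a hypothetical path and field:

    import cyvcf2

    def format_or_none(vcf_path, field):
        # Returns the FORMAT array for `field` on the first record, or None
        # when the record does not carry the field.
        for row in cyvcf2.VCF(vcf_path):
            return row.format(field)

    values = format_or_none("sample.vcf.gz", "DP")
    if values is None:
        print("first record has no DP values")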