bio2zarr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of bio2zarr has been flagged as a potentially problematic release by the registry.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +91 -24
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +314 -189
- bio2zarr/tskit.py +301 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +614 -118
- bio2zarr/vcf_utils.py +66 -33
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +544 -708
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/METADATA +19 -7
- bio2zarr-0.1.6.dist-info/RECORD +21 -0
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.4.dist-info/RECORD +0 -21
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info/licenses}/LICENSE +0 -0
- {bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/top_level.txt +0 -0
@@ -6,14 +6,17 @@ import logging
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
+import tempfile
+from functools import partial
 from typing import Any

 import numcodecs
 import numpy as np

-from
+from . import constants, core, provenance, vcf_utils, vcz

 logger = logging.getLogger(__name__)

@@ -77,6 +80,14 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"

+    @property
+    def max_number(self):
+        if self.vcf_number in ("R", "A", "G", "."):
+            return self.summary.max_number
+        else:
+            # use declared number if larger than max found
+            return max(self.summary.max_number, int(self.vcf_number))
+
    def smallest_dtype(self):
        """
        Returns the smallest dtype suitable for this field based
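For context: the new `max_number` property picks a field's dimension from either the declared VCF Number or the maximum observed while scanning. A minimal, self-contained sketch of that rule (the standalone helper below is hypothetical; in the package this is a property reading `self.summary.max_number`):

def resolve_max_number(vcf_number: str, observed_max: int) -> int:
    # Variable-length fields (R, A, G, .) can only use the observed maximum.
    if vcf_number in ("R", "A", "G", "."):
        return observed_max
    # Fixed-length fields: trust the declared Number if it is larger.
    return max(observed_max, int(vcf_number))

assert resolve_max_number("A", 2) == 2
assert resolve_max_number("3", 2) == 3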
@@ -116,23 +127,6 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
 )


-@dataclasses.dataclass
-class Contig:
-    id: str
-    length: int = None
-
-
-@dataclasses.dataclass
-class Sample:
-    id: str
-
-
-@dataclasses.dataclass
-class Filter:
-    id: str
-    description: str = ""
-
-
 @dataclasses.dataclass
 class IcfMetadata(core.JsonDataclass):
     samples: list
@@ -187,9 +181,9 @@ class IcfMetadata(core.JsonDataclass):
         d = d.copy()
         d["partitions"] = partitions
         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
-        d["samples"] = [Sample(**sd) for sd in d["samples"]]
-        d["filters"] = [Filter(**fd) for fd in d["filters"]]
-        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
+        d["samples"] = [vcz.Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [vcz.Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [vcz.Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)

     def __eq__(self, other):
@@ -228,8 +222,8 @@ def fixed_vcf_field_definitions():


 def scan_vcf(path, target_num_partitions):
-    with vcf_utils.
-        vcf =
+    with vcf_utils.VcfFile(path) as vcf_file:
+        vcf = vcf_file.vcf
         filters = []
         pass_index = -1
         for h in vcf.header_iter():
@@ -240,7 +234,7 @@ def scan_vcf(path, target_num_partitions):
                 description = ""
                 if h["ID"] == "PASS":
                     pass_index = len(filters)
-                filters.append(Filter(h["ID"], description))
+                filters.append(vcz.Filter(h["ID"], description))

         # Ensure PASS is the first filter if present
         if pass_index > 0:
@@ -262,18 +256,18 @@ def scan_vcf(path, target_num_partitions):
         contig_lengths = [None for _ in vcf.seqnames]

         metadata = IcfMetadata(
-            samples=[Sample(sample_id) for sample_id in vcf.samples],
+            samples=[vcz.Sample(sample_id) for sample_id in vcf.samples],
             contigs=[
-                Contig(contig_id, length)
+                vcz.Contig(contig_id, length)
                 for contig_id, length in zip(vcf.seqnames, contig_lengths)
             ],
             filters=filters,
             fields=fields,
             partitions=[],
-            num_records=sum(
+            num_records=sum(vcf_file.contig_record_counts().values()),
         )

-        regions =
+        regions = vcf_file.partition_into_regions(num_parts=target_num_partitions)
         for region in regions:
             metadata.partitions.append(
                 VcfPartition(
@@ -291,7 +285,12 @@ def scan_vcf(path, target_num_partitions):
     return metadata, vcf.raw_header


-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
@@ -324,14 +323,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # are compatible.
     all_partitions = []
     total_records = 0
+    contigs = {}
     for metadata, _ in results:
         for partition in metadata.partitions:
             logger.debug(f"Scanned partition {partition}")
             all_partitions.append(partition)
+        for contig in metadata.contigs:
+            if contig.id in contigs:
+                if contig != contigs[contig.id]:
+                    raise ValueError(
+                        "Incompatible contig definitions: "
+                        f"{contig} != {contigs[contig.id]}"
+                    )
+            else:
+                contigs[contig.id] = contig
         total_records += metadata.num_records
         metadata.num_records = 0
         metadata.partitions = []

+    contig_union = list(contigs.values())
+    for metadata, _ in results:
+        metadata.contigs = contig_union
+
     icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
         if metadata != icf_metadata:
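For context: scan_vcfs now unions contig definitions across all input VCFs and rejects conflicting definitions for the same ID. A small standalone sketch of the merge rule (the Contig dataclass here mirrors the one this release moves into vcz):

import dataclasses

@dataclasses.dataclass
class Contig:
    id: str
    length: int = None

contigs = {}
for contig in [Contig("chr1", 1000), Contig("chr2", 2000), Contig("chr1", 1000)]:
    # A repeated ID is fine only if the definition is identical.
    if contig.id in contigs and contig != contigs[contig.id]:
        raise ValueError(f"Incompatible contig definitions: {contig}")
    contigs[contig.id] = contig

print(list(contigs))  # ['chr1', 'chr2']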
@@ -352,64 +365,58 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     return icf_metadata, header


-def sanitise_value_bool(
+def sanitise_value_bool(shape, value):
     x = True
     if value is None:
         x = False
-
+    return x


-def sanitise_value_float_scalar(
+def sanitise_value_float_scalar(shape, value):
     x = value
     if value is None:
         x = [constants.FLOAT32_MISSING]
-
+    return x[0]


-def sanitise_value_int_scalar(
+def sanitise_value_int_scalar(shape, value):
     x = value
     if value is None:
-        # print("MISSING", INT_MISSING, INT_FILL)
         x = [constants.INT_MISSING]
     else:
         x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
-
+    return x[0]


-def sanitise_value_string_scalar(
+def sanitise_value_string_scalar(shape, value):
     if value is None:
-
+        return "."
     else:
-
+        return value[0]


-def sanitise_value_string_1d(
+def sanitise_value_string_1d(shape, value):
     if value is None:
-
+        return np.full(shape, ".", dtype="O")
     else:
-        # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
-        # FIXME failure isn't coming from here, it seems to be from an
-        # incorrectly detected dimension in the zarr array
-        # The dimesions look all wrong, and the dtype should be Object
-        # not str
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, "", dtype=value.dtype)
+        result[: value.shape[0]] = value
+        return result


-def sanitise_value_string_2d(
+def sanitise_value_string_2d(shape, value):
     if value is None:
-
+        return np.full(shape, ".", dtype="O")
     else:
-
-        # assert value.ndim == 2
-        buff[j] = ""
+        result = np.full(shape, "", dtype="O")
         if value.ndim == 2:
-
+            result[: value.shape[0], : value.shape[1]] = value
         else:
-            #
+            # Convert 1D array into 2D with appropriate shape
             for k, val in enumerate(value):
-
+                result[k, : len(val)] = val
+        return result


 def drop_empty_second_dim(value):
@@ -419,27 +426,28 @@ def drop_empty_second_dim(value):
     return value


-def sanitise_value_float_1d(
+def sanitise_value_float_1d(shape, value):
     if value is None:
-
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        value = np.array(value, ndmin=1, dtype=
+        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[: value.shape[0]] = value
+        return result


-def sanitise_value_float_2d(
+def sanitise_value_float_2d(shape, value):
     if value is None:
-
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-
-
-
-
+        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[:, : value.shape[1]] = value
+        return result


 def sanitise_int_array(value, ndmin, dtype):
@@ -454,23 +462,25 @@ def sanitise_int_array(value, ndmin, dtype):
     return value.astype(dtype)


-def sanitise_value_int_1d(
+def sanitise_value_int_1d(shape, value):
     if value is None:
-
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 1,
+        value = sanitise_int_array(value, 1, np.int32)
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, -2, dtype=np.int32)
+        result[: value.shape[0]] = value
+        return result


-def sanitise_value_int_2d(
+def sanitise_value_int_2d(shape, value):
     if value is None:
-
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 2,
-
-
+        value = sanitise_int_array(value, 2, np.int32)
+        result = np.full(shape, -2, dtype=np.int32)
+        result[:, : value.shape[1]] = value
+        return result


 missing_value_map = {
@@ -634,7 +644,8 @@ class IntermediateColumnarFormatField:
         chunk_cumulative_records = self.chunk_record_index(partition_id)
         chunk_num_records = np.diff(chunk_cumulative_records)
         for count, cumulative in zip(
-            chunk_num_records[start_chunk:],
+            chunk_num_records[start_chunk:],
+            chunk_cumulative_records[start_chunk + 1 :],
         ):
             path = partition_path / f"{cumulative}"
             chunk = self.read_chunk(path)
@@ -693,36 +704,32 @@ class IntermediateColumnarFormatField:
         return ret

     def sanitiser_factory(self, shape):
-        """
-        Return a function that sanitised values from this column
-        and writes into a buffer of the specified shape.
-        """
-        assert len(shape) <= 3
+        assert len(shape) <= 2
         if self.vcf_field.vcf_type == "Flag":
-            assert len(shape) ==
-            return sanitise_value_bool
+            assert len(shape) == 0
+            return partial(sanitise_value_bool, shape)
         elif self.vcf_field.vcf_type == "Float":
-            if len(shape) ==
-                return sanitise_value_float_scalar
-            elif len(shape) ==
-                return sanitise_value_float_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_float_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_float_1d, shape)
             else:
-                return sanitise_value_float_2d
+                return partial(sanitise_value_float_2d, shape)
         elif self.vcf_field.vcf_type == "Integer":
-            if len(shape) ==
-                return sanitise_value_int_scalar
-            elif len(shape) ==
-                return sanitise_value_int_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_int_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_int_1d, shape)
             else:
-                return sanitise_value_int_2d
+                return partial(sanitise_value_int_2d, shape)
         else:
             assert self.vcf_field.vcf_type in ("String", "Character")
-            if len(shape) ==
-                return sanitise_value_string_scalar
-            elif len(shape) ==
-                return sanitise_value_string_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_string_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_string_1d, shape)
             else:
-                return sanitise_value_string_2d
+                return partial(sanitise_value_string_2d, shape)


 @dataclasses.dataclass
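For context: the sanitisers now take (shape, value) and return a freshly built array instead of writing into a caller-supplied buffer, so sanitiser_factory binds the shape with functools.partial and hands back a one-argument callable. A simplified sketch (fill values and helper calls reduced from the real functions):

from functools import partial

import numpy as np

def sanitise_value_int_1d(shape, value):
    # None means the field was absent: fill with the missing sentinel.
    if value is None:
        return np.full(shape, -1)
    # Otherwise pad a fixed-shape array with the fill sentinel.
    result = np.full(shape, -2, dtype=np.int32)
    result[: len(value)] = value
    return result

sanitiser = partial(sanitise_value_int_1d, (4,))
print(sanitiser([1, 2]))  # [ 1  2 -2 -2]
print(sanitiser(None))    # [-1 -1 -1 -1]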
@@ -829,9 +836,66 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False


-class IntermediateColumnarFormat(collections.abc.Mapping):
+def convert_local_allele_field_types(fields, schema_instance):
+    """
+    Update the specified list of fields to include the LAA field, and to convert
+    any supported localisable fields to the L* counterpart.
+
+    Note that we currently support only two ALT alleles per sample, and so the
+    dimensions of these fields are fixed by that requirement. Later versions may
+    use summary data storted in the ICF to make different choices, if information
+    about subsequent alleles (not in the actual genotype calls) should also be
+    stored.
+    """
+    fields_by_name = {field.name: field for field in fields}
+    gt = fields_by_name["call_genotype"]
+
+    if schema_instance.get_shape(["ploidy"])[0] != 2:
+        raise ValueError("Local alleles only supported on diploid data")
+
+    dimensions = gt.dimensions[:-1]
+
+    la = vcz.ZarrArraySpec(
+        name="call_LA",
+        dtype="i1",
+        dimensions=(*dimensions, "local_alleles"),
+        description=(
+            "0-based indices into REF+ALT, indicating which alleles"
+            " are relevant (local) for the current sample"
+        ),
+    )
+    schema_instance.dimensions["local_alleles"] = vcz.VcfZarrDimension.unchunked(
+        schema_instance.dimensions["ploidy"].size
+    )
+
+    ad = fields_by_name.get("call_AD", None)
+    if ad is not None:
+        # TODO check if call_LAD is in the list already
+        ad.name = "call_LAD"
+        ad.source = None
+        ad.dimensions = (*dimensions, "local_alleles_AD")
+        ad.description += " (local-alleles)"
+        schema_instance.dimensions["local_alleles_AD"] = vcz.VcfZarrDimension.unchunked(
+            2
+        )
+
+    pl = fields_by_name.get("call_PL", None)
+    if pl is not None:
+        # TODO check if call_LPL is in the list already
+        pl.name = "call_LPL"
+        pl.source = None
+        pl.description += " (local-alleles)"
+        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1].split("_")[-1])
+        schema_instance.dimensions["local_" + pl.dimensions[-1].split("_")[-1]] = (
+            vcz.VcfZarrDimension.unchunked(3)
+        )
+
+    return [*fields, la]
+
+
+class IntermediateColumnarFormat(vcz.Source):
     def __init__(self, path):
-        self.
+        self._path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
         # directory is either a WIP or the wrong format.
         with open(self.path / "metadata.json") as f:
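For context: the hard-coded dimension sizes above follow from the diploid restriction stated in the docstring. With at most two local alleles per sample, call_LAD needs only two depth values, and call_LPL needs C(2 + 2 - 1, 2) = 3 likelihoods for the local genotypes. A small arithmetic check of that reasoning:

import math

ploidy = 2
local_alleles = 2  # fixed by the diploid-only requirement noted above
assert math.comb(local_alleles + ploidy - 1, ploidy) == 3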
@@ -845,8 +909,12 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
+        self.gt_field = None
         for field in self.metadata.fields:
             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
+            if field.name == "GT":
+                self.gt_field = field
+
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
             f"records={self.num_records}, fields={self.num_fields})"
@@ -854,20 +922,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):

     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"IntermediateColumnarFormat(fields={len(self.fields)}, "
             f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )

-    def __getitem__(self, key):
-        return self.fields[key]
-
-    def __iter__(self):
-        return iter(self.fields)
-
-    def __len__(self):
-        return len(self.fields)
-
     def summary_table(self):
         data = []
         for name, icf_field in self.fields.items():
@@ -886,6 +945,10 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data

+    @property
+    def path(self):
+        return self._path
+
     @property
     def num_records(self):
         return self.metadata.num_records
@@ -894,6 +957,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_partitions(self):
         return len(self.metadata.partitions)

+    @property
+    def samples(self):
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
+
     @property
     def num_samples(self):
         return len(self.metadata.samples)
@@ -902,6 +977,261 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_fields(self):
         return len(self.fields)

+    @property
+    def root_attrs(self):
+        meta_information_pattern = re.compile("##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            match = re.fullmatch(meta_information_pattern, line)
+            if match:
+                key = match.group(1)
+                if key in ("contig", "FILTER", "INFO", "FORMAT"):
+                    # these fields are stored in Zarr arrays
+                    continue
+                value = match.group(2)
+                vcf_meta_information.append((key, value))
+        return {
+            "vcf_meta_information": vcf_meta_information,
+        }
+
+    def iter_id(self, start, stop):
+        for value in self.fields["ID"].iter_values(start, stop):
+            if value is not None:
+                yield value[0]
+            else:
+                yield None
+
+    def iter_filters(self, start, stop):
+        source_field = self.fields["FILTERS"]
+        lookup = {filt.id: index for index, filt in enumerate(self.metadata.filters)}
+
+        for filter_values in source_field.iter_values(start, stop):
+            filters = np.zeros(len(self.metadata.filters), dtype=bool)
+            if filter_values is not None:
+                for filter_id in filter_values:
+                    try:
+                        filters[lookup[filter_id]] = True
+                    except KeyError:
+                        raise ValueError(
+                            f"Filter '{filter_id}' was not defined in the header."
+                        ) from None
+            yield filters
+
+    def iter_contig(self, start, stop):
+        source_field = self.fields["CHROM"]
+        lookup = {
+            contig.id: index for index, contig in enumerate(self.metadata.contigs)
+        }
+
+        for value in source_field.iter_values(start, stop):
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that we the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            yield lookup[value[0]]
+
+    def iter_field(self, field_name, shape, start, stop):
+        source_field = self.fields[field_name]
+        sanitiser = source_field.sanitiser_factory(shape)
+        for value in source_field.iter_values(start, stop):
+            yield sanitiser(value)
+
+    def iter_alleles(self, start, stop, num_alleles):
+        ref_field = self.fields["REF"]
+        alt_field = self.fields["ALT"]
+
+        for ref, alt in zip(
+            ref_field.iter_values(start, stop),
+            alt_field.iter_values(start, stop),
+        ):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles[0] = ref[0]
+            alleles[1 : 1 + len(alt)] = alt
+            yield alleles
+
+    def iter_genotypes(self, shape, start, stop):
+        source_field = self.fields["FORMAT/GT"]
+        for value in source_field.iter_values(start, stop):
+            genotypes = value[:, :-1] if value is not None else None
+            phased = value[:, -1] if value is not None else None
+            sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
+            sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
+            # Force haploids to always be phased
+            # https://github.com/sgkit-dev/bio2zarr/issues/399
+            if sanitised_genotypes.shape[1] == 1:
+                sanitised_phased[:] = True
+            yield sanitised_genotypes, sanitised_phased
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
+        if self.gt_field is None or shape is None:
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                yield vcz.VariantData(variant_length, alleles, None, None)
+        else:
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
+                self.iter_alleles(start, stop, num_alleles),
+                self.iter_genotypes(shape, start, stop),
+            ):
+                yield vcz.VariantData(variant_length, alleles, gt, phased)
+
+    def generate_schema(
+        self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
+    ):
+        if local_alleles is None:
+            local_alleles = False
+
+        max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
+
+        # Add ploidy and genotypes dimensions only when needed
+        max_genotypes = 0
+        for field in self.metadata.format_fields:
+            if field.vcf_number == "G":
+                max_genotypes = max(max_genotypes, field.summary.max_number)
+
+        ploidy = None
+        genotypes_size = None
+        if self.gt_field is not None:
+            ploidy = max(self.gt_field.summary.max_number - 1, 1)
+            # NOTE: it's not clear why we're computing this, when we must have had
+            # at least one number=G field to require it anyway?
+            genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
+            # assert max_genotypes == genotypes_size
+        else:
+            if max_genotypes > 0:
+                # there is no GT field, but there is at least one Number=G field,
+                # so need to define genotypes dimension
+                genotypes_size = max_genotypes
+
+        dimensions = vcz.standard_dimensions(
+            variants_size=self.num_records,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=self.num_samples,
+            samples_chunk_size=samples_chunk_size,
+            alleles_size=max_alleles,
+            filters_size=self.metadata.num_filters,
+            ploidy_size=ploidy,
+            genotypes_size=genotypes_size,
+        )
+
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+
+        def spec_from_field(field, array_name=None):
+            return vcz.ZarrArraySpec.from_field(
+                field,
+                schema_instance,
+                array_name=array_name,
+            )
+
+        def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
+                source=source,
+                name=name,
+                dtype=dtype,
+                description="",
+                dimensions=dimensions,
+                compressor=compressor,
+            )
+
+        name_map = {field.full_name: field for field in self.metadata.fields}
+        array_specs = [
+            fixed_field_spec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, self.metadata.num_contigs),
+            ),
+            fixed_field_spec(
+                name="variant_filter",
+                dtype="bool",
+                dimensions=["variants", "filters"],
+            ),
+            fixed_field_spec(
+                name="variant_allele",
+                dtype="O",
+                dimensions=["variants", "alleles"],
+            ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
+            fixed_field_spec(
+                name="variant_id",
+                dtype="O",
+            ),
+            fixed_field_spec(
+                name="variant_id_mask",
+                dtype="bool",
+            ),
+        ]
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        array_specs.extend(
+            [
+                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
+                spec_from_field(name_map["POS"], array_name="variant_position"),
+            ]
+        )
+        array_specs.extend(
+            [spec_from_field(field) for field in self.metadata.info_fields]
+        )
+
+        for field in self.metadata.format_fields:
+            if field.name == "GT":
+                continue
+            array_specs.append(spec_from_field(field))
+
+        if self.gt_field is not None and self.num_samples > 0:
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_phased",
+                    dtype="bool",
+                    dimensions=["variants", "samples"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype",
+                    dtype=self.gt_field.smallest_dtype(),
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_mask",
+                    dtype="bool",
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+
+        if local_alleles:
+            array_specs = convert_local_allele_field_types(array_specs, schema_instance)
+
+        schema_instance.fields = array_specs
+        return schema_instance
+

 @dataclasses.dataclass
 class IcfPartitionMetadata(core.JsonDataclass):
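For context: generate_schema sizes the genotypes dimension with the standard combinatorial count of unordered genotypes, C(max_alleles + ploidy - 1, ploidy). For example, at a biallelic site (REF plus one ALT) with diploid calls:

import math

max_alleles, ploidy = 2, 2
print(math.comb(max_alleles + ploidy - 1, ploidy))  # 3: 0/0, 0/1, 1/1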
@@ -973,7 +1303,7 @@ class IntermediateColumnarFormatWriter:
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1079,9 +1409,9 @@ class IntermediateColumnarFormatWriter:
             self.path,
             partition_index,
         ) as tcw:
-            with vcf_utils.
+            with vcf_utils.VcfFile(partition.vcf_path) as vcf:
                 num_records = 0
-                for variant in
+                for variant in vcf.variants(partition.region):
                     num_records += 1
                     last_position = variant.POS
                     tcw.append("CHROM", variant.CHROM)
@@ -1125,7 +1455,9 @@ class IntermediateColumnarFormatWriter:
                 f"{num_records} records last_pos={last_position}"
             )

-    def explode(
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1193,7 +1525,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1218,7 +1550,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1241,3 +1573,167 @@ def explode_partition(icf_path, partition):
 def explode_finalise(icf_path):
     writer = IntermediateColumnarFormatWriter(icf_path)
     writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
+    if not path.exists():
+        raise ValueError(f"Path not found: {path}")
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    # NOTE: this is too strict, we should support more general Zarrs, see #276
+    elif (path / ".zmetadata").exists():
+        obj = vcz.VcfZarr(path)
+    else:
+        raise ValueError(f"{path} not in ICF or VCF Zarr format")
+    return obj.summary_table()
+
+
+def mkschema(
+    if_path,
+    out,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+):
+    store = IntermediateColumnarFormat(if_path)
+    spec = store.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
+    out.write(spec.asjson())
+
+
+def convert(
+    vcfs,
+    vcz_path,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    local_alleles=None,
+    show_progress=False,
+    icf_path=None,
+):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
+    if icf_path is None:
+        cm = temp_icf_path(prefix="vcf2zarr")
+    else:
+        cm = contextlib.nullcontext(icf_path)
+
+    with cm as icf_path:
+        explode(
+            icf_path,
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+        encode(
+            icf_path,
+            vcz_path,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            local_alleles=local_alleles,
+        )
+
+
+@contextlib.contextmanager
+def temp_icf_path(prefix=None):
+    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
+        yield pathlib.Path(tmp) / "icf"
+
+
+def encode(
+    icf_path,
+    zarr_path,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    local_alleles=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+    vzw.create_index()
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    icf_store = IntermediateColumnarFormat(icf_path)
+    if schema_path is None:
+        schema_instance = icf_store.generate_schema(
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            local_alleles=local_alleles,
+        )
+    else:
+        logger.info(f"Reading schema from {schema_path}")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
+        with open(schema_path) as f:
+            schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    vzw = vcz.VcfZarrWriter("icf", zarr_path)
+    return vzw.init(
+        icf_store,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
+    )
+
+
+def encode_partition(zarr_path, partition):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.finalise(show_progress=show_progress)