bio2zarr-0.1.5-py3-none-any.whl → bio2zarr-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +16 -3
- bio2zarr/cli.py +102 -22
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +316 -189
- bio2zarr/tskit.py +296 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +606 -114
- bio2zarr/vcf_utils.py +12 -11
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +568 -739
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- bio2zarr/zarr_utils.py +169 -2
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/METADATA +23 -8
- bio2zarr-0.1.7.dist-info/RECORD +21 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.5.dist-info/RECORD +0 -21
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/top_level.txt +0 -0
```diff
--- bio2zarr/vcf2zarr/icf.py
+++ bio2zarr/vcf.py
@@ -6,14 +6,19 @@ import logging
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
+import tempfile
+from functools import partial
 from typing import Any
 
 import numcodecs
 import numpy as np
 
-from
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME, zarr_exists
+
+from . import constants, core, provenance, vcf_utils, vcz
 
 logger = logging.getLogger(__name__)
 
```
```diff
@@ -77,6 +82,14 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"
 
+    @property
+    def max_number(self):
+        if self.vcf_number in ("R", "A", "G", "."):
+            return self.summary.max_number
+        else:
+            # use declared number if larger than max found
+            return max(self.summary.max_number, int(self.vcf_number))
+
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
```
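The new `max_number` property encodes a small but important rule: for VCF `Number` codes that are inherently variable (`R`, `A`, `G`, `.`), only the scan's observed maximum can size the array dimension, while a fixed declared `Number` acts as a floor. A minimal sketch of that rule, using plain values rather than bio2zarr's field objects:

```python
# Sketch of the Number-reconciliation rule above; `declared_number` and
# `observed_max` stand in for VcfField.vcf_number and summary.max_number.
def max_number(declared_number: str, observed_max: int) -> int:
    if declared_number in ("R", "A", "G", "."):
        # Variable-count fields: only the observed data can size them.
        return observed_max
    # Fixed-count fields: use the declared number if larger than max found.
    return max(observed_max, int(declared_number))


assert max_number("A", 3) == 3  # one value per ALT allele: use what was seen
assert max_number("2", 1) == 2  # declared Number=2 wins over an observed 1
```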
```diff
@@ -99,7 +112,7 @@ class VcfField:
             ret = "U1"
         else:
             assert self.vcf_type == "String"
-            ret =
+            ret = STRING_DTYPE_NAME
         return ret
 
 
@@ -116,23 +129,6 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
 )
 
 
-@dataclasses.dataclass
-class Contig:
-    id: str
-    length: int = None
-
-
-@dataclasses.dataclass
-class Sample:
-    id: str
-
-
-@dataclasses.dataclass
-class Filter:
-    id: str
-    description: str = ""
-
-
 @dataclasses.dataclass
 class IcfMetadata(core.JsonDataclass):
     samples: list
@@ -187,9 +183,9 @@ class IcfMetadata(core.JsonDataclass):
         d = d.copy()
         d["partitions"] = partitions
         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
-        d["samples"] = [Sample(**sd) for sd in d["samples"]]
-        d["filters"] = [Filter(**fd) for fd in d["filters"]]
-        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
+        d["samples"] = [vcz.Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [vcz.Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [vcz.Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
     def __eq__(self, other):
@@ -240,7 +236,7 @@ def scan_vcf(path, target_num_partitions):
                 description = ""
                 if h["ID"] == "PASS":
                     pass_index = len(filters)
-                filters.append(Filter(h["ID"], description))
+                filters.append(vcz.Filter(h["ID"], description))
 
         # Ensure PASS is the first filter if present
         if pass_index > 0:
@@ -262,9 +258,9 @@ def scan_vcf(path, target_num_partitions):
             contig_lengths = [None for _ in vcf.seqnames]
 
         metadata = IcfMetadata(
-            samples=[Sample(sample_id) for sample_id in vcf.samples],
+            samples=[vcz.Sample(sample_id) for sample_id in vcf.samples],
             contigs=[
-                Contig(contig_id, length)
+                vcz.Contig(contig_id, length)
                 for contig_id, length in zip(vcf.seqnames, contig_lengths)
             ],
             filters=filters,
@@ -291,7 +287,12 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
```
```diff
@@ -366,64 +367,58 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     return icf_metadata, header
 
 
-def sanitise_value_bool(
+def sanitise_value_bool(shape, value):
     x = True
     if value is None:
         x = False
-
+    return x
 
 
-def sanitise_value_float_scalar(
+def sanitise_value_float_scalar(shape, value):
     x = value
     if value is None:
         x = [constants.FLOAT32_MISSING]
-
+    return x[0]
 
 
-def sanitise_value_int_scalar(
+def sanitise_value_int_scalar(shape, value):
     x = value
     if value is None:
-        # print("MISSING", INT_MISSING, INT_FILL)
         x = [constants.INT_MISSING]
     else:
         x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
-
+    return x[0]
 
 
-def sanitise_value_string_scalar(
+def sanitise_value_string_scalar(shape, value):
     if value is None:
-
+        return "."
     else:
-
+        return value[0]
 
 
-def sanitise_value_string_1d(
+def sanitise_value_string_1d(shape, value):
     if value is None:
-
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
-        # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
-        # FIXME failure isn't coming from here, it seems to be from an
-        # incorrectly detected dimension in the zarr array
-        # The dimesions look all wrong, and the dtype should be Object
-        # not str
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, "", dtype=value.dtype)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_string_2d(
+def sanitise_value_string_2d(shape, value):
     if value is None:
-
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
-
-        # assert value.ndim == 2
-        buff[j] = ""
+        result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
         if value.ndim == 2:
-
+            result[: value.shape[0], : value.shape[1]] = value
         else:
-            #
+            # Convert 1D array into 2D with appropriate shape
             for k, val in enumerate(value):
-
+                result[k, : len(val)] = val
+        return result
 
 
 def drop_empty_second_dim(value):
```
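The common pattern in these rewritten sanitisers is worth spelling out: instead of writing into a caller-supplied buffer (`buff[j] = ...` in 0.1.5), each function now allocates a fixed-shape array filled with the appropriate sentinel and copies the (possibly shorter) value into it. A standalone illustration of the int 1D case, assuming the `-1` missing and `-2` fill sentinels visible in the diff:

```python
import numpy as np

shape = (4,)

# A missing value becomes an array of the "missing" sentinel (-1).
print(np.full(shape, -1))  # [-1 -1 -1 -1]

# A short value is left-aligned and padded with the "fill" sentinel (-2).
value = np.array([5, 7], dtype=np.int32)
result = np.full(shape, -2, dtype=np.int32)
result[: value.shape[0]] = value
print(result)  # [ 5  7 -2 -2]
```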
```diff
@@ -433,27 +428,28 @@ def drop_empty_second_dim(value):
     return value
 
 
-def sanitise_value_float_1d(
+def sanitise_value_float_1d(shape, value):
     if value is None:
-
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        value = np.array(value, ndmin=1, dtype=
+        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_float_2d(
+def sanitise_value_float_2d(shape, value):
     if value is None:
-
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-
-
-
-
+        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 def sanitise_int_array(value, ndmin, dtype):
@@ -468,23 +464,25 @@ def sanitise_int_array(value, ndmin, dtype):
     return value.astype(dtype)
 
 
-def sanitise_value_int_1d(
+def sanitise_value_int_1d(shape, value):
     if value is None:
-
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 1,
+        value = sanitise_int_array(value, 1, np.int32)
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, -2, dtype=np.int32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_int_2d(
+def sanitise_value_int_2d(shape, value):
     if value is None:
-
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 2,
-
-
+        value = sanitise_int_array(value, 2, np.int32)
+        result = np.full(shape, -2, dtype=np.int32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 missing_value_map = {
```
```diff
@@ -573,7 +571,12 @@ class StringValueTransformer(VcfValueTransformer):
             value = np.array(list(vcf_value.split(",")))
         else:
             # TODO can we make this faster??
-
+            var_len_values = [v.split(",") for v in vcf_value]
+            number = max(len(v) for v in var_len_values)
+            value = np.array(
+                [v + [""] * (number - len(v)) for v in var_len_values],
+                dtype=STRING_DTYPE_NAME,
+            )
         # print("HERE", vcf_value, value)
         # for v in vcf_value:
         #     print("\t", type(v), len(v), v.split(","))
```
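The replacement body in `StringValueTransformer` pads a ragged list of comma-separated per-sample values out to a rectangular array. Run in isolation (with `"O"` standing in for `STRING_DTYPE_NAME`, which is an assumption here), it behaves like this:

```python
import numpy as np

vcf_value = ["a,b,c", "x", "y,z"]  # per-sample comma-separated strings
var_len_values = [v.split(",") for v in vcf_value]
number = max(len(v) for v in var_len_values)  # widest sample wins
value = np.array(
    [v + [""] * (number - len(v)) for v in var_len_values],
    dtype="O",  # assuming STRING_DTYPE_NAME names NumPy's object dtype
)
print(value)
# [['a' 'b' 'c']
#  ['x' '' '']
#  ['y' 'z' '']]
```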
```diff
@@ -648,7 +651,8 @@ class IntermediateColumnarFormatField:
         chunk_cumulative_records = self.chunk_record_index(partition_id)
         chunk_num_records = np.diff(chunk_cumulative_records)
         for count, cumulative in zip(
-            chunk_num_records[start_chunk:],
+            chunk_num_records[start_chunk:],
+            chunk_cumulative_records[start_chunk + 1 :],
         ):
             path = partition_path / f"{cumulative}"
             chunk = self.read_chunk(path)
```
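The fixed `zip` above pairs each chunk's record count with the cumulative count at its end, which is also the chunk's file name. A toy run of the same bookkeeping, with made-up counts:

```python
import numpy as np

chunk_cumulative_records = np.array([0, 1000, 2500, 3000])
chunk_num_records = np.diff(chunk_cumulative_records)  # [1000 1500  500]

start_chunk = 1  # e.g. resume partway through a partition
for count, cumulative in zip(
    chunk_num_records[start_chunk:],
    chunk_cumulative_records[start_chunk + 1 :],
):
    print(f"chunk file '{cumulative}' holds {count} records")
# chunk file '2500' holds 1500 records
# chunk file '3000' holds 500 records
```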
```diff
@@ -707,36 +711,32 @@ class IntermediateColumnarFormatField:
         return ret
 
     def sanitiser_factory(self, shape):
-        """
-        Return a function that sanitised values from this column
-        and writes into a buffer of the specified shape.
-        """
-        assert len(shape) <= 3
+        assert len(shape) <= 2
         if self.vcf_field.vcf_type == "Flag":
-            assert len(shape) ==
-            return sanitise_value_bool
+            assert len(shape) == 0
+            return partial(sanitise_value_bool, shape)
         elif self.vcf_field.vcf_type == "Float":
-            if len(shape) ==
-                return sanitise_value_float_scalar
-            elif len(shape) ==
-                return sanitise_value_float_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_float_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_float_1d, shape)
             else:
-                return sanitise_value_float_2d
+                return partial(sanitise_value_float_2d, shape)
         elif self.vcf_field.vcf_type == "Integer":
-            if len(shape) ==
-                return sanitise_value_int_scalar
-            elif len(shape) ==
-                return sanitise_value_int_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_int_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_int_1d, shape)
             else:
-                return sanitise_value_int_2d
+                return partial(sanitise_value_int_2d, shape)
         else:
             assert self.vcf_field.vcf_type in ("String", "Character")
-            if len(shape) ==
-                return sanitise_value_string_scalar
-            elif len(shape) ==
-                return sanitise_value_string_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_string_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_string_1d, shape)
             else:
-                return sanitise_value_string_2d
+                return partial(sanitise_value_string_2d, shape)
 
 
 @dataclasses.dataclass
```
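The refactored `sanitiser_factory` replaces buffer-writing callbacks with `functools.partial`, baking the target shape into the sanitiser so that callers (see `iter_field` later in this diff) only pass the raw value. A self-contained sketch of the pattern, using a simplified stand-in for the real sanitiser:

```python
from functools import partial

import numpy as np


def sanitise_value_int_1d(shape, value):
    # Simplified stand-in for the function of the same name in the diff.
    if value is None:
        return np.full(shape, -1)
    result = np.full(shape, -2, dtype=np.int32)
    result[: len(value)] = value
    return result


sanitiser = partial(sanitise_value_int_1d, (3,))  # shape is now baked in
print(sanitiser([9]))   # [ 9 -2 -2]
print(sanitiser(None))  # [-1 -1 -1]
```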
```diff
@@ -843,9 +843,66 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-class IntermediateColumnarFormat(collections.abc.Mapping):
+def convert_local_allele_field_types(fields, schema_instance):
+    """
+    Update the specified list of fields to include the LAA field, and to convert
+    any supported localisable fields to the L* counterpart.
+
+    Note that we currently support only two ALT alleles per sample, and so the
+    dimensions of these fields are fixed by that requirement. Later versions may
+    use summary data storted in the ICF to make different choices, if information
+    about subsequent alleles (not in the actual genotype calls) should also be
+    stored.
+    """
+    fields_by_name = {field.name: field for field in fields}
+    gt = fields_by_name["call_genotype"]
+
+    if schema_instance.get_shape(["ploidy"])[0] != 2:
+        raise ValueError("Local alleles only supported on diploid data")
+
+    dimensions = gt.dimensions[:-1]
+
+    la = vcz.ZarrArraySpec(
+        name="call_LA",
+        dtype="i1",
+        dimensions=(*dimensions, "local_alleles"),
+        description=(
+            "0-based indices into REF+ALT, indicating which alleles"
+            " are relevant (local) for the current sample"
+        ),
+    )
+    schema_instance.dimensions["local_alleles"] = vcz.VcfZarrDimension.unchunked(
+        schema_instance.dimensions["ploidy"].size
+    )
+
+    ad = fields_by_name.get("call_AD", None)
+    if ad is not None:
+        # TODO check if call_LAD is in the list already
+        ad.name = "call_LAD"
+        ad.source = None
+        ad.dimensions = (*dimensions, "local_alleles_AD")
+        ad.description += " (local-alleles)"
+        schema_instance.dimensions["local_alleles_AD"] = vcz.VcfZarrDimension.unchunked(
+            2
+        )
+
+    pl = fields_by_name.get("call_PL", None)
+    if pl is not None:
+        # TODO check if call_LPL is in the list already
+        pl.name = "call_LPL"
+        pl.source = None
+        pl.description += " (local-alleles)"
+        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1].split("_")[-1])
+        schema_instance.dimensions["local_" + pl.dimensions[-1].split("_")[-1]] = (
+            vcz.VcfZarrDimension.unchunked(3)
+        )
+
+    return [*fields, la]
+
+
+class IntermediateColumnarFormat(vcz.Source):
     def __init__(self, path):
-        self.
+        self._path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
         # directory is either a WIP or the wrong format.
         with open(self.path / "metadata.json") as f:
```
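The hard-coded dimension sizes in `convert_local_allele_field_types` follow from the diploid, two-local-allele restriction stated in its docstring: `call_LAD` needs one depth per local allele (2), and `call_LPL` needs one likelihood per unordered genotype, which for a alleles at ploidy p is C(a + p - 1, p). A quick check of that arithmetic:

```python
import math

local_alleles = 2  # the restriction stated in the docstring
ploidy = 2         # enforced by the ValueError above

# Unordered genotypes over `local_alleles` alleles at this ploidy:
num_genotypes = math.comb(local_alleles + ploidy - 1, ploidy)
assert num_genotypes == 3  # matches VcfZarrDimension.unchunked(3) for call_LPL
```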
```diff
@@ -859,8 +916,12 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
+        self.gt_field = None
         for field in self.metadata.fields:
             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
+            if field.name == "GT":
+                self.gt_field = field
+
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
             f"records={self.num_records}, fields={self.num_fields})"
@@ -868,20 +929,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"IntermediateColumnarFormat(fields={len(self.fields)}, "
             f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )
 
-    def __getitem__(self, key):
-        return self.fields[key]
-
-    def __iter__(self):
-        return iter(self.fields)
-
-    def __len__(self):
-        return len(self.fields)
-
     def summary_table(self):
         data = []
         for name, icf_field in self.fields.items():
@@ -900,6 +952,10 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data
 
+    @property
+    def path(self):
+        return self._path
+
     @property
     def num_records(self):
         return self.metadata.num_records
@@ -908,6 +964,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_partitions(self):
         return len(self.metadata.partitions)
 
+    @property
+    def samples(self):
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
+
     @property
     def num_samples(self):
         return len(self.metadata.samples)
```
```diff
@@ -916,6 +984,265 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_fields(self):
         return len(self.fields)
 
+    @property
+    def root_attrs(self):
+        meta_information_pattern = re.compile("##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            match = re.fullmatch(meta_information_pattern, line)
+            if match:
+                key = match.group(1)
+                if key in ("contig", "FILTER", "INFO", "FORMAT"):
+                    # these fields are stored in Zarr arrays
+                    continue
+                value = match.group(2)
+                vcf_meta_information.append((key, value))
+        return {
+            "vcf_meta_information": vcf_meta_information,
+        }
+
+    def iter_id(self, start, stop):
+        for value in self.fields["ID"].iter_values(start, stop):
+            if value is not None:
+                yield value[0]
+            else:
+                yield None
+
+    def iter_filters(self, start, stop):
+        source_field = self.fields["FILTERS"]
+        lookup = {filt.id: index for index, filt in enumerate(self.metadata.filters)}
+
+        for filter_values in source_field.iter_values(start, stop):
+            filters = np.zeros(len(self.metadata.filters), dtype=bool)
+            if filter_values is not None:
+                for filter_id in filter_values:
+                    try:
+                        filters[lookup[filter_id]] = True
+                    except KeyError:
+                        raise ValueError(
+                            f"Filter '{filter_id}' was not defined in the header."
+                        ) from None
+            yield filters
+
+    def iter_contig(self, start, stop):
+        source_field = self.fields["CHROM"]
+        lookup = {
+            contig.id: index for index, contig in enumerate(self.metadata.contigs)
+        }
+
+        for value in source_field.iter_values(start, stop):
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that we the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            yield lookup[value[0]]
+
+    def iter_field(self, field_name, shape, start, stop):
+        source_field = self.fields[field_name]
+        sanitiser = source_field.sanitiser_factory(shape)
+        for value in source_field.iter_values(start, stop):
+            yield sanitiser(value)
+
+    def iter_alleles(self, start, stop, num_alleles):
+        ref_field = self.fields["REF"]
+        alt_field = self.fields["ALT"]
+
+        for ref, alt in zip(
+            ref_field.iter_values(start, stop),
+            alt_field.iter_values(start, stop),
+        ):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
+            alleles[0] = ref[0]
+            alleles[1 : 1 + len(alt)] = alt
+            yield alleles
+
+    def iter_genotypes(self, shape, start, stop):
+        source_field = self.fields["FORMAT/GT"]
+        for value in source_field.iter_values(start, stop):
+            genotypes = value[:, :-1] if value is not None else None
+            phased = value[:, -1] if value is not None else None
+            sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
+            sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
+            # Force haploids to always be phased
+            # https://github.com/sgkit-dev/bio2zarr/issues/399
+            if sanitised_genotypes.shape[1] == 1:
+                sanitised_phased[:] = True
+            yield sanitised_genotypes, sanitised_phased
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
+        if self.gt_field is None or shape is None:
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                # Stored ICF values are always at least 1D arrays; "rlen" is Number=1
+                # so we must extract the scalar to avoid NumPy scalar-conversion issues.
+                yield vcz.VariantData(variant_length[0], alleles, None, None)
+        else:
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
+                self.iter_alleles(start, stop, num_alleles),
+                self.iter_genotypes(shape, start, stop),
+            ):
+                yield vcz.VariantData(variant_length[0], alleles, gt, phased)
+
+    def generate_schema(
+        self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
+    ):
+        if local_alleles is None:
+            local_alleles = False
+
+        max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
+
+        # Add ploidy and genotypes dimensions only when needed
+        max_genotypes = 0
+        has_g_field = False
+        for field in self.metadata.format_fields:
+            if field.vcf_number == "G":
+                has_g_field = True
+                max_genotypes = max(max_genotypes, field.summary.max_number)
+
+        ploidy = None
+        genotypes_size = None
+        if self.gt_field is not None:
+            ploidy = max(self.gt_field.summary.max_number - 1, 1)
+            # NOTE: it's not clear why we're computing this, when we must have had
+            # at least one number=G field to require it anyway?
+            genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
+            # assert max_genotypes == genotypes_size
+        else:
+            if max_genotypes > 0 or has_g_field:
+                # there is no GT field, but there is at least one Number=G field,
+                # so need to define genotypes dimension
+                genotypes_size = max_genotypes
+
+        dimensions = vcz.standard_dimensions(
+            variants_size=self.num_records,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=self.num_samples,
+            samples_chunk_size=samples_chunk_size,
+            alleles_size=max_alleles,
+            filters_size=self.metadata.num_filters,
+            ploidy_size=ploidy,
+            genotypes_size=genotypes_size,
+        )
+
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+
+        def spec_from_field(field, array_name=None):
+            return vcz.ZarrArraySpec.from_field(
+                field,
+                schema_instance,
+                array_name=array_name,
+            )
+
+        def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
+                source=source,
+                name=name,
+                dtype=dtype,
+                description="",
+                dimensions=dimensions,
+                compressor=compressor,
+            )
+
+        name_map = {field.full_name: field for field in self.metadata.fields}
+        array_specs = [
+            fixed_field_spec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, self.metadata.num_contigs),
+            ),
+            fixed_field_spec(
+                name="variant_filter",
+                dtype="bool",
+                dimensions=["variants", "filters"],
+            ),
+            fixed_field_spec(
+                name="variant_allele",
+                dtype=STRING_DTYPE_NAME,
+                dimensions=["variants", "alleles"],
+            ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
+            fixed_field_spec(
+                name="variant_id",
+                dtype=STRING_DTYPE_NAME,
+            ),
+            fixed_field_spec(
+                name="variant_id_mask",
+                dtype="bool",
+            ),
+        ]
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        array_specs.extend(
+            [
+                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
+                spec_from_field(name_map["POS"], array_name="variant_position"),
+            ]
+        )
+        array_specs.extend(
+            [spec_from_field(field) for field in self.metadata.info_fields]
+        )
+
+        for field in self.metadata.format_fields:
+            if field.name == "GT":
+                continue
+            array_specs.append(spec_from_field(field))
+
+        if self.gt_field is not None and self.num_samples > 0:
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_phased",
+                    dtype="bool",
+                    dimensions=["variants", "samples"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype",
+                    dtype=self.gt_field.smallest_dtype(),
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_mask",
+                    dtype="bool",
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+
+        if local_alleles:
+            array_specs = convert_local_allele_field_types(array_specs, schema_instance)
+
+        schema_instance.fields = array_specs
+        return schema_instance
+
 
 @dataclasses.dataclass
 class IcfPartitionMetadata(core.JsonDataclass):
```
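The `root_attrs` property introduced here scrapes unstructured `##key=value` header lines into Zarr root attributes, skipping the structured keys that already have dedicated arrays. The same pattern applied to a toy header (the header lines are invented for illustration):

```python
import re

meta_information_pattern = re.compile("##([^=]+)=(.*)")
header = "\n".join(
    [
        "##fileformat=VCFv4.3",
        '##FILTER=<ID=PASS,Description="All filters passed">',
        "##source=toy",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
    ]
)
vcf_meta_information = []
for line in header.split("\n"):
    match = re.fullmatch(meta_information_pattern, line)
    # contig/FILTER/INFO/FORMAT are skipped: they live in Zarr arrays.
    if match and match.group(1) not in ("contig", "FILTER", "INFO", "FORMAT"):
        vcf_meta_information.append((match.group(1), match.group(2)))
print(vcf_meta_information)  # [('fileformat', 'VCFv4.3'), ('source', 'toy')]
```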
```diff
@@ -987,7 +1314,7 @@ class IntermediateColumnarFormatWriter:
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1139,7 +1466,9 @@ class IntermediateColumnarFormatWriter:
             f"{num_records} records last_pos={last_position}"
         )
 
-    def explode(
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1207,7 +1536,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1232,7 +1561,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
```
```diff
@@ -1255,3 +1584,166 @@ def explode_partition(icf_path, partition):
 def explode_finalise(icf_path):
     writer = IntermediateColumnarFormatWriter(icf_path)
     writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
+    if not path.exists():
+        raise ValueError(f"Path not found: {path}")
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    elif zarr_exists(path):
+        obj = vcz.VcfZarr(path)
+    else:
+        raise ValueError(f"{path} not in ICF or VCF Zarr format")
+    return obj.summary_table()
+
+
+def mkschema(
+    if_path,
+    out,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+):
+    store = IntermediateColumnarFormat(if_path)
+    spec = store.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
+    out.write(spec.asjson())
+
+
+def convert(
+    vcfs,
+    vcz_path,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    local_alleles=None,
+    show_progress=False,
+    icf_path=None,
+):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
+    if icf_path is None:
+        cm = temp_icf_path(prefix="vcf2zarr")
+    else:
+        cm = contextlib.nullcontext(icf_path)
+
+    with cm as icf_path:
+        explode(
+            icf_path,
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+        encode(
+            icf_path,
+            vcz_path,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            local_alleles=local_alleles,
+        )
+
+
+@contextlib.contextmanager
+def temp_icf_path(prefix=None):
+    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
+        yield pathlib.Path(tmp) / "icf"
+
+
+def encode(
+    icf_path,
+    zarr_path,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    local_alleles=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+    vzw.create_index()
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    icf_store = IntermediateColumnarFormat(icf_path)
+    if schema_path is None:
+        schema_instance = icf_store.generate_schema(
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            local_alleles=local_alleles,
+        )
+    else:
+        logger.info(f"Reading schema from {schema_path}")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
+        with open(schema_path) as f:
+            schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    vzw = vcz.VcfZarrWriter("icf", zarr_path)
+    return vzw.init(
+        icf_store,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
+    )
+
+
+def encode_partition(zarr_path, partition):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.finalise(show_progress=show_progress)
```
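Taken together, `explode`, `encode`, and `convert` give the module-level pipeline API added in 0.1.7. A hedged usage sketch (the file paths are hypothetical; the keyword arguments are those shown in the signatures above, and the `bio2zarr.vcf` module path follows the rename in the file list):

```python
from bio2zarr import vcf

# One-shot conversion via a temporary ICF directory:
vcf.convert(
    ["chr20.vcf.gz"],  # hypothetical indexed VCF
    "chr20.vcz",
    worker_processes=4,
    show_progress=True,
)

# Or run the two stages separately, keeping the intermediate columnar format:
vcf.explode("chr20.icf", ["chr20.vcf.gz"])
vcf.encode("chr20.icf", "chr20.vcz", variants_chunk_size=10_000)
```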
|