bio2zarr 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of bio2zarr might be problematic; see the registry page for details.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +89 -22
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +314 -189
- bio2zarr/tskit.py +301 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +594 -112
- bio2zarr/vcf_utils.py +12 -11
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +544 -708
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/METADATA +17 -6
- bio2zarr-0.1.6.dist-info/RECORD +21 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.5.dist-info/RECORD +0 -21
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/top_level.txt +0 -0
@@ -6,14 +6,17 @@ import logging
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
+import tempfile
+from functools import partial
 from typing import Any

 import numcodecs
 import numpy as np

-from
+from . import constants, core, provenance, vcf_utils, vcz

 logger = logging.getLogger(__name__)

@@ -77,6 +80,14 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"

+    @property
+    def max_number(self):
+        if self.vcf_number in ("R", "A", "G", "."):
+            return self.summary.max_number
+        else:
+            # use declared number if larger than max found
+            return max(self.summary.max_number, int(self.vcf_number))
+
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
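The new max_number property decides how wide a field's array needs to be: for fields declared with VCF Number R, A, G or ".", the maximum count observed while scanning wins; for fixed declared Numbers, the declared value wins when it is larger than anything observed. A minimal sketch of the rule (the function name here is illustrative, not part of the package):

    # Sketch of the max_number rule above; resolve_max_number is a hypothetical name.
    def resolve_max_number(vcf_number, observed_max):
        if vcf_number in ("R", "A", "G", "."):
            # Variable-length fields: trust the maximum seen while scanning.
            return observed_max
        # Fixed-Number fields: the declared size wins if it is larger.
        return max(observed_max, int(vcf_number))

    assert resolve_max_number("A", 3) == 3  # up to 3 ALT alleles observed
    assert resolve_max_number("2", 1) == 2  # declared Number=2, only 1 value seen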
@@ -116,23 +127,6 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
 )


-@dataclasses.dataclass
-class Contig:
-    id: str
-    length: int = None
-
-
-@dataclasses.dataclass
-class Sample:
-    id: str
-
-
-@dataclasses.dataclass
-class Filter:
-    id: str
-    description: str = ""
-
-
 @dataclasses.dataclass
 class IcfMetadata(core.JsonDataclass):
     samples: list
@@ -187,9 +181,9 @@ class IcfMetadata(core.JsonDataclass):
         d = d.copy()
         d["partitions"] = partitions
         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
-        d["samples"] = [Sample(**sd) for sd in d["samples"]]
-        d["filters"] = [Filter(**fd) for fd in d["filters"]]
-        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
+        d["samples"] = [vcz.Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [vcz.Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [vcz.Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)

     def __eq__(self, other):
@@ -240,7 +234,7 @@ def scan_vcf(path, target_num_partitions):
             description = ""
             if h["ID"] == "PASS":
                 pass_index = len(filters)
-            filters.append(Filter(h["ID"], description))
+            filters.append(vcz.Filter(h["ID"], description))

     # Ensure PASS is the first filter if present
     if pass_index > 0:
@@ -262,9 +256,9 @@ def scan_vcf(path, target_num_partitions):
         contig_lengths = [None for _ in vcf.seqnames]

     metadata = IcfMetadata(
-        samples=[Sample(sample_id) for sample_id in vcf.samples],
+        samples=[vcz.Sample(sample_id) for sample_id in vcf.samples],
         contigs=[
-            Contig(contig_id, length)
+            vcz.Contig(contig_id, length)
             for contig_id, length in zip(vcf.seqnames, contig_lengths)
         ],
         filters=filters,
@@ -291,7 +285,12 @@ def scan_vcf(path, target_num_partitions):
     return metadata, vcf.raw_header


-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
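This hunk, and several below (explode, explode_init, encode, encode_init), replaces a hard-coded worker_processes default with a shared core.DEFAULT_WORKER_PROCESSES constant, so the default parallelism is defined in one place. The pattern, sketched with an assumed value (the real constant lives in bio2zarr/core.py, which is not part of this file's diff):

    DEFAULT_WORKER_PROCESSES = 1  # assumed value; defined in bio2zarr/core.py

    def scan_vcfs(paths, show_progress, target_num_partitions,
                  worker_processes=DEFAULT_WORKER_PROCESSES):
        ...  # one shared default instead of a literal repeated in every signature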
@@ -366,64 +365,58 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     return icf_metadata, header


-def sanitise_value_bool(
+def sanitise_value_bool(shape, value):
     x = True
     if value is None:
         x = False
-
+    return x


-def sanitise_value_float_scalar(
+def sanitise_value_float_scalar(shape, value):
     x = value
     if value is None:
         x = [constants.FLOAT32_MISSING]
-
+    return x[0]


-def sanitise_value_int_scalar(
+def sanitise_value_int_scalar(shape, value):
     x = value
     if value is None:
-        # print("MISSING", INT_MISSING, INT_FILL)
         x = [constants.INT_MISSING]
     else:
         x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
-
+    return x[0]


-def sanitise_value_string_scalar(
+def sanitise_value_string_scalar(shape, value):
     if value is None:
-
+        return "."
     else:
-
+        return value[0]


-def sanitise_value_string_1d(
+def sanitise_value_string_1d(shape, value):
     if value is None:
-
+        return np.full(shape, ".", dtype="O")
     else:
-        # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
-        # FIXME failure isn't coming from here, it seems to be from an
-        # incorrectly detected dimension in the zarr array
-        # The dimesions look all wrong, and the dtype should be Object
-        # not str
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, "", dtype=value.dtype)
+        result[: value.shape[0]] = value
+        return result


-def sanitise_value_string_2d(
+def sanitise_value_string_2d(shape, value):
     if value is None:
-
+        return np.full(shape, ".", dtype="O")
     else:
-
-        # assert value.ndim == 2
-        buff[j] = ""
+        result = np.full(shape, "", dtype="O")
         if value.ndim == 2:
-
+            result[: value.shape[0], : value.shape[1]] = value
         else:
-            #
+            # Convert 1D array into 2D with appropriate shape
            for k, val in enumerate(value):
-
+                result[k, : len(val)] = val
+        return result


 def drop_empty_second_dim(value):
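The sanitise_value_* family is refactored here from buffer-writing helpers (the old, truncated signatures took a buffer and an index) into pure functions that take a target shape and the raw value and return a fully padded array. A runnable sketch of the 1D integer case, mirroring sanitise_value_int_1d below (-1 is the missing sentinel and -2 the fill sentinel used in these functions; the function name is an illustrative stand-in):

    import numpy as np

    def pad_int_1d(shape, value):
        if value is None:
            return np.full(shape, -1)  # whole entry missing
        value = np.array(value, ndmin=1, dtype=np.int32)
        result = np.full(shape, -2, dtype=np.int32)  # pad tail with fill
        result[: value.shape[0]] = value
        return result

    print(pad_int_1d((4,), [5, 7]))  # [ 5  7 -2 -2]
    print(pad_int_1d((4,), None))    # [-1 -1 -1 -1]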
@@ -433,27 +426,28 @@ def drop_empty_second_dim(value):
     return value


-def sanitise_value_float_1d(
+def sanitise_value_float_1d(shape, value):
     if value is None:
-
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        value = np.array(value, ndmin=1, dtype=
+        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[: value.shape[0]] = value
+        return result


-def sanitise_value_float_2d(
+def sanitise_value_float_2d(shape, value):
     if value is None:
-
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-
-
-
-
+        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[:, : value.shape[1]] = value
+        return result


 def sanitise_int_array(value, ndmin, dtype):
@@ -468,23 +462,25 @@ def sanitise_int_array(value, ndmin, dtype):
     return value.astype(dtype)


-def sanitise_value_int_1d(
+def sanitise_value_int_1d(shape, value):
     if value is None:
-
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 1,
+        value = sanitise_int_array(value, 1, np.int32)
         value = drop_empty_second_dim(value)
-
-
+        result = np.full(shape, -2, dtype=np.int32)
+        result[: value.shape[0]] = value
+        return result


-def sanitise_value_int_2d(
+def sanitise_value_int_2d(shape, value):
     if value is None:
-
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 2,
-
-
+        value = sanitise_int_array(value, 2, np.int32)
+        result = np.full(shape, -2, dtype=np.int32)
+        result[:, : value.shape[1]] = value
+        return result


 missing_value_map = {
@@ -648,7 +644,8 @@ class IntermediateColumnarFormatField:
         chunk_cumulative_records = self.chunk_record_index(partition_id)
         chunk_num_records = np.diff(chunk_cumulative_records)
         for count, cumulative in zip(
-            chunk_num_records[start_chunk:],
+            chunk_num_records[start_chunk:],
+            chunk_cumulative_records[start_chunk + 1 :],
         ):
             path = partition_path / f"{cumulative}"
             chunk = self.read_chunk(path)
@@ -707,36 +704,32 @@ class IntermediateColumnarFormatField:
         return ret

     def sanitiser_factory(self, shape):
-        """
-        Return a function that sanitised values from this column
-        and writes into a buffer of the specified shape.
-        """
-        assert len(shape) <= 3
+        assert len(shape) <= 2
         if self.vcf_field.vcf_type == "Flag":
-            assert len(shape) ==
-            return sanitise_value_bool
+            assert len(shape) == 0
+            return partial(sanitise_value_bool, shape)
         elif self.vcf_field.vcf_type == "Float":
-            if len(shape) ==
-                return sanitise_value_float_scalar
-            elif len(shape) ==
-                return sanitise_value_float_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_float_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_float_1d, shape)
             else:
-                return sanitise_value_float_2d
+                return partial(sanitise_value_float_2d, shape)
         elif self.vcf_field.vcf_type == "Integer":
-            if len(shape) ==
-                return sanitise_value_int_scalar
-            elif len(shape) ==
-                return sanitise_value_int_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_int_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_int_1d, shape)
             else:
-                return sanitise_value_int_2d
+                return partial(sanitise_value_int_2d, shape)
         else:
             assert self.vcf_field.vcf_type in ("String", "Character")
-            if len(shape) ==
-                return sanitise_value_string_scalar
-            elif len(shape) ==
-                return sanitise_value_string_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_string_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_string_1d, shape)
             else:
-                return sanitise_value_string_2d


 @dataclasses.dataclass
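sanitiser_factory now returns functools.partial objects that bind the target shape into the shape-first functions above, so the encoding loop can call sanitiser(value) without carrying a shape or a buffer itself. Judging from the new assertions, the shape is now per-record (length 0, 1 or 2) rather than including the buffer dimension. The binding mechanism in isolation:

    from functools import partial

    def sanitise(shape, value):  # stand-in for the sanitise_value_* functions
        return (shape, value)

    sanitiser = partial(sanitise, (10, 2))       # the factory binds shape once
    assert sanitiser("raw") == ((10, 2), "raw")  # call sites pass only the value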
@@ -843,9 +836,66 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False


-class IntermediateColumnarFormat(collections.abc.Mapping):
+def convert_local_allele_field_types(fields, schema_instance):
+    """
+    Update the specified list of fields to include the LAA field, and to convert
+    any supported localisable fields to the L* counterpart.
+
+    Note that we currently support only two ALT alleles per sample, and so the
+    dimensions of these fields are fixed by that requirement. Later versions may
+    use summary data stored in the ICF to make different choices, if information
+    about subsequent alleles (not in the actual genotype calls) should also be
+    stored.
+    """
+    fields_by_name = {field.name: field for field in fields}
+    gt = fields_by_name["call_genotype"]
+
+    if schema_instance.get_shape(["ploidy"])[0] != 2:
+        raise ValueError("Local alleles only supported on diploid data")
+
+    dimensions = gt.dimensions[:-1]
+
+    la = vcz.ZarrArraySpec(
+        name="call_LA",
+        dtype="i1",
+        dimensions=(*dimensions, "local_alleles"),
+        description=(
+            "0-based indices into REF+ALT, indicating which alleles"
+            " are relevant (local) for the current sample"
+        ),
+    )
+    schema_instance.dimensions["local_alleles"] = vcz.VcfZarrDimension.unchunked(
+        schema_instance.dimensions["ploidy"].size
+    )
+
+    ad = fields_by_name.get("call_AD", None)
+    if ad is not None:
+        # TODO check if call_LAD is in the list already
+        ad.name = "call_LAD"
+        ad.source = None
+        ad.dimensions = (*dimensions, "local_alleles_AD")
+        ad.description += " (local-alleles)"
+        schema_instance.dimensions["local_alleles_AD"] = vcz.VcfZarrDimension.unchunked(
+            2
+        )
+
+    pl = fields_by_name.get("call_PL", None)
+    if pl is not None:
+        # TODO check if call_LPL is in the list already
+        pl.name = "call_LPL"
+        pl.source = None
+        pl.description += " (local-alleles)"
+        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1].split("_")[-1])
+        schema_instance.dimensions["local_" + pl.dimensions[-1].split("_")[-1]] = (
+            vcz.VcfZarrDimension.unchunked(3)
+        )
+
+    return [*fields, la]
+
+
+class IntermediateColumnarFormat(vcz.Source):
     def __init__(self, path):
-        self.
+        self._path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
         # directory is either a WIP or the wrong format.
         with open(self.path / "metadata.json") as f:
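convert_local_allele_field_types rewrites call_AD into call_LAD and call_PL into call_LPL, and appends a call_LA index array mapping local alleles back to REF+ALT. Because the function restricts itself to two local alleles on diploid data, the localised dimension sizes are constants: 2 entries for call_LA and call_LAD, and 3 for call_LPL, the number of diploid genotypes over two alleles. A check of that arithmetic:

    import math

    local_alleles = 2  # fixed by the restriction noted in the docstring above
    ploidy = 2         # enforced by the ValueError above
    # Genotypes over the local alleles: combinations with repetition.
    assert math.comb(local_alleles + ploidy - 1, ploidy) == 3  # the unchunked(3)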
@@ -859,8 +909,12 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
+        self.gt_field = None
         for field in self.metadata.fields:
             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
+            if field.name == "GT":
+                self.gt_field = field
+
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
             f"records={self.num_records}, fields={self.num_fields})"
@@ -868,20 +922,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):

     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"IntermediateColumnarFormat(fields={len(self.fields)}, "
             f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )

-    def __getitem__(self, key):
-        return self.fields[key]
-
-    def __iter__(self):
-        return iter(self.fields)
-
-    def __len__(self):
-        return len(self.fields)
-
     def summary_table(self):
         data = []
         for name, icf_field in self.fields.items():
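With the switch from collections.abc.Mapping to vcz.Source, the dict-style interface is gone. A hedged migration sketch for any external code that indexed the store directly (the path is illustrative; the commented 0.1.5 lines are what no longer works):

    # icf = IntermediateColumnarFormat("sample.icf")
    # icf["FORMAT/GT"], len(icf), iter(icf)   # 0.1.5 Mapping interface: removed
    # icf.fields["FORMAT/GT"]                 # 0.1.6: explicit dict access
    # icf.num_fields                          # 0.1.6: instead of len(icf)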
@@ -900,6 +945,10 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data

+    @property
+    def path(self):
+        return self._path
+
     @property
     def num_records(self):
         return self.metadata.num_records
@@ -908,6 +957,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_partitions(self):
         return len(self.metadata.partitions)

+    @property
+    def samples(self):
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
+
     @property
     def num_samples(self):
         return len(self.metadata.samples)
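Together with path above and the num_records/num_samples/num_partitions properties, these accessors fill out the metadata side of the vcz.Source interface that IntermediateColumnarFormat now implements; the iter_* generators in the next hunk supply the data side. A sketch of the contract as inferred from this diff (the real base class is in vcz.py, which is not part of this file's diff):

    # Inferred shape of the contract, not the actual vcz.Source definition.
    class SourceLike:
        path: "pathlib.Path"
        num_records: int
        num_samples: int
        num_partitions: int
        samples: list   # of vcz.Sample
        contigs: list   # of vcz.Contig
        filters: list   # of vcz.Filter
        # plus iter_id / iter_filters / iter_contig / iter_field /
        # iter_alleles_and_genotypes generators, added below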
@@ -916,6 +977,261 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_fields(self):
         return len(self.fields)

+    @property
+    def root_attrs(self):
+        meta_information_pattern = re.compile("##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            match = re.fullmatch(meta_information_pattern, line)
+            if match:
+                key = match.group(1)
+                if key in ("contig", "FILTER", "INFO", "FORMAT"):
+                    # these fields are stored in Zarr arrays
+                    continue
+                value = match.group(2)
+                vcf_meta_information.append((key, value))
+        return {
+            "vcf_meta_information": vcf_meta_information,
+        }
+
+    def iter_id(self, start, stop):
+        for value in self.fields["ID"].iter_values(start, stop):
+            if value is not None:
+                yield value[0]
+            else:
+                yield None
+
+    def iter_filters(self, start, stop):
+        source_field = self.fields["FILTERS"]
+        lookup = {filt.id: index for index, filt in enumerate(self.metadata.filters)}
+
+        for filter_values in source_field.iter_values(start, stop):
+            filters = np.zeros(len(self.metadata.filters), dtype=bool)
+            if filter_values is not None:
+                for filter_id in filter_values:
+                    try:
+                        filters[lookup[filter_id]] = True
+                    except KeyError:
+                        raise ValueError(
+                            f"Filter '{filter_id}' was not defined in the header."
+                        ) from None
+            yield filters
+
+    def iter_contig(self, start, stop):
+        source_field = self.fields["CHROM"]
+        lookup = {
+            contig.id: index for index, contig in enumerate(self.metadata.contigs)
+        }
+
+        for value in source_field.iter_values(start, stop):
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            yield lookup[value[0]]
+
+    def iter_field(self, field_name, shape, start, stop):
+        source_field = self.fields[field_name]
+        sanitiser = source_field.sanitiser_factory(shape)
+        for value in source_field.iter_values(start, stop):
+            yield sanitiser(value)
+
+    def iter_alleles(self, start, stop, num_alleles):
+        ref_field = self.fields["REF"]
+        alt_field = self.fields["ALT"]
+
+        for ref, alt in zip(
+            ref_field.iter_values(start, stop),
+            alt_field.iter_values(start, stop),
+        ):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles[0] = ref[0]
+            alleles[1 : 1 + len(alt)] = alt
+            yield alleles
+
+    def iter_genotypes(self, shape, start, stop):
+        source_field = self.fields["FORMAT/GT"]
+        for value in source_field.iter_values(start, stop):
+            genotypes = value[:, :-1] if value is not None else None
+            phased = value[:, -1] if value is not None else None
+            sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
+            sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
+            # Force haploids to always be phased
+            # https://github.com/sgkit-dev/bio2zarr/issues/399
+            if sanitised_genotypes.shape[1] == 1:
+                sanitised_phased[:] = True
+            yield sanitised_genotypes, sanitised_phased
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
+        if self.gt_field is None or shape is None:
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                yield vcz.VariantData(variant_length, alleles, None, None)
+        else:
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
+                self.iter_alleles(start, stop, num_alleles),
+                self.iter_genotypes(shape, start, stop),
+            ):
+                yield vcz.VariantData(variant_length, alleles, gt, phased)
+
+    def generate_schema(
+        self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
+    ):
+        if local_alleles is None:
+            local_alleles = False
+
+        max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
+
+        # Add ploidy and genotypes dimensions only when needed
+        max_genotypes = 0
+        for field in self.metadata.format_fields:
+            if field.vcf_number == "G":
+                max_genotypes = max(max_genotypes, field.summary.max_number)
+
+        ploidy = None
+        genotypes_size = None
+        if self.gt_field is not None:
+            ploidy = max(self.gt_field.summary.max_number - 1, 1)
+            # NOTE: it's not clear why we're computing this, when we must have had
+            # at least one number=G field to require it anyway?
+            genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
+            # assert max_genotypes == genotypes_size
+        else:
+            if max_genotypes > 0:
+                # there is no GT field, but there is at least one Number=G field,
+                # so need to define genotypes dimension
+                genotypes_size = max_genotypes
+
+        dimensions = vcz.standard_dimensions(
+            variants_size=self.num_records,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=self.num_samples,
+            samples_chunk_size=samples_chunk_size,
+            alleles_size=max_alleles,
+            filters_size=self.metadata.num_filters,
+            ploidy_size=ploidy,
+            genotypes_size=genotypes_size,
+        )
+
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+
+        def spec_from_field(field, array_name=None):
+            return vcz.ZarrArraySpec.from_field(
+                field,
+                schema_instance,
+                array_name=array_name,
+            )
+
+        def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
+                source=source,
+                name=name,
+                dtype=dtype,
+                description="",
+                dimensions=dimensions,
+                compressor=compressor,
+            )
+
+        name_map = {field.full_name: field for field in self.metadata.fields}
+        array_specs = [
+            fixed_field_spec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, self.metadata.num_contigs),
+            ),
+            fixed_field_spec(
+                name="variant_filter",
+                dtype="bool",
+                dimensions=["variants", "filters"],
+            ),
+            fixed_field_spec(
+                name="variant_allele",
+                dtype="O",
+                dimensions=["variants", "alleles"],
+            ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
+            fixed_field_spec(
+                name="variant_id",
+                dtype="O",
+            ),
+            fixed_field_spec(
+                name="variant_id_mask",
+                dtype="bool",
+            ),
+        ]
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        array_specs.extend(
+            [
+                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
+                spec_from_field(name_map["POS"], array_name="variant_position"),
+            ]
+        )
+        array_specs.extend(
+            [spec_from_field(field) for field in self.metadata.info_fields]
+        )
+
+        for field in self.metadata.format_fields:
+            if field.name == "GT":
+                continue
+            array_specs.append(spec_from_field(field))
+
+        if self.gt_field is not None and self.num_samples > 0:
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_phased",
+                    dtype="bool",
+                    dimensions=["variants", "samples"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype",
+                    dtype=self.gt_field.smallest_dtype(),
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_mask",
+                    dtype="bool",
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+
+        if local_alleles:
+            array_specs = convert_local_allele_field_types(array_specs, schema_instance)
+
+        schema_instance.fields = array_specs
+        return schema_instance
+

 @dataclasses.dataclass
 class IcfPartitionMetadata(core.JsonDataclass):
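generate_schema sizes the genotypes dimension with math.comb(max_alleles + ploidy - 1, ploidy): the number of multisets of size ploidy drawn from max_alleles alleles, which is exactly the count that VCF's Number=G encodes. A worked check:

    import math

    max_alleles = 3  # REF plus two ALTs
    ploidy = 2
    assert math.comb(max_alleles + ploidy - 1, ploidy) == 6
    # the six diploid genotypes: 0/0, 0/1, 1/1, 0/2, 1/2, 2/2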
@@ -987,7 +1303,7 @@ class IntermediateColumnarFormatWriter:
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1139,7 +1455,9 @@ class IntermediateColumnarFormatWriter:
             f"{num_records} records last_pos={last_position}"
         )

-    def explode(
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1207,7 +1525,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1232,7 +1550,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1255,3 +1573,167 @@ def explode_partition(icf_path, partition):
 def explode_finalise(icf_path):
     writer = IntermediateColumnarFormatWriter(icf_path)
     writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
+    if not path.exists():
+        raise ValueError(f"Path not found: {path}")
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    # NOTE: this is too strict, we should support more general Zarrs, see #276
+    elif (path / ".zmetadata").exists():
+        obj = vcz.VcfZarr(path)
+    else:
+        raise ValueError(f"{path} not in ICF or VCF Zarr format")
+    return obj.summary_table()
+
+
+def mkschema(
+    if_path,
+    out,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+):
+    store = IntermediateColumnarFormat(if_path)
+    spec = store.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
+    out.write(spec.asjson())
+
+
+def convert(
+    vcfs,
+    vcz_path,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    local_alleles=None,
+    show_progress=False,
+    icf_path=None,
+):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
+    if icf_path is None:
+        cm = temp_icf_path(prefix="vcf2zarr")
+    else:
+        cm = contextlib.nullcontext(icf_path)
+
+    with cm as icf_path:
+        explode(
+            icf_path,
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+        encode(
+            icf_path,
+            vcz_path,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            local_alleles=local_alleles,
+        )
+
+
+@contextlib.contextmanager
+def temp_icf_path(prefix=None):
+    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
+        yield pathlib.Path(tmp) / "icf"
+
+
+def encode(
+    icf_path,
+    zarr_path,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    local_alleles=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+    vzw.create_index()
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    icf_store = IntermediateColumnarFormat(icf_path)
+    if schema_path is None:
+        schema_instance = icf_store.generate_schema(
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            local_alleles=local_alleles,
+        )
+    else:
+        logger.info(f"Reading schema from {schema_path}")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
+        with open(schema_path) as f:
+            schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    vzw = vcz.VcfZarrWriter("icf", zarr_path)
+    return vzw.init(
+        icf_store,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
+    )
+
+
+def encode_partition(zarr_path, partition):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.finalise(show_progress=show_progress)