bio2zarr 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +46 -12
- bio2zarr/core.py +32 -2
- bio2zarr/plink.py +19 -14
- bio2zarr/vcf2zarr/icf.py +30 -17
- bio2zarr/vcf2zarr/vcz.py +460 -138
- bio2zarr/vcf2zarr/verification.py +19 -16
- bio2zarr/vcf_utils.py +30 -14
- bio2zarr/zarr_utils.py +19 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/METADATA +15 -13
- bio2zarr-0.1.2.dist-info/RECORD +21 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/WHEEL +1 -1
- bio2zarr-0.1.0.dist-info/RECORD +0 -20
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/LICENSE +0 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
@@ -149,6 +149,13 @@ max_memory = click.option(
     help="An approximate bound on overall memory usage (e.g. 10G),",
 )

+local_alleles = click.option(
+    "--local-alleles/--no-local-alleles",
+    show_default=True,
+    default=False,
+    help="Use local allele fields to reduce the storage requirements of the output.",
+)
+

 def setup_logging(verbosity):
     level = "WARNING"
@@ -312,7 +319,7 @@ def dexplode_finalise(icf_path, verbose):


 @click.command
-@click.argument("path", type=click.Path())
+@click.argument("path", type=click.Path(exists=True))
 @verbose
 def inspect(path, verbose):
     """
@@ -325,12 +332,26 @@ def inspect(path, verbose):

 @click.command
 @icf_path
-def mkschema(icf_path):
+@variants_chunk_size
+@samples_chunk_size
+@local_alleles
+def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
     """
     Generate a schema for zarr encoding
     """
+    if local_alleles:
+        click.echo(
+            "WARNING: Local alleles support is preliminary; please use with caution.",
+            err=True,
+        )
     stream = click.get_text_stream("stdout")
-    vcf2zarr.mkschema(icf_path, stream)
+    vcf2zarr.mkschema(
+        icf_path,
+        stream,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )


 @click.command
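For reference, the extended mkschema entry point above can also be driven directly from Python. The following is a minimal sketch rather than documented usage; the ICF path and chunk sizes are illustrative placeholders:

import sys

from bio2zarr import vcf2zarr

# Write a schema for an exploded ICF directory to stdout, turning on the
# preliminary local-alleles fields and overriding the default chunk sizes.
vcf2zarr.mkschema(
    "sample.icf",  # placeholder: any exploded ICF directory
    sys.stdout,
    variants_chunk_size=10_000,
    samples_chunk_size=1_000,
    local_alleles=True,
)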
@@ -469,6 +490,7 @@ def dencode_finalise(zarr_path, verbose, progress):
 @verbose
 @progress
 @worker_processes
+@local_alleles
 def convert_vcf(
     vcfs,
     zarr_path,
@@ -478,6 +500,7 @@ def convert_vcf(
     verbose,
     progress,
     worker_processes,
+    local_alleles,
 ):
     """
     Convert input VCF(s) directly to vcfzarr (not recommended for large files).
@@ -491,6 +514,7 @@ def convert_vcf(
         samples_chunk_size=samples_chunk_size,
         show_progress=progress,
         worker_processes=worker_processes,
+        local_alleles=local_alleles,
     )


@@ -560,7 +584,7 @@ plink2zarr.add_command(convert_plink)

 @click.command
 @version
-@
+@vcfs
 @verbose
 @num_partitions
 @click.option(
@@ -570,12 +594,16 @@ plink2zarr.add_command(convert_plink)
     default=None,
     help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
 )
-def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
+def vcfpartition(vcfs, verbose, num_partitions, partition_size):
     """
-    Output bcftools region strings that partition the indexed VCF/BCF file
+    Output bcftools region strings that partition the indexed VCF/BCF files
     into either an approximate number of parts (-n), or parts of approximately
     a given size (-s). One of -n or -s must be supplied.

+    If multiple VCF/BCF files are provided, the number of parts (-n) is
+    interpreted as the total number of partitions across all the files,
+    and the partitions are distributed evenly among the files.
+
     Note that both the number of partitions and sizes are a target, and the
     returned number of partitions may not exactly correspond. In particular,
     there is a maximum level of granularity determined by the associated index
@@ -590,9 +618,15 @@ def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
            "Either --num-partitions or --partition-size must be specified"
        )

-
-
-
-
-
-
+    if num_partitions is None:
+        num_parts_per_path = None
+    else:
+        num_parts_per_path = max(1, num_partitions // len(vcfs))
+
+    for vcf_path in vcfs:
+        indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
+        regions = indexed_vcf.partition_into_regions(
+            num_parts=num_parts_per_path, target_part_size=partition_size
+        )
+        for region in regions:
+            click.echo(f"{region}\t{vcf_path}")
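The reworked vcfpartition above is a thin loop over vcf_utils.IndexedVcf. A rough sketch of the same per-file call, with "chr20.vcf.gz" standing in for any indexed VCF/BCF:

from bio2zarr import vcf_utils

path = "chr20.vcf.gz"  # placeholder: a VCF/BCF with an accompanying .tbi/.csi index
indexed = vcf_utils.IndexedVcf(path)
# Ask for roughly four parts; the index granularity caps how finely this can split.
for region in indexed.partition_into_regions(num_parts=4):
    print(f"{region}\t{path}")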
bio2zarr/core.py
CHANGED
@@ -63,6 +63,27 @@ def chunk_aligned_slices(z, n, max_chunks=None):
     return slices


+def first_dim_slice_iter(z, start, stop):
+    """
+    Efficiently iterate over the specified slice of the first dimension of the zarr
+    array z.
+    """
+    chunk_size = z.chunks[0]
+    first_chunk = start // chunk_size
+    last_chunk = (stop // chunk_size) + (stop % chunk_size != 0)
+    for chunk in range(first_chunk, last_chunk):
+        Z = z.blocks[chunk]
+        chunk_start = chunk * chunk_size
+        chunk_stop = chunk_start + chunk_size
+        slice_start = None
+        if start > chunk_start:
+            slice_start = start - chunk_start
+        slice_stop = None
+        if stop < chunk_stop:
+            slice_stop = stop - chunk_start
+        yield from Z[slice_start:slice_stop]
+
+
 def du(path):
     """
     Return the total bytes stored at this path.
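A quick illustration of the new first_dim_slice_iter helper, using a small synthetic array (this sketch is not part of the package's test suite, and assumes a zarr version with block indexing, as the helper itself does):

import numpy as np
import zarr

from bio2zarr.core import first_dim_slice_iter

# 100 elements in chunks of 10 along the first (only) dimension.
z = zarr.array(np.arange(100), chunks=(10,))

# Yields 25..74, touching only the chunks that overlap the requested slice.
values = list(first_dim_slice_iter(z, 25, 75))
assert values == list(range(25, 75))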
@@ -113,13 +134,16 @@ def cancel_futures(futures):
 class BufferedArray:
     array: zarr.Array
     array_offset: int
+    name: str
     buff: np.ndarray
     buffer_row: int
+    max_buff_size: int = 0

-    def __init__(self, array, offset):
+    def __init__(self, array, offset, name="Unknown"):
         self.array = array
         self.array_offset = offset
         assert offset % array.chunks[0] == 0
+        self.name = name
         dims = list(array.shape)
         dims[0] = min(array.chunks[0], array.shape[0])
         self.buff = np.empty(dims, dtype=array.dtype)
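A tiny sketch of the widened constructor; the array, offset, and name here are placeholders, and the offset must be a multiple of the first-dimension chunk size (per the assert above):

import zarr

from bio2zarr import core

z = zarr.zeros((100, 4), chunks=(10, 4), dtype="i1")  # placeholder array
buf = core.BufferedArray(z, 20, name="call_genotype")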
@@ -150,11 +174,17 @@ class BufferedArray:
                 self.buff[: self.buffer_row], self.array, self.array_offset
             )
             logger.debug(
-                f"Flushed <{self.
+                f"Flushed <{self.name} {self.array.shape} "
                 f"{self.array.dtype}> "
                 f"{self.array_offset}:{self.array_offset + self.buffer_row}"
                 f"{self.buff.nbytes / 2**20: .2f}Mb"
             )
+            # Note this is inaccurate for string data as we're just reporting the
+            # size of the container. When we switch the numpy 2 StringDtype this
+            # should improve and we can get more visibility on how memory
+            # is being used.
+            # https://github.com/sgkit-dev/bio2zarr/issues/30
+            self.max_buff_size = max(self.max_buff_size, self.buff.nbytes)
         self.array_offset += self.variants_chunk_size
         self.buffer_row = 0

bio2zarr/plink.py
CHANGED
@@ -6,6 +6,8 @@ import numcodecs
 import numpy as np
 import zarr

+from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
+
 from . import core

 logger = logging.getLogger(__name__)
@@ -17,8 +19,7 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     # the correct approach is, but it is important to note that the
     # 0th allele is *not* necessarily the REF for these datasets.
     bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-
-    root = zarr.group(store=store)
+    root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
     gt = core.BufferedArray(root["call_genotype"], start)
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
@@ -73,8 +74,7 @@ def convert(
     if variants_chunk_size is None:
         variants_chunk_size = 10_000

-
-    root = zarr.group(store=store, overwrite=True)
+    root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)

     ploidy = 2
     shape = [m, n]
@@ -88,7 +88,8 @@ def convert(

     a = root.array(
         "sample_id",
-        bed.iid,
+        data=bed.iid,
+        shape=bed.iid.shape,
         dtype="str",
         compressor=default_compressor,
         chunks=(samples_chunk_size,),
@@ -100,7 +101,8 @@ def convert(
     # fetching repeatedly from bim file
     a = root.array(
         "variant_position",
-        bed.bp_position,
+        data=bed.bp_position,
+        shape=bed.bp_position.shape,
         dtype=np.int32,
         compressor=default_compressor,
         chunks=(variants_chunk_size,),
@@ -111,41 +113,45 @@ def convert(
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(
         "variant_allele",
-        alleles,
+        data=alleles,
+        shape=alleles.shape,
         dtype="str",
         compressor=default_compressor,
-        chunks=(variants_chunk_size,),
+        chunks=(variants_chunk_size, alleles.shape[1]),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug("encoded variant_allele")

     # TODO remove this?
     a = root.empty(
-        "call_genotype_phased",
+        name="call_genotype_phased",
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
         compressor=default_compressor,
+        **ZARR_FORMAT_KWARGS,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

     shape += [ploidy]
     dimensions += ["ploidy"]
     a = root.empty(
-        "call_genotype",
+        name="call_genotype",
         dtype="i1",
         shape=list(shape),
         chunks=list(chunks),
         compressor=default_compressor,
+        **ZARR_FORMAT_KWARGS,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

     a = root.empty(
-        "call_genotype_mask",
+        name="call_genotype_mask",
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
         compressor=default_compressor,
+        **ZARR_FORMAT_KWARGS,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

@@ -154,7 +160,7 @@ def convert(
     num_slices = max(1, worker_processes * 4)
     slices = core.chunk_aligned_slices(a, num_slices)

-    total_chunks = sum(a.nchunks for a in root.
+    total_chunks = sum(a.nchunks for _, a in root.arrays())

     progress_config = core.ProgressConfig(
         total=total_chunks, title="Convert", units="chunks", show=show_progress
@@ -171,8 +177,7 @@ def convert(
 # FIXME do this more efficiently - currently reading the whole thing
 # in for convenience, and also comparing call-by-call
 def validate(bed_path, zarr_path):
-
-    root = zarr.group(store=store)
+    root = zarr.open(store=zarr_path, mode="r")
     call_genotype = root["call_genotype"][:]

     bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
bio2zarr/vcf2zarr/icf.py
CHANGED
@@ -110,7 +110,7 @@ class VcfPartition:
     num_records: int = -1


-ICF_METADATA_FORMAT_VERSION = "0.
+ICF_METADATA_FORMAT_VERSION = "0.4"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
 )
@@ -212,6 +212,7 @@ def fixed_vcf_field_definitions():
         make_field_def("FILTERS", "String", "."),
         make_field_def("REF", "String", "1"),
         make_field_def("ALT", "String", "."),
+        make_field_def("rlen", "Integer", "1"),  # computed field
     ]
     return fields

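The new "rlen" entry is a computed field rather than one parsed from the VCF: as the writer change later in this file shows, it is filled with variant.end - variant.start from cyvcf2, i.e. the record's 0-based half-open span on the reference (the REF length for simple records, or the END-derived span for symbolic alleles). A hedged illustration, with "example.vcf.gz" as a placeholder path:

from cyvcf2 import VCF

for variant in VCF("example.vcf.gz"):
    rlen = variant.end - variant.start  # reference span used for the rlen field
    print(variant.CHROM, variant.POS, variant.REF, rlen)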
@@ -240,7 +241,7 @@ def scan_vcf(path, target_num_partitions):
     for h in vcf.header_iter():
         if h["HeaderType"] in ["INFO", "FORMAT"]:
             field = VcfField.from_header(h)
-            if field.name == "GT":
+            if h["HeaderType"] == "FORMAT" and field.name == "GT":
                 field.vcf_type = "Integer"
                 field.vcf_number = "."
             fields.append(field)
@@ -300,7 +301,11 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     )
     with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
         for path in paths:
-            pwm.submit(
+            pwm.submit(
+                scan_vcf,
+                path,
+                max(1, target_num_partitions // len(paths)),
+            )
         results = list(pwm.results_as_completed())

     # Sort to make the ordering deterministic
@@ -408,7 +413,7 @@ def sanitise_value_float_1d(buff, j, value):
     if value is None:
         buff[j] = constants.FLOAT32_MISSING
     else:
-        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
+        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
@@ -422,7 +427,7 @@ def sanitise_value_float_2d(buff, j, value):
         buff[j] = constants.FLOAT32_MISSING
     else:
         # print("value = ", value)
-        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=False)
+        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
         buff[j] = constants.FLOAT32_FILL
         buff[j, :, : value.shape[1]] = value

@@ -432,7 +437,7 @@ def sanitise_int_array(value, ndmin, dtype):
        value = [
            constants.VCF_INT_MISSING if x is None else x for x in value
        ]  # NEEDS TEST
-    value = np.array(value, ndmin=ndmin, copy=False)
+    value = np.array(value, ndmin=ndmin, copy=True)
     value[value == constants.VCF_INT_MISSING] = -1
     value[value == constants.VCF_INT_FILL] = -2
     # TODO watch out for clipping here!
@@ -494,15 +499,15 @@ class VcfValueTransformer:
     def transform(self, vcf_value):
         if isinstance(vcf_value, tuple):
             vcf_value = [self.missing if v is None else v for v in vcf_value]
-        value = np.array(vcf_value, ndmin=self.dimension, copy=False)
+        value = np.array(vcf_value, ndmin=self.dimension, copy=True)
         return value

     def transform_and_update_bounds(self, vcf_value):
         if vcf_value is None:
             return None
+        # print(self, self.field.full_name, "T", vcf_value)
         value = self.transform(vcf_value)
         self.update_bounds(value)
-        # print(self.field.full_name, "T", vcf_value, "->", value)
         return value


@@ -531,13 +536,15 @@ class FloatValueTransformer(VcfValueTransformer):
 class StringValueTransformer(VcfValueTransformer):
     def update_bounds(self, value):
         summary = self.field.summary
-
+        if self.field.category == "FORMAT":
+            number = max(len(v) for v in value)
+        else:
+            number = value.shape[-1]
         # TODO would be nice to report string lengths, but not
         # really necessary.
         summary.max_number = max(summary.max_number, number)

     def transform(self, vcf_value):
-        # print("transform", vcf_value)
         if self.dimension == 1:
             value = np.array(list(vcf_value.split(",")))
         else:
@@ -853,11 +860,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):

     def summary_table(self):
         data = []
-        for name,
-            summary =
+        for name, icf_field in self.fields.items():
+            summary = icf_field.vcf_field.summary
             d = {
                 "name": name,
-                "type":
+                "type": icf_field.vcf_field.vcf_type,
                 "chunks": summary.num_chunks,
                 "size": core.display_size(summary.uncompressed_size),
                 "compressed": core.display_size(summary.compressed_size),
@@ -962,7 +969,7 @@ class IntermediateColumnarFormatWriter:
         compressor=None,
     ):
         if self.path.exists():
-            raise ValueError("ICF path already exists")
+            raise ValueError(f"ICF path already exists: {self.path}")
         if compressor is None:
             compressor = ICF_DEFAULT_COMPRESSOR
         vcfs = [pathlib.Path(vcf) for vcf in vcfs]
@@ -1009,8 +1016,8 @@ class IntermediateColumnarFormatWriter:
         self.path.mkdir()
         self.wip_path.mkdir()
         for field in self.metadata.fields:
-
-
+            field_path = get_vcf_field_path(self.path, field)
+            field_path.mkdir(parents=True)

     def load_partition_summaries(self):
         summaries = []
@@ -1074,13 +1081,19 @@ class IntermediateColumnarFormatWriter:
                 tcw.append("FILTERS", variant.FILTERS)
                 tcw.append("REF", variant.REF)
                 tcw.append("ALT", variant.ALT)
+                tcw.append("rlen", variant.end - variant.start)
                 for field in info_fields:
                     tcw.append(field.full_name, variant.INFO.get(field.name, None))
                 if has_gt:
-
+                    if variant.genotype is None:
+                        val = None
+                    else:
+                        val = variant.genotype.array()
+                    tcw.append("FORMAT/GT", val)
                 for field in format_fields:
                     val = variant.format(field.name)
                     tcw.append(field.full_name, val)
+
                 # Note: an issue with updating the progress per variant here like
                 # this is that we get a significant pause at the end of the counter
                 # while all the "small" fields get flushed. Possibly not much to be