bio2zarr 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__init__.py +1 -1
- bio2zarr/__main__.py +2 -0
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +129 -32
- bio2zarr/core.py +18 -9
- bio2zarr/plink.py +6 -8
- bio2zarr/typing.py +1 -1
- bio2zarr/vcf.py +642 -386
- bio2zarr/vcf_utils.py +26 -8
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/METADATA +1 -1
- bio2zarr-0.0.6.dist-info/RECORD +16 -0
- bio2zarr-0.0.4.dist-info/RECORD +0 -16
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/top_level.txt +0 -0
bio2zarr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from .
|
|
1
|
+
from .provenance import __version__ # noqa F401
|
bio2zarr/__main__.py
CHANGED
|
@@ -2,11 +2,13 @@ import click
|
|
|
2
2
|
|
|
3
3
|
from . import cli
|
|
4
4
|
|
|
5
|
+
|
|
5
6
|
@cli.version
|
|
6
7
|
@click.group()
|
|
7
8
|
def bio2zarr():
|
|
8
9
|
pass
|
|
9
10
|
|
|
11
|
+
|
|
10
12
|
# Provide a single top-level interface to all of the functionality.
|
|
11
13
|
# This probably isn't the recommended way of interacting, as we
|
|
12
14
|
# install individual commands as console scripts. However, this
|
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
|
@@ -4,15 +4,12 @@ import pathlib
|
|
|
4
4
|
import shutil
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
|
-
import tabulate
|
|
8
7
|
import coloredlogs
|
|
8
|
+
import humanfriendly
|
|
9
9
|
import numcodecs
|
|
10
|
+
import tabulate
|
|
10
11
|
|
|
11
|
-
from . import vcf
|
|
12
|
-
from . import vcf_utils
|
|
13
|
-
from . import plink
|
|
14
|
-
from . import provenance
|
|
15
|
-
|
|
12
|
+
from . import plink, provenance, vcf, vcf_utils
|
|
16
13
|
|
|
17
14
|
logger = logging.getLogger(__name__)
|
|
18
15
|
|
|
@@ -43,6 +40,14 @@ new_zarr_path = click.argument(
|
|
|
43
40
|
"zarr_path", type=click.Path(file_okay=False, dir_okay=True)
|
|
44
41
|
)
|
|
45
42
|
|
|
43
|
+
zarr_path = click.argument(
|
|
44
|
+
"zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
|
|
48
|
+
|
|
49
|
+
partition = click.argument("partition", type=click.IntRange(min=0))
|
|
50
|
+
|
|
46
51
|
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
|
|
47
52
|
|
|
48
53
|
force = click.option(
|
|
@@ -75,7 +80,7 @@ compressor = click.option(
|
|
|
75
80
|
"--compressor",
|
|
76
81
|
type=click.Choice(["lz4", "zstd"]),
|
|
77
82
|
default=None,
|
|
78
|
-
help="Codec to use for compressing column chunks (Default=zstd)."
|
|
83
|
+
help="Codec to use for compressing column chunks (Default=zstd).",
|
|
79
84
|
)
|
|
80
85
|
|
|
81
86
|
# Note: -l and -w were chosen when these were called "width" and "length".
|
|
@@ -96,6 +101,27 @@ samples_chunk_size = click.option(
|
|
|
96
101
|
help="Chunk size in the samples dimension",
|
|
97
102
|
)
|
|
98
103
|
|
|
104
|
+
schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))
|
|
105
|
+
|
|
106
|
+
max_variant_chunks = click.option(
|
|
107
|
+
"-V",
|
|
108
|
+
"--max-variant-chunks",
|
|
109
|
+
type=int,
|
|
110
|
+
default=None,
|
|
111
|
+
help=(
|
|
112
|
+
"Truncate the output in the variants dimension to have "
|
|
113
|
+
"this number of chunks. Mainly intended to help with "
|
|
114
|
+
"schema tuning."
|
|
115
|
+
),
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
max_memory = click.option(
|
|
119
|
+
"-M",
|
|
120
|
+
"--max-memory",
|
|
121
|
+
default=None,
|
|
122
|
+
help="An approximate bound on overall memory usage (e.g. 10G),",
|
|
123
|
+
)
|
|
124
|
+
|
|
99
125
|
|
|
100
126
|
def setup_logging(verbosity):
|
|
101
127
|
level = "WARNING"
|
|
@@ -162,7 +188,7 @@ def explode(
|
|
|
162
188
|
@click.command
|
|
163
189
|
@vcfs
|
|
164
190
|
@new_icf_path
|
|
165
|
-
@
|
|
191
|
+
@num_partitions
|
|
166
192
|
@force
|
|
167
193
|
@column_chunk_size
|
|
168
194
|
@compressor
|
|
@@ -198,7 +224,7 @@ def dexplode_init(
|
|
|
198
224
|
|
|
199
225
|
@click.command
|
|
200
226
|
@icf_path
|
|
201
|
-
@
|
|
227
|
+
@partition
|
|
202
228
|
@verbose
|
|
203
229
|
def dexplode_partition(icf_path, partition, verbose):
|
|
204
230
|
"""
|
|
@@ -207,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
|
|
|
207
233
|
from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
|
|
208
234
|
"""
|
|
209
235
|
setup_logging(verbose)
|
|
210
|
-
vcf.explode_partition(icf_path, partition, show_progress=
|
|
236
|
+
vcf.explode_partition(icf_path, partition, show_progress=False)
|
|
211
237
|
|
|
212
238
|
|
|
213
239
|
@click.command
|
|
214
|
-
@
|
|
240
|
+
@icf_path
|
|
215
241
|
@verbose
|
|
216
|
-
def dexplode_finalise(
|
|
242
|
+
def dexplode_finalise(icf_path, verbose):
|
|
217
243
|
"""
|
|
218
244
|
Final step for distributed conversion of VCF(s) to intermediate columnar format.
|
|
219
245
|
"""
|
|
220
246
|
setup_logging(verbose)
|
|
221
|
-
vcf.explode_finalise(
|
|
247
|
+
vcf.explode_finalise(icf_path)
|
|
222
248
|
|
|
223
249
|
|
|
224
250
|
@click.command
|
|
@@ -248,26 +274,11 @@ def mkschema(icf_path):
|
|
|
248
274
|
@new_zarr_path
|
|
249
275
|
@force
|
|
250
276
|
@verbose
|
|
251
|
-
@
|
|
277
|
+
@schema
|
|
252
278
|
@variants_chunk_size
|
|
253
279
|
@samples_chunk_size
|
|
254
|
-
@
|
|
255
|
-
|
|
256
|
-
"--max-variant-chunks",
|
|
257
|
-
type=int,
|
|
258
|
-
default=None,
|
|
259
|
-
help=(
|
|
260
|
-
"Truncate the output in the variants dimension to have "
|
|
261
|
-
"this number of chunks. Mainly intended to help with "
|
|
262
|
-
"schema tuning."
|
|
263
|
-
),
|
|
264
|
-
)
|
|
265
|
-
@click.option(
|
|
266
|
-
"-M",
|
|
267
|
-
"--max-memory",
|
|
268
|
-
default=None,
|
|
269
|
-
help="An approximate bound on overall memory usage (e.g. 10G),",
|
|
270
|
-
)
|
|
280
|
+
@max_variant_chunks
|
|
281
|
+
@max_memory
|
|
271
282
|
@worker_processes
|
|
272
283
|
def encode(
|
|
273
284
|
icf_path,
|
|
@@ -292,13 +303,96 @@ def encode(
|
|
|
292
303
|
schema_path=schema,
|
|
293
304
|
variants_chunk_size=variants_chunk_size,
|
|
294
305
|
samples_chunk_size=samples_chunk_size,
|
|
295
|
-
|
|
306
|
+
max_variant_chunks=max_variant_chunks,
|
|
296
307
|
worker_processes=worker_processes,
|
|
297
308
|
max_memory=max_memory,
|
|
298
309
|
show_progress=True,
|
|
299
310
|
)
|
|
300
311
|
|
|
301
312
|
|
|
313
|
+
@click.command
|
|
314
|
+
@icf_path
|
|
315
|
+
@new_zarr_path
|
|
316
|
+
@num_partitions
|
|
317
|
+
@force
|
|
318
|
+
@schema
|
|
319
|
+
@variants_chunk_size
|
|
320
|
+
@samples_chunk_size
|
|
321
|
+
@max_variant_chunks
|
|
322
|
+
@verbose
|
|
323
|
+
def dencode_init(
|
|
324
|
+
icf_path,
|
|
325
|
+
zarr_path,
|
|
326
|
+
num_partitions,
|
|
327
|
+
force,
|
|
328
|
+
schema,
|
|
329
|
+
variants_chunk_size,
|
|
330
|
+
samples_chunk_size,
|
|
331
|
+
max_variant_chunks,
|
|
332
|
+
verbose,
|
|
333
|
+
):
|
|
334
|
+
"""
|
|
335
|
+
Initialise conversion of intermediate format to VCF Zarr. This will
|
|
336
|
+
set up the specified ZARR_PATH to perform this conversion over
|
|
337
|
+
NUM_PARTITIONS.
|
|
338
|
+
|
|
339
|
+
The output of this command is the actual number of partitions generated
|
|
340
|
+
(which may be less than the requested number, if there are not sufficient
|
|
341
|
+
chunks in the variants dimension) and a rough lower-bound on the amount
|
|
342
|
+
of memory required to encode a partition.
|
|
343
|
+
|
|
344
|
+
NOTE: the format of this output will likely change in subsequent releases;
|
|
345
|
+
it should not be considered machine-readable for now.
|
|
346
|
+
"""
|
|
347
|
+
setup_logging(verbose)
|
|
348
|
+
check_overwrite_dir(zarr_path, force)
|
|
349
|
+
num_partitions, max_memory = vcf.encode_init(
|
|
350
|
+
icf_path,
|
|
351
|
+
zarr_path,
|
|
352
|
+
target_num_partitions=num_partitions,
|
|
353
|
+
schema_path=schema,
|
|
354
|
+
variants_chunk_size=variants_chunk_size,
|
|
355
|
+
samples_chunk_size=samples_chunk_size,
|
|
356
|
+
max_variant_chunks=max_variant_chunks,
|
|
357
|
+
show_progress=True,
|
|
358
|
+
)
|
|
359
|
+
formatted_size = humanfriendly.format_size(max_memory, binary=True)
|
|
360
|
+
# NOTE adding the size to the stdout here so that users can parse it
|
|
361
|
+
# and use in their submission scripts. This is a first pass, and
|
|
362
|
+
# will most likely change as we see what works and doesn't.
|
|
363
|
+
# NOTE we probably want to format this as a table, which lists
|
|
364
|
+
# some other properties, line by line
|
|
365
|
+
# NOTE This size number is also not quite enough, you need a bit of
|
|
366
|
+
# headroom with it (probably 10% or so). We should include this.
|
|
367
|
+
click.echo(f"{num_partitions}\t{formatted_size}")
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
@click.command
|
|
371
|
+
@zarr_path
|
|
372
|
+
@partition
|
|
373
|
+
@verbose
|
|
374
|
+
def dencode_partition(zarr_path, partition, verbose):
|
|
375
|
+
"""
|
|
376
|
+
Convert a partition from intermediate columnar format to VCF Zarr.
|
|
377
|
+
Must be called *after* the Zarr path has been initialised with dencode_init.
|
|
378
|
+
Partition indexes must be from 0 (inclusive) to the number of partitions
|
|
379
|
+
returned by dencode_init (exclusive).
|
|
380
|
+
"""
|
|
381
|
+
setup_logging(verbose)
|
|
382
|
+
vcf.encode_partition(zarr_path, partition)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
@click.command
|
|
386
|
+
@zarr_path
|
|
387
|
+
@verbose
|
|
388
|
+
def dencode_finalise(zarr_path, verbose):
|
|
389
|
+
"""
|
|
390
|
+
Final step for distributed conversion of ICF to VCF Zarr.
|
|
391
|
+
"""
|
|
392
|
+
setup_logging(verbose)
|
|
393
|
+
vcf.encode_finalise(zarr_path, show_progress=True)
|
|
394
|
+
|
|
395
|
+
|
|
302
396
|
@click.command(name="convert")
|
|
303
397
|
@vcfs
|
|
304
398
|
@new_zarr_path
|
|
@@ -386,6 +480,9 @@ vcf2zarr.add_command(encode)
|
|
|
386
480
|
vcf2zarr.add_command(dexplode_init)
|
|
387
481
|
vcf2zarr.add_command(dexplode_partition)
|
|
388
482
|
vcf2zarr.add_command(dexplode_finalise)
|
|
483
|
+
vcf2zarr.add_command(dencode_init)
|
|
484
|
+
vcf2zarr.add_command(dencode_partition)
|
|
485
|
+
vcf2zarr.add_command(dencode_finalise)
|
|
389
486
|
|
|
390
487
|
|
|
391
488
|
@click.command(name="convert")
|
bio2zarr/core.py
CHANGED
|
@@ -1,22 +1,31 @@
|
|
|
1
|
-
import dataclasses
|
|
2
|
-
import contextlib
|
|
3
1
|
import concurrent.futures as cf
|
|
2
|
+
import contextlib
|
|
3
|
+
import dataclasses
|
|
4
|
+
import logging
|
|
4
5
|
import multiprocessing
|
|
5
6
|
import threading
|
|
6
|
-
import logging
|
|
7
7
|
import time
|
|
8
8
|
|
|
9
|
-
import
|
|
9
|
+
import numcodecs
|
|
10
10
|
import numpy as np
|
|
11
11
|
import tqdm
|
|
12
|
-
import
|
|
13
|
-
|
|
12
|
+
import zarr
|
|
14
13
|
|
|
15
14
|
logger = logging.getLogger(__name__)
|
|
16
15
|
|
|
17
16
|
numcodecs.blosc.use_threads = False
|
|
18
17
|
|
|
19
18
|
|
|
19
|
+
def min_int_dtype(min_value, max_value):
|
|
20
|
+
if min_value > max_value:
|
|
21
|
+
raise ValueError("min_value must be <= max_value")
|
|
22
|
+
for a_dtype in ["i1", "i2", "i4", "i8"]:
|
|
23
|
+
info = np.iinfo(a_dtype)
|
|
24
|
+
if info.min <= min_value and max_value <= info.max:
|
|
25
|
+
return a_dtype
|
|
26
|
+
raise OverflowError("Integer cannot be represented")
|
|
27
|
+
|
|
28
|
+
|
|
20
29
|
def chunk_aligned_slices(z, n, max_chunks=None):
|
|
21
30
|
"""
|
|
22
31
|
Returns at n slices in the specified zarr array, aligned
|
|
@@ -101,6 +110,7 @@ class BufferedArray:
|
|
|
101
110
|
sync_flush_2d_array(
|
|
102
111
|
self.buff[: self.buffer_row], self.array, self.array_offset
|
|
103
112
|
)
|
|
113
|
+
# FIXME the array.name doesn't seem to be working here for some reason
|
|
104
114
|
logger.debug(
|
|
105
115
|
f"Flushed <{self.array.name} {self.array.shape} "
|
|
106
116
|
f"{self.array.dtype}> "
|
|
@@ -122,8 +132,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
|
|
|
122
132
|
# encoder implementations.
|
|
123
133
|
s = slice(offset, offset + np_buffer.shape[0])
|
|
124
134
|
samples_chunk_size = zarr_array.chunks[1]
|
|
125
|
-
# TODO use zarr chunks here
|
|
126
|
-
# and for simplicity
|
|
135
|
+
# TODO use zarr chunks here for simplicity
|
|
127
136
|
zarr_array_width = zarr_array.shape[1]
|
|
128
137
|
start = 0
|
|
129
138
|
while start < zarr_array_width:
|
|
@@ -183,7 +192,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
183
192
|
self.progress_config = progress_config
|
|
184
193
|
self.progress_bar = tqdm.tqdm(
|
|
185
194
|
total=progress_config.total,
|
|
186
|
-
desc=f"{progress_config.title:>
|
|
195
|
+
desc=f"{progress_config.title:>8}",
|
|
187
196
|
unit_scale=True,
|
|
188
197
|
unit=progress_config.units,
|
|
189
198
|
smoothing=0.1,
|
bio2zarr/plink.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
|
|
3
|
+
import bed_reader
|
|
3
4
|
import humanfriendly
|
|
5
|
+
import numcodecs
|
|
4
6
|
import numpy as np
|
|
5
7
|
import zarr
|
|
6
|
-
import bed_reader
|
|
7
|
-
import numcodecs
|
|
8
8
|
|
|
9
9
|
from . import core
|
|
10
10
|
|
|
11
|
-
|
|
12
11
|
logger = logging.getLogger(__name__)
|
|
13
12
|
|
|
14
13
|
|
|
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
|
|
|
24
23
|
gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
|
|
25
24
|
gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
|
|
26
25
|
variants_chunk_size = gt.array.chunks[0]
|
|
27
|
-
n = gt.array.shape[1]
|
|
28
26
|
assert start % variants_chunk_size == 0
|
|
29
27
|
|
|
30
28
|
logger.debug(f"Reading slice {start}:{stop}")
|
|
@@ -96,7 +94,7 @@ def convert(
|
|
|
96
94
|
chunks=(samples_chunk_size,),
|
|
97
95
|
)
|
|
98
96
|
a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
|
|
99
|
-
logger.debug(
|
|
97
|
+
logger.debug("Encoded samples")
|
|
100
98
|
|
|
101
99
|
# TODO encode these in slices - but read them in one go to avoid
|
|
102
100
|
# fetching repeatedly from bim file
|
|
@@ -108,7 +106,7 @@ def convert(
|
|
|
108
106
|
chunks=(variants_chunk_size,),
|
|
109
107
|
)
|
|
110
108
|
a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
|
|
111
|
-
logger.debug(
|
|
109
|
+
logger.debug("encoded variant_position")
|
|
112
110
|
|
|
113
111
|
alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
|
|
114
112
|
a = root.array(
|
|
@@ -119,7 +117,7 @@ def convert(
|
|
|
119
117
|
chunks=(variants_chunk_size,),
|
|
120
118
|
)
|
|
121
119
|
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
|
|
122
|
-
logger.debug(
|
|
120
|
+
logger.debug("encoded variant_allele")
|
|
123
121
|
|
|
124
122
|
# TODO remove this?
|
|
125
123
|
a = root.empty(
|
|
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
|
|
|
201
199
|
elif bed_call == 2:
|
|
202
200
|
assert list(zarr_call) == [1, 1]
|
|
203
201
|
else: # pragma no cover
|
|
204
|
-
|
|
202
|
+
raise AssertionError(f"Unexpected bed call {bed_call}")
|
bio2zarr/typing.py
CHANGED