bio2zarr 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +126 -25
- bio2zarr/core.py +31 -3
- bio2zarr/vcf.py +754 -475
- bio2zarr/vcf_utils.py +25 -16
- bio2zarr-0.0.9.dist-info/METADATA +363 -0
- bio2zarr-0.0.9.dist-info/RECORD +16 -0
- bio2zarr-0.0.5.dist-info/METADATA +0 -33
- bio2zarr-0.0.5.dist-info/RECORD +0 -16
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
|
@@ -5,6 +5,7 @@ import shutil
|
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
7
|
import coloredlogs
|
|
8
|
+
import humanfriendly
|
|
8
9
|
import numcodecs
|
|
9
10
|
import tabulate
|
|
10
11
|
|
|
@@ -39,6 +40,14 @@ new_zarr_path = click.argument(
|
|
|
39
40
|
"zarr_path", type=click.Path(file_okay=False, dir_okay=True)
|
|
40
41
|
)
|
|
41
42
|
|
|
43
|
+
# Positional argument for a Zarr directory that must already exist
# (exists=True, directories only). Contrast with new_zarr_path, which
# does not require the path to exist.
zarr_path = click.argument(
    "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
)

# Positional argument: requested number of partitions; must be an integer >= 1.
num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))

# Positional argument: zero-based partition index; must be an integer >= 0.
# No upper bound is enforced at the CLI layer.
partition = click.argument("partition", type=click.IntRange(min=0))
|
|
50
|
+
|
|
42
51
|
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
|
|
43
52
|
|
|
44
53
|
force = click.option(
|
|
@@ -92,6 +101,27 @@ samples_chunk_size = click.option(
|
|
|
92
101
|
help="Chunk size in the samples dimension",
|
|
93
102
|
)
|
|
94
103
|
|
|
104
|
+
# Optional path to an existing schema file. The commands in this module
# forward the value as ``schema_path`` to the vcf encode functions.
schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))

# Shared option definition so that every command taking it exposes the same
# flags and help text.
max_variant_chunks = click.option(
    "-V",
    "--max-variant-chunks",
    type=int,
    default=None,
    help=(
        "Truncate the output in the variants dimension to have "
        "this number of chunks. Mainly intended to help with "
        "schema tuning."
    ),
)
|
|
117
|
+
|
|
118
|
+
# Shared option definition for the approximate memory bound. No ``type`` is
# given, so click passes the raw string (e.g. "10G") through to the command.
max_memory = click.option(
    "-M",
    "--max-memory",
    default=None,
    # Help text previously ended with a stray trailing comma.
    help="An approximate bound on overall memory usage (e.g. 10G).",
)
|
|
124
|
+
|
|
95
125
|
|
|
96
126
|
def setup_logging(verbosity):
|
|
97
127
|
level = "WARNING"
|
|
@@ -158,7 +188,7 @@ def explode(
|
|
|
158
188
|
@click.command
|
|
159
189
|
@vcfs
|
|
160
190
|
@new_icf_path
|
|
161
|
-
@
|
|
191
|
+
@num_partitions
|
|
162
192
|
@force
|
|
163
193
|
@column_chunk_size
|
|
164
194
|
@compressor
|
|
@@ -194,7 +224,7 @@ def dexplode_init(
|
|
|
194
224
|
|
|
195
225
|
@click.command
|
|
196
226
|
@icf_path
|
|
197
|
-
@
|
|
227
|
+
@partition
|
|
198
228
|
@verbose
|
|
199
229
|
def dexplode_partition(icf_path, partition, verbose):
|
|
200
230
|
"""
|
|
@@ -203,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
|
|
|
203
233
|
from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
|
|
204
234
|
"""
|
|
205
235
|
setup_logging(verbose)
|
|
206
|
-
vcf.explode_partition(icf_path, partition
|
|
236
|
+
vcf.explode_partition(icf_path, partition)
|
|
207
237
|
|
|
208
238
|
|
|
209
239
|
@click.command
|
|
210
|
-
@
|
|
240
|
+
@icf_path
|
|
211
241
|
@verbose
|
|
212
|
-
def dexplode_finalise(
|
|
242
|
+
def dexplode_finalise(icf_path, verbose):
|
|
213
243
|
"""
|
|
214
244
|
Final step for distributed conversion of VCF(s) to intermediate columnar format.
|
|
215
245
|
"""
|
|
216
246
|
setup_logging(verbose)
|
|
217
|
-
vcf.explode_finalise(
|
|
247
|
+
vcf.explode_finalise(icf_path)
|
|
218
248
|
|
|
219
249
|
|
|
220
250
|
@click.command
|
|
@@ -244,26 +274,11 @@ def mkschema(icf_path):
|
|
|
244
274
|
@new_zarr_path
|
|
245
275
|
@force
|
|
246
276
|
@verbose
|
|
247
|
-
@
|
|
277
|
+
@schema
|
|
248
278
|
@variants_chunk_size
|
|
249
279
|
@samples_chunk_size
|
|
250
|
-
@
|
|
251
|
-
|
|
252
|
-
"--max-variant-chunks",
|
|
253
|
-
type=int,
|
|
254
|
-
default=None,
|
|
255
|
-
help=(
|
|
256
|
-
"Truncate the output in the variants dimension to have "
|
|
257
|
-
"this number of chunks. Mainly intended to help with "
|
|
258
|
-
"schema tuning."
|
|
259
|
-
),
|
|
260
|
-
)
|
|
261
|
-
@click.option(
|
|
262
|
-
"-M",
|
|
263
|
-
"--max-memory",
|
|
264
|
-
default=None,
|
|
265
|
-
help="An approximate bound on overall memory usage (e.g. 10G),",
|
|
266
|
-
)
|
|
280
|
+
@max_variant_chunks
|
|
281
|
+
@max_memory
|
|
267
282
|
@worker_processes
|
|
268
283
|
def encode(
|
|
269
284
|
icf_path,
|
|
@@ -288,13 +303,96 @@ def encode(
|
|
|
288
303
|
schema_path=schema,
|
|
289
304
|
variants_chunk_size=variants_chunk_size,
|
|
290
305
|
samples_chunk_size=samples_chunk_size,
|
|
291
|
-
|
|
306
|
+
max_variant_chunks=max_variant_chunks,
|
|
292
307
|
worker_processes=worker_processes,
|
|
293
308
|
max_memory=max_memory,
|
|
294
309
|
show_progress=True,
|
|
295
310
|
)
|
|
296
311
|
|
|
297
312
|
|
|
313
|
+
@click.command
@icf_path
@new_zarr_path
@num_partitions
@force
@schema
@variants_chunk_size
@samples_chunk_size
@max_variant_chunks
@verbose
def dencode_init(
    icf_path,
    zarr_path,
    num_partitions,
    force,
    schema,
    variants_chunk_size,
    samples_chunk_size,
    max_variant_chunks,
    verbose,
):
    """
    Initialise conversion of intermediate format to VCF Zarr. This will
    set up the specified ZARR_PATH to perform this conversion over
    NUM_PARTITIONS.

    The output of this command is the actual number of partitions generated
    (which may be less than the requested number, if there are not sufficient
    chunks in the variants dimension) and a rough lower-bound on the amount
    of memory required to encode a partition.

    NOTE: the format of this output will likely change in subsequent releases;
    it should not be considered machine-readable for now.
    """
    # The docstring above is user-facing: click renders it as the --help text
    # for this command, so it is kept free of developer-only detail.
    setup_logging(verbose)
    check_overwrite_dir(zarr_path, force)
    # encode_init reports the partition count it actually created, which
    # replaces the requested value, plus a per-partition memory estimate.
    num_partitions, max_memory = vcf.encode_init(
        icf_path,
        zarr_path,
        target_num_partitions=num_partitions,
        schema_path=schema,
        variants_chunk_size=variants_chunk_size,
        samples_chunk_size=samples_chunk_size,
        max_variant_chunks=max_variant_chunks,
        show_progress=True,
    )
    formatted_size = humanfriendly.format_size(max_memory, binary=True)
    # NOTE adding the size to the stdout here so that users can parse it
    # and use in their submission scripts. This is a first pass, and
    # will most likely change as we see what works and doesn't.
    # NOTE we probably want to format this as a table, which lists
    # some other properties, line by line
    # NOTE This size number is also not quite enough, you need a bit of
    # headroom with it (probably 10% or so). We should include this.
    click.echo(f"{num_partitions}\t{formatted_size}")
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
@click.command
@zarr_path
@partition
@verbose
def dencode_partition(zarr_path, partition, verbose):
    """
    Convert a partition from intermediate columnar format to VCF Zarr.
    Must be called *after* the Zarr path has been initialised with dencode_init.
    Partition indexes must be from 0 (inclusive) to the number of partitions
    returned by dencode_init (exclusive).
    """
    # The docstring above is user-facing: click renders it as the --help text.
    # The upper bound on the partition index is not checked here; only the
    # ">= 0" constraint is enforced by the argument type.
    setup_logging(verbose)
    vcf.encode_partition(zarr_path, partition)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
@click.command
@zarr_path
@verbose
def dencode_finalise(zarr_path, verbose):
    """
    Final step for distributed conversion of ICF to VCF Zarr.
    """
    # The docstring above is user-facing: click renders it as the --help text.
    setup_logging(verbose)
    # Progress display is hard-coded on; the command exposes no flag for it.
    vcf.encode_finalise(zarr_path, show_progress=True)
|
|
394
|
+
|
|
395
|
+
|
|
298
396
|
@click.command(name="convert")
|
|
299
397
|
@vcfs
|
|
300
398
|
@new_zarr_path
|
|
@@ -382,6 +480,9 @@ vcf2zarr.add_command(encode)
|
|
|
382
480
|
vcf2zarr.add_command(dexplode_init)
|
|
383
481
|
vcf2zarr.add_command(dexplode_partition)
|
|
384
482
|
vcf2zarr.add_command(dexplode_finalise)
|
|
483
|
+
vcf2zarr.add_command(dencode_init)
|
|
484
|
+
vcf2zarr.add_command(dencode_partition)
|
|
485
|
+
vcf2zarr.add_command(dencode_finalise)
|
|
385
486
|
|
|
386
487
|
|
|
387
488
|
@click.command(name="convert")
|
bio2zarr/core.py
CHANGED
|
@@ -3,6 +3,8 @@ import contextlib
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import logging
|
|
5
5
|
import multiprocessing
|
|
6
|
+
import os
|
|
7
|
+
import os.path
|
|
6
8
|
import threading
|
|
7
9
|
import time
|
|
8
10
|
|
|
@@ -16,6 +18,16 @@ logger = logging.getLogger(__name__)
|
|
|
16
18
|
numcodecs.blosc.use_threads = False
|
|
17
19
|
|
|
18
20
|
|
|
21
|
+
def min_int_dtype(min_value, max_value):
    """
    Return the code of the narrowest signed integer dtype spanning a range.

    Candidates are tried from 1 to 8 bytes wide ("i1", "i2", "i4", "i8") and
    the first whose representable range contains both ``min_value`` and
    ``max_value`` is returned as a string.

    Raises ValueError if ``min_value`` exceeds ``max_value``, and
    OverflowError if not even the 8-byte type can hold the range.
    """
    if min_value > max_value:
        raise ValueError("min_value must be <= max_value")
    candidates = ("i1", "i2", "i4", "i8")
    for code in candidates:
        bounds = np.iinfo(code)
        fits = bounds.min <= min_value and max_value <= bounds.max
        if fits:
            return code
    raise OverflowError("Integer cannot be represented")
|
|
29
|
+
|
|
30
|
+
|
|
19
31
|
def chunk_aligned_slices(z, n, max_chunks=None):
|
|
20
32
|
"""
|
|
21
33
|
Returns at n slices in the specified zarr array, aligned
|
|
@@ -35,6 +47,22 @@ def chunk_aligned_slices(z, n, max_chunks=None):
|
|
|
35
47
|
return slices
|
|
36
48
|
|
|
37
49
|
|
|
50
|
+
def du(path):
    """
    Compute the number of bytes occupied by everything rooted at ``path``.

    The size reported by ``os.path.getsize`` for ``path`` itself is added to
    the sizes of every sub-directory entry and file that ``os.walk`` finds
    beneath it. The total is logged at debug level and returned.
    """
    total = os.path.getsize(path)
    # os.walk is used because pathlib has no walk method before 3.12.
    for parent, subdirs, filenames in os.walk(path):
        # Directory entries contribute their own size as well as files.
        total += sum(
            os.path.getsize(os.path.join(parent, entry))
            for entry in subdirs + filenames
        )
    logger.debug(f"du({path}) = {total}")
    return total
|
|
64
|
+
|
|
65
|
+
|
|
38
66
|
class SynchronousExecutor(cf.Executor):
|
|
39
67
|
def submit(self, fn, /, *args, **kwargs):
|
|
40
68
|
future = cf.Future()
|
|
@@ -100,6 +128,7 @@ class BufferedArray:
|
|
|
100
128
|
sync_flush_2d_array(
|
|
101
129
|
self.buff[: self.buffer_row], self.array, self.array_offset
|
|
102
130
|
)
|
|
131
|
+
# FIXME the array.name doesn't seem to be working here for some reason
|
|
103
132
|
logger.debug(
|
|
104
133
|
f"Flushed <{self.array.name} {self.array.shape} "
|
|
105
134
|
f"{self.array.dtype}> "
|
|
@@ -121,8 +150,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
|
|
|
121
150
|
# encoder implementations.
|
|
122
151
|
s = slice(offset, offset + np_buffer.shape[0])
|
|
123
152
|
samples_chunk_size = zarr_array.chunks[1]
|
|
124
|
-
# TODO use zarr chunks here
|
|
125
|
-
# and for simplicity
|
|
153
|
+
# TODO use zarr chunks here for simplicity
|
|
126
154
|
zarr_array_width = zarr_array.shape[1]
|
|
127
155
|
start = 0
|
|
128
156
|
while start < zarr_array_width:
|
|
@@ -182,7 +210,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
182
210
|
self.progress_config = progress_config
|
|
183
211
|
self.progress_bar = tqdm.tqdm(
|
|
184
212
|
total=progress_config.total,
|
|
185
|
-
desc=f"{progress_config.title:>
|
|
213
|
+
desc=f"{progress_config.title:>8}",
|
|
186
214
|
unit_scale=True,
|
|
187
215
|
unit=progress_config.units,
|
|
188
216
|
smoothing=0.1,
|