bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__main__.py +2 -2
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +176 -113
- bio2zarr/constants.py +18 -0
- bio2zarr/core.py +65 -20
- bio2zarr/vcf2zarr/__init__.py +38 -0
- bio2zarr/vcf2zarr/icf.py +1221 -0
- bio2zarr/vcf2zarr/vcz.py +1053 -0
- bio2zarr/vcf2zarr/verification.py +230 -0
- bio2zarr/vcf_utils.py +11 -6
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/METADATA +10 -123
- bio2zarr-0.1.0.dist-info/RECORD +20 -0
- bio2zarr-0.1.0.dist-info/entry_points.txt +3 -0
- bio2zarr/vcf.py +0 -2445
- bio2zarr-0.0.9.dist-info/RECORD +0 -16
- bio2zarr-0.0.9.dist-info/entry_points.txt +0 -4
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/top_level.txt +0 -0
bio2zarr/__main__.py
CHANGED
|
@@ -14,9 +14,9 @@ def bio2zarr():
|
|
|
14
14
|
# install individual commands as console scripts. However, this
|
|
15
15
|
# is handy for development and for those whose PATHs aren't set
|
|
16
16
|
# up in the right way.
|
|
17
|
-
bio2zarr.add_command(cli.
|
|
17
|
+
bio2zarr.add_command(cli.vcf2zarr_main)
|
|
18
18
|
bio2zarr.add_command(cli.plink2zarr)
|
|
19
|
-
bio2zarr.add_command(cli.
|
|
19
|
+
bio2zarr.add_command(cli.vcfpartition)
|
|
20
20
|
|
|
21
21
|
if __name__ == "__main__":
|
|
22
22
|
bio2zarr()
|
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
|
@@ -5,11 +5,11 @@ import shutil
|
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
7
|
import coloredlogs
|
|
8
|
-
import humanfriendly
|
|
9
8
|
import numcodecs
|
|
10
9
|
import tabulate
|
|
11
10
|
|
|
12
|
-
from . import plink, provenance,
|
|
11
|
+
from . import plink, provenance, vcf2zarr, vcf_utils
|
|
12
|
+
from .vcf2zarr import icf as icf_mod
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
15
15
|
|
|
@@ -44,7 +44,13 @@ zarr_path = click.argument(
|
|
|
44
44
|
"zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
|
|
45
45
|
)
|
|
46
46
|
|
|
47
|
-
num_partitions = click.
|
|
47
|
+
num_partitions = click.option(
|
|
48
|
+
"-n",
|
|
49
|
+
"--num-partitions",
|
|
50
|
+
type=click.IntRange(min=1),
|
|
51
|
+
default=None,
|
|
52
|
+
help="Target number of partitions to split into",
|
|
53
|
+
)
|
|
48
54
|
|
|
49
55
|
partition = click.argument("partition", type=click.IntRange(min=0))
|
|
50
56
|
|
|
@@ -58,6 +64,27 @@ force = click.option(
|
|
|
58
64
|
help="Force overwriting of existing directories",
|
|
59
65
|
)
|
|
60
66
|
|
|
67
|
+
progress = click.option(
|
|
68
|
+
"-P /-Q",
|
|
69
|
+
"--progress/--no-progress",
|
|
70
|
+
default=True,
|
|
71
|
+
help="Show progress bars (default: show)",
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
one_based = click.option(
|
|
75
|
+
"--one-based",
|
|
76
|
+
is_flag=True,
|
|
77
|
+
flag_value=True,
|
|
78
|
+
help="Partition indexes are interpreted as one-based",
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
json = click.option(
|
|
82
|
+
"--json",
|
|
83
|
+
is_flag=True,
|
|
84
|
+
flag_value=True,
|
|
85
|
+
help="Output summary data in JSON format",
|
|
86
|
+
)
|
|
87
|
+
|
|
61
88
|
version = click.version_option(version=f"{provenance.__version__}")
|
|
62
89
|
|
|
63
90
|
worker_processes = click.option(
|
|
@@ -151,14 +178,33 @@ def check_overwrite_dir(path, force):
|
|
|
151
178
|
shutil.rmtree(tmp_delete_path)
|
|
152
179
|
|
|
153
180
|
|
|
181
|
+
def check_partitions(num_partitions):
|
|
182
|
+
if num_partitions is None:
|
|
183
|
+
raise click.UsageError(
|
|
184
|
+
"-n/--num-partitions must currently be specified. Future versions "
|
|
185
|
+
"will provide reasonable defaults or other means of specifying "
|
|
186
|
+
"partitions."
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
|
|
154
190
|
def get_compressor(cname):
|
|
155
191
|
if cname is None:
|
|
156
192
|
return None
|
|
157
|
-
config =
|
|
193
|
+
config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
|
|
158
194
|
config["cname"] = cname
|
|
159
195
|
return numcodecs.get_codec(config)
|
|
160
196
|
|
|
161
197
|
|
|
198
|
+
def show_work_summary(work_summary, json):
|
|
199
|
+
if json:
|
|
200
|
+
output = work_summary.asjson()
|
|
201
|
+
else:
|
|
202
|
+
data = work_summary.asdict()
|
|
203
|
+
output = tabulate.tabulate(list(data.items()), tablefmt="plain")
|
|
204
|
+
# output = "\n".join(f"{k}\t{v}" for k, v in data.items())
|
|
205
|
+
click.echo(output)
|
|
206
|
+
|
|
207
|
+
|
|
162
208
|
@click.command
|
|
163
209
|
@vcfs
|
|
164
210
|
@new_icf_path
|
|
@@ -166,22 +212,30 @@ def get_compressor(cname):
|
|
|
166
212
|
@verbose
|
|
167
213
|
@column_chunk_size
|
|
168
214
|
@compressor
|
|
215
|
+
@progress
|
|
169
216
|
@worker_processes
|
|
170
217
|
def explode(
|
|
171
|
-
vcfs,
|
|
218
|
+
vcfs,
|
|
219
|
+
icf_path,
|
|
220
|
+
force,
|
|
221
|
+
verbose,
|
|
222
|
+
column_chunk_size,
|
|
223
|
+
compressor,
|
|
224
|
+
progress,
|
|
225
|
+
worker_processes,
|
|
172
226
|
):
|
|
173
227
|
"""
|
|
174
228
|
Convert VCF(s) to intermediate columnar format
|
|
175
229
|
"""
|
|
176
230
|
setup_logging(verbose)
|
|
177
231
|
check_overwrite_dir(icf_path, force)
|
|
178
|
-
|
|
232
|
+
vcf2zarr.explode(
|
|
179
233
|
icf_path,
|
|
180
234
|
vcfs,
|
|
181
235
|
worker_processes=worker_processes,
|
|
182
236
|
column_chunk_size=column_chunk_size,
|
|
183
237
|
compressor=get_compressor(compressor),
|
|
184
|
-
show_progress=
|
|
238
|
+
show_progress=progress,
|
|
185
239
|
)
|
|
186
240
|
|
|
187
241
|
|
|
@@ -192,7 +246,9 @@ def explode(
|
|
|
192
246
|
@force
|
|
193
247
|
@column_chunk_size
|
|
194
248
|
@compressor
|
|
249
|
+
@json
|
|
195
250
|
@verbose
|
|
251
|
+
@progress
|
|
196
252
|
@worker_processes
|
|
197
253
|
def dexplode_init(
|
|
198
254
|
vcfs,
|
|
@@ -201,39 +257,47 @@ def dexplode_init(
|
|
|
201
257
|
force,
|
|
202
258
|
column_chunk_size,
|
|
203
259
|
compressor,
|
|
260
|
+
json,
|
|
204
261
|
verbose,
|
|
262
|
+
progress,
|
|
205
263
|
worker_processes,
|
|
206
264
|
):
|
|
207
265
|
"""
|
|
208
266
|
Initial step for distributed conversion of VCF(s) to intermediate columnar format
|
|
209
|
-
over
|
|
267
|
+
over some number of partitions.
|
|
210
268
|
"""
|
|
211
269
|
setup_logging(verbose)
|
|
212
270
|
check_overwrite_dir(icf_path, force)
|
|
213
|
-
num_partitions
|
|
271
|
+
check_partitions(num_partitions)
|
|
272
|
+
work_summary = vcf2zarr.explode_init(
|
|
214
273
|
icf_path,
|
|
215
274
|
vcfs,
|
|
216
275
|
target_num_partitions=num_partitions,
|
|
217
276
|
column_chunk_size=column_chunk_size,
|
|
218
277
|
worker_processes=worker_processes,
|
|
219
278
|
compressor=get_compressor(compressor),
|
|
220
|
-
show_progress=
|
|
279
|
+
show_progress=progress,
|
|
221
280
|
)
|
|
222
|
-
|
|
281
|
+
show_work_summary(work_summary, json)
|
|
223
282
|
|
|
224
283
|
|
|
225
284
|
@click.command
|
|
226
285
|
@icf_path
|
|
227
286
|
@partition
|
|
228
287
|
@verbose
|
|
229
|
-
|
|
288
|
+
@one_based
|
|
289
|
+
def dexplode_partition(icf_path, partition, verbose, one_based):
|
|
230
290
|
"""
|
|
231
|
-
Convert a VCF partition to intermediate columnar format. Must be called
|
|
232
|
-
the ICF path has been initialised with dexplode_init.
|
|
233
|
-
from 0
|
|
291
|
+
Convert a VCF partition to intermediate columnar format. Must be called
|
|
292
|
+
after the ICF path has been initialised with dexplode_init. By default,
|
|
293
|
+
partition indexes are from 0 to the number of partitions N (returned by
|
|
294
|
+
dexplode_init), exclusive. If the --one-based option is specified,
|
|
295
|
+
partition indexes are in the range 1 to N, inclusive.
|
|
234
296
|
"""
|
|
235
297
|
setup_logging(verbose)
|
|
236
|
-
|
|
298
|
+
if one_based:
|
|
299
|
+
partition -= 1
|
|
300
|
+
vcf2zarr.explode_partition(icf_path, partition)
|
|
237
301
|
|
|
238
302
|
|
|
239
303
|
@click.command
|
|
@@ -244,7 +308,7 @@ def dexplode_finalise(icf_path, verbose):
|
|
|
244
308
|
Final step for distributed conversion of VCF(s) to intermediate columnar format.
|
|
245
309
|
"""
|
|
246
310
|
setup_logging(verbose)
|
|
247
|
-
|
|
311
|
+
vcf2zarr.explode_finalise(icf_path)
|
|
248
312
|
|
|
249
313
|
|
|
250
314
|
@click.command
|
|
@@ -255,7 +319,7 @@ def inspect(path, verbose):
|
|
|
255
319
|
Inspect an intermediate columnar format or Zarr path.
|
|
256
320
|
"""
|
|
257
321
|
setup_logging(verbose)
|
|
258
|
-
data =
|
|
322
|
+
data = vcf2zarr.inspect(path)
|
|
259
323
|
click.echo(tabulate.tabulate(data, headers="keys"))
|
|
260
324
|
|
|
261
325
|
|
|
@@ -266,7 +330,7 @@ def mkschema(icf_path):
|
|
|
266
330
|
Generate a schema for zarr encoding
|
|
267
331
|
"""
|
|
268
332
|
stream = click.get_text_stream("stdout")
|
|
269
|
-
|
|
333
|
+
vcf2zarr.mkschema(icf_path, stream)
|
|
270
334
|
|
|
271
335
|
|
|
272
336
|
@click.command
|
|
@@ -279,6 +343,7 @@ def mkschema(icf_path):
|
|
|
279
343
|
@samples_chunk_size
|
|
280
344
|
@max_variant_chunks
|
|
281
345
|
@max_memory
|
|
346
|
+
@progress
|
|
282
347
|
@worker_processes
|
|
283
348
|
def encode(
|
|
284
349
|
icf_path,
|
|
@@ -290,6 +355,7 @@ def encode(
|
|
|
290
355
|
samples_chunk_size,
|
|
291
356
|
max_variant_chunks,
|
|
292
357
|
max_memory,
|
|
358
|
+
progress,
|
|
293
359
|
worker_processes,
|
|
294
360
|
):
|
|
295
361
|
"""
|
|
@@ -297,7 +363,7 @@ def encode(
|
|
|
297
363
|
"""
|
|
298
364
|
setup_logging(verbose)
|
|
299
365
|
check_overwrite_dir(zarr_path, force)
|
|
300
|
-
|
|
366
|
+
vcf2zarr.encode(
|
|
301
367
|
icf_path,
|
|
302
368
|
zarr_path,
|
|
303
369
|
schema_path=schema,
|
|
@@ -306,7 +372,7 @@ def encode(
|
|
|
306
372
|
max_variant_chunks=max_variant_chunks,
|
|
307
373
|
worker_processes=worker_processes,
|
|
308
374
|
max_memory=max_memory,
|
|
309
|
-
show_progress=
|
|
375
|
+
show_progress=progress,
|
|
310
376
|
)
|
|
311
377
|
|
|
312
378
|
|
|
@@ -319,6 +385,8 @@ def encode(
|
|
|
319
385
|
@variants_chunk_size
|
|
320
386
|
@samples_chunk_size
|
|
321
387
|
@max_variant_chunks
|
|
388
|
+
@json
|
|
389
|
+
@progress
|
|
322
390
|
@verbose
|
|
323
391
|
def dencode_init(
|
|
324
392
|
icf_path,
|
|
@@ -329,12 +397,14 @@ def dencode_init(
|
|
|
329
397
|
variants_chunk_size,
|
|
330
398
|
samples_chunk_size,
|
|
331
399
|
max_variant_chunks,
|
|
400
|
+
json,
|
|
401
|
+
progress,
|
|
332
402
|
verbose,
|
|
333
403
|
):
|
|
334
404
|
"""
|
|
335
405
|
Initialise conversion of intermediate format to VCF Zarr. This will
|
|
336
406
|
set up the specified ZARR_PATH to perform this conversion over
|
|
337
|
-
|
|
407
|
+
some number of partitions.
|
|
338
408
|
|
|
339
409
|
The output of this command is the actual number of partitions generated
|
|
340
410
|
(which may be less than the requested number, if there is not sufficient
|
|
@@ -346,7 +416,8 @@ def dencode_init(
|
|
|
346
416
|
"""
|
|
347
417
|
setup_logging(verbose)
|
|
348
418
|
check_overwrite_dir(zarr_path, force)
|
|
349
|
-
num_partitions
|
|
419
|
+
check_partitions(num_partitions)
|
|
420
|
+
work_summary = vcf2zarr.encode_init(
|
|
350
421
|
icf_path,
|
|
351
422
|
zarr_path,
|
|
352
423
|
target_num_partitions=num_partitions,
|
|
@@ -354,141 +425,104 @@ def dencode_init(
|
|
|
354
425
|
variants_chunk_size=variants_chunk_size,
|
|
355
426
|
samples_chunk_size=samples_chunk_size,
|
|
356
427
|
max_variant_chunks=max_variant_chunks,
|
|
357
|
-
show_progress=
|
|
428
|
+
show_progress=progress,
|
|
358
429
|
)
|
|
359
|
-
|
|
360
|
-
# NOTE adding the size to the stdout here so that users can parse it
|
|
361
|
-
# and use in their submission scripts. This is a first pass, and
|
|
362
|
-
# will most likely change as we see what works and doesn't.
|
|
363
|
-
# NOTE we probably want to format this as a table, which lists
|
|
364
|
-
# some other properties, line by line
|
|
365
|
-
# NOTE This size number is also not quite enough, you need a bit of
|
|
366
|
-
# headroom with it (probably 10% or so). We should include this.
|
|
367
|
-
click.echo(f"{num_partitions}\t{formatted_size}")
|
|
430
|
+
show_work_summary(work_summary, json)
|
|
368
431
|
|
|
369
432
|
|
|
370
433
|
@click.command
|
|
371
434
|
@zarr_path
|
|
372
435
|
@partition
|
|
373
436
|
@verbose
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
Convert a partition from intermediate columnar format to VCF Zarr.
|
|
377
|
-
Must be called *after* the Zarr path has been initialised with dencode_init.
|
|
378
|
-
Partition indexes must be from 0 (inclusive) to the number of paritions
|
|
379
|
-
returned by dencode_init (exclusive).
|
|
437
|
+
@one_based
|
|
438
|
+
def dencode_partition(zarr_path, partition, verbose, one_based):
|
|
380
439
|
"""
|
|
440
|
+
Convert a partition from intermediate columnar format to VCF Zarr. Must be
|
|
441
|
+
called after the Zarr path has been initialised with dencode_init. By
|
|
442
|
+
default, partition indexes are from 0 to the number of partitions N
|
|
443
|
+
(returned by dencode_init), exclusive. If the --one-based option is
|
|
444
|
+
specified, partition indexes are in the range 1 to N, inclusive."""
|
|
381
445
|
setup_logging(verbose)
|
|
382
|
-
|
|
446
|
+
if one_based:
|
|
447
|
+
partition -= 1
|
|
448
|
+
vcf2zarr.encode_partition(zarr_path, partition)
|
|
383
449
|
|
|
384
450
|
|
|
385
451
|
@click.command
|
|
386
452
|
@zarr_path
|
|
387
453
|
@verbose
|
|
388
|
-
|
|
454
|
+
@progress
|
|
455
|
+
def dencode_finalise(zarr_path, verbose, progress):
|
|
389
456
|
"""
|
|
390
457
|
Final step for distributed conversion of ICF to VCF Zarr.
|
|
391
458
|
"""
|
|
392
459
|
setup_logging(verbose)
|
|
393
|
-
|
|
460
|
+
vcf2zarr.encode_finalise(zarr_path, show_progress=progress)
|
|
394
461
|
|
|
395
462
|
|
|
396
463
|
@click.command(name="convert")
|
|
397
464
|
@vcfs
|
|
398
465
|
@new_zarr_path
|
|
466
|
+
@force
|
|
399
467
|
@variants_chunk_size
|
|
400
468
|
@samples_chunk_size
|
|
401
469
|
@verbose
|
|
470
|
+
@progress
|
|
402
471
|
@worker_processes
|
|
403
472
|
def convert_vcf(
|
|
404
|
-
vcfs,
|
|
473
|
+
vcfs,
|
|
474
|
+
zarr_path,
|
|
475
|
+
force,
|
|
476
|
+
variants_chunk_size,
|
|
477
|
+
samples_chunk_size,
|
|
478
|
+
verbose,
|
|
479
|
+
progress,
|
|
480
|
+
worker_processes,
|
|
405
481
|
):
|
|
406
482
|
"""
|
|
407
483
|
Convert input VCF(s) directly to vcfzarr (not recommended for large files).
|
|
408
484
|
"""
|
|
409
485
|
setup_logging(verbose)
|
|
410
|
-
|
|
486
|
+
check_overwrite_dir(zarr_path, force)
|
|
487
|
+
vcf2zarr.convert(
|
|
411
488
|
vcfs,
|
|
412
489
|
zarr_path,
|
|
413
490
|
variants_chunk_size=variants_chunk_size,
|
|
414
491
|
samples_chunk_size=samples_chunk_size,
|
|
415
|
-
show_progress=
|
|
492
|
+
show_progress=progress,
|
|
416
493
|
worker_processes=worker_processes,
|
|
417
494
|
)
|
|
418
495
|
|
|
419
496
|
|
|
420
497
|
@version
|
|
421
|
-
@click.group(cls=NaturalOrderGroup)
|
|
422
|
-
def
|
|
498
|
+
@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
|
|
499
|
+
def vcf2zarr_main():
|
|
423
500
|
"""
|
|
424
501
|
Convert VCF file(s) to the vcfzarr format.
|
|
425
502
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
$ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
|
|
429
|
-
|
|
430
|
-
This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
|
|
431
|
-
step. As this writes the intermediate columnar format to a temporary directory,
|
|
432
|
-
we only recommend this approach for small files (< 1GB, say).
|
|
433
|
-
|
|
434
|
-
The recommended approach is to run the conversion in two passes, and
|
|
435
|
-
to keep the intermediate columnar format ("exploded") around to facilitate
|
|
436
|
-
experimentation with chunk sizes and compression settings:
|
|
437
|
-
|
|
438
|
-
\b
|
|
439
|
-
$ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
|
|
440
|
-
$ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
|
|
441
|
-
|
|
442
|
-
The inspect command provides a way to view contents of an exploded ICF
|
|
443
|
-
or Zarr:
|
|
444
|
-
|
|
445
|
-
$ vcf2zarr inspect [PATH]
|
|
446
|
-
|
|
447
|
-
This is useful when tweaking chunk sizes and compression settings to suit
|
|
448
|
-
your dataset, using the mkschema command and --schema option to encode:
|
|
449
|
-
|
|
450
|
-
\b
|
|
451
|
-
$ vcf2zarr mkschema [ICF_PATH] > schema.json
|
|
452
|
-
$ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
|
|
453
|
-
|
|
454
|
-
By editing the schema.json file you can drop columns that are not of interest
|
|
455
|
-
and edit column specific compression settings. The --max-variant-chunks option
|
|
456
|
-
to encode allows you to try out these options on small subsets, hopefully
|
|
457
|
-
arriving at settings with the desired balance of compression and query
|
|
458
|
-
performance.
|
|
459
|
-
|
|
460
|
-
ADVANCED USAGE
|
|
461
|
-
|
|
462
|
-
For very large datasets (terabyte scale) it may be necessary to distribute the
|
|
463
|
-
explode and encode steps across a cluster:
|
|
464
|
-
|
|
465
|
-
\b
|
|
466
|
-
$ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
|
|
467
|
-
$ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
|
|
468
|
-
$ vcf2zarr dexplode-finalise [ICF_PATH]
|
|
469
|
-
|
|
470
|
-
See the online documentation at [FIXME] for more details on distributed explode.
|
|
503
|
+
See the online documentation at https://sgkit-dev.github.io/bio2zarr/
|
|
504
|
+
for more information.
|
|
471
505
|
"""
|
|
472
506
|
|
|
473
507
|
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
vcf2zarr.add_command(dencode_finalise)
|
|
508
|
+
vcf2zarr_main.add_command(convert_vcf)
|
|
509
|
+
vcf2zarr_main.add_command(inspect)
|
|
510
|
+
vcf2zarr_main.add_command(explode)
|
|
511
|
+
vcf2zarr_main.add_command(mkschema)
|
|
512
|
+
vcf2zarr_main.add_command(encode)
|
|
513
|
+
vcf2zarr_main.add_command(dexplode_init)
|
|
514
|
+
vcf2zarr_main.add_command(dexplode_partition)
|
|
515
|
+
vcf2zarr_main.add_command(dexplode_finalise)
|
|
516
|
+
vcf2zarr_main.add_command(dencode_init)
|
|
517
|
+
vcf2zarr_main.add_command(dencode_partition)
|
|
518
|
+
vcf2zarr_main.add_command(dencode_finalise)
|
|
486
519
|
|
|
487
520
|
|
|
488
521
|
@click.command(name="convert")
|
|
489
522
|
@click.argument("in_path", type=click.Path())
|
|
490
523
|
@click.argument("zarr_path", type=click.Path())
|
|
491
524
|
@worker_processes
|
|
525
|
+
@progress
|
|
492
526
|
@verbose
|
|
493
527
|
@variants_chunk_size
|
|
494
528
|
@samples_chunk_size
|
|
@@ -497,6 +531,7 @@ def convert_plink(
|
|
|
497
531
|
zarr_path,
|
|
498
532
|
verbose,
|
|
499
533
|
worker_processes,
|
|
534
|
+
progress,
|
|
500
535
|
variants_chunk_size,
|
|
501
536
|
samples_chunk_size,
|
|
502
537
|
):
|
|
@@ -507,7 +542,7 @@ def convert_plink(
|
|
|
507
542
|
plink.convert(
|
|
508
543
|
in_path,
|
|
509
544
|
zarr_path,
|
|
510
|
-
show_progress=
|
|
545
|
+
show_progress=progress,
|
|
511
546
|
worker_processes=worker_processes,
|
|
512
547
|
samples_chunk_size=samples_chunk_size,
|
|
513
548
|
variants_chunk_size=variants_chunk_size,
|
|
@@ -525,11 +560,39 @@ plink2zarr.add_command(convert_plink)
|
|
|
525
560
|
|
|
526
561
|
@click.command
|
|
527
562
|
@version
|
|
528
|
-
@click.argument("vcf_path", type=click.Path())
|
|
529
|
-
@
|
|
530
|
-
@
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
563
|
+
@click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
|
|
564
|
+
@verbose
|
|
565
|
+
@num_partitions
|
|
566
|
+
@click.option(
|
|
567
|
+
"-s",
|
|
568
|
+
"--partition-size",
|
|
569
|
+
type=str,
|
|
570
|
+
default=None,
|
|
571
|
+
help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
|
|
572
|
+
)
|
|
573
|
+
def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
|
|
574
|
+
"""
|
|
575
|
+
Output bcftools region strings that partition an indexed VCF/BCF file
|
|
576
|
+
into either an approximate number of parts (-n), or parts of approximately
|
|
577
|
+
a given size (-s). One of -n or -s must be supplied.
|
|
578
|
+
|
|
579
|
+
Note that both the number of partitions and sizes are a target, and the
|
|
580
|
+
returned number of partitions may not exactly correspond. In particular,
|
|
581
|
+
there is a maximum level of granularity determined by the associated index
|
|
582
|
+
which cannot be exceeded.
|
|
583
|
+
|
|
584
|
+
Note also that the partitions returned may vary considerably in the number
|
|
585
|
+
of records that they contain.
|
|
586
|
+
"""
|
|
587
|
+
setup_logging(verbose)
|
|
588
|
+
if num_partitions is None and partition_size is None:
|
|
589
|
+
raise click.UsageError(
|
|
590
|
+
"Either --num-partitions or --partition-size must be specified"
|
|
591
|
+
)
|
|
592
|
+
|
|
593
|
+
indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
|
|
594
|
+
regions = indexed_vcf.partition_into_regions(
|
|
595
|
+
num_parts=num_partitions, target_part_size=partition_size
|
|
596
|
+
)
|
|
597
|
+
for region in regions:
|
|
598
|
+
click.echo(f"{region}\t{vcf_path}")
|
bio2zarr/constants.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
INT_MISSING = -1
|
|
4
|
+
INT_FILL = -2
|
|
5
|
+
STR_MISSING = "."
|
|
6
|
+
STR_FILL = ""
|
|
7
|
+
|
|
8
|
+
FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
|
|
9
|
+
np.float32
|
|
10
|
+
)
|
|
11
|
+
FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
|
|
12
|
+
[0x7F800001, 0x7F800002], dtype=np.int32
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
MIN_INT_VALUE = np.iinfo(np.int32).min + 2
|
|
17
|
+
VCF_INT_MISSING = np.iinfo(np.int32).min
|
|
18
|
+
VCF_INT_FILL = np.iinfo(np.int32).min + 1
|