bio2zarr 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__main__.py +2 -2
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +87 -95
- bio2zarr/constants.py +18 -0
- bio2zarr/core.py +34 -16
- bio2zarr/vcf2zarr/__init__.py +38 -0
- bio2zarr/vcf2zarr/icf.py +1220 -0
- bio2zarr/vcf2zarr/vcz.py +1017 -0
- bio2zarr/vcf2zarr/verification.py +230 -0
- bio2zarr/vcf_utils.py +1 -1
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.0.10.dist-info}/METADATA +9 -122
- bio2zarr-0.0.10.dist-info/RECORD +20 -0
- bio2zarr-0.0.10.dist-info/entry_points.txt +3 -0
- bio2zarr/vcf.py +0 -2445
- bio2zarr-0.0.9.dist-info/RECORD +0 -16
- bio2zarr-0.0.9.dist-info/entry_points.txt +0 -4
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.0.10.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.0.10.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.0.10.dist-info}/top_level.txt +0 -0
bio2zarr/__main__.py
CHANGED
|
@@ -14,9 +14,9 @@ def bio2zarr():
|
|
|
14
14
|
# install individual commands as console scripts. However, this
|
|
15
15
|
# is handy for development and for those whose PATHs aren't set
|
|
16
16
|
# up in the right way.
|
|
17
|
-
bio2zarr.add_command(cli.
|
|
17
|
+
bio2zarr.add_command(cli.vcf2zarr_main)
|
|
18
18
|
bio2zarr.add_command(cli.plink2zarr)
|
|
19
|
-
bio2zarr.add_command(cli.
|
|
19
|
+
bio2zarr.add_command(cli.vcfpartition)
|
|
20
20
|
|
|
21
21
|
if __name__ == "__main__":
|
|
22
22
|
bio2zarr()
|
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
|
@@ -5,11 +5,11 @@ import shutil
|
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
7
|
import coloredlogs
|
|
8
|
-
import humanfriendly
|
|
9
8
|
import numcodecs
|
|
10
9
|
import tabulate
|
|
11
10
|
|
|
12
|
-
from . import plink, provenance,
|
|
11
|
+
from . import plink, provenance, vcf2zarr, vcf_utils
|
|
12
|
+
from .vcf2zarr import icf as icf_mod
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
15
15
|
|
|
@@ -58,6 +58,20 @@ force = click.option(
|
|
|
58
58
|
help="Force overwriting of existing directories",
|
|
59
59
|
)
|
|
60
60
|
|
|
61
|
+
one_based = click.option(
|
|
62
|
+
"--one-based",
|
|
63
|
+
is_flag=True,
|
|
64
|
+
flag_value=True,
|
|
65
|
+
help="Partition indexes are interpreted as one-based",
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
json = click.option(
|
|
69
|
+
"--json",
|
|
70
|
+
is_flag=True,
|
|
71
|
+
flag_value=True,
|
|
72
|
+
help="Output summary data in JSON format",
|
|
73
|
+
)
|
|
74
|
+
|
|
61
75
|
version = click.version_option(version=f"{provenance.__version__}")
|
|
62
76
|
|
|
63
77
|
worker_processes = click.option(
|
|
@@ -154,11 +168,21 @@ def check_overwrite_dir(path, force):
|
|
|
154
168
|
def get_compressor(cname):
|
|
155
169
|
if cname is None:
|
|
156
170
|
return None
|
|
157
|
-
config =
|
|
171
|
+
config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
|
|
158
172
|
config["cname"] = cname
|
|
159
173
|
return numcodecs.get_codec(config)
|
|
160
174
|
|
|
161
175
|
|
|
176
|
+
def show_work_summary(work_summary, json):
|
|
177
|
+
if json:
|
|
178
|
+
output = work_summary.asjson()
|
|
179
|
+
else:
|
|
180
|
+
data = work_summary.asdict()
|
|
181
|
+
output = tabulate.tabulate(list(data.items()), tablefmt="plain")
|
|
182
|
+
# output = "\n".join(f"{k}\t{v}" for k, v in data.items())
|
|
183
|
+
click.echo(output)
|
|
184
|
+
|
|
185
|
+
|
|
162
186
|
@click.command
|
|
163
187
|
@vcfs
|
|
164
188
|
@new_icf_path
|
|
@@ -175,7 +199,7 @@ def explode(
|
|
|
175
199
|
"""
|
|
176
200
|
setup_logging(verbose)
|
|
177
201
|
check_overwrite_dir(icf_path, force)
|
|
178
|
-
|
|
202
|
+
vcf2zarr.explode(
|
|
179
203
|
icf_path,
|
|
180
204
|
vcfs,
|
|
181
205
|
worker_processes=worker_processes,
|
|
@@ -192,6 +216,7 @@ def explode(
|
|
|
192
216
|
@force
|
|
193
217
|
@column_chunk_size
|
|
194
218
|
@compressor
|
|
219
|
+
@json
|
|
195
220
|
@verbose
|
|
196
221
|
@worker_processes
|
|
197
222
|
def dexplode_init(
|
|
@@ -201,6 +226,7 @@ def dexplode_init(
|
|
|
201
226
|
force,
|
|
202
227
|
column_chunk_size,
|
|
203
228
|
compressor,
|
|
229
|
+
json,
|
|
204
230
|
verbose,
|
|
205
231
|
worker_processes,
|
|
206
232
|
):
|
|
@@ -210,7 +236,7 @@ def dexplode_init(
|
|
|
210
236
|
"""
|
|
211
237
|
setup_logging(verbose)
|
|
212
238
|
check_overwrite_dir(icf_path, force)
|
|
213
|
-
|
|
239
|
+
work_summary = vcf2zarr.explode_init(
|
|
214
240
|
icf_path,
|
|
215
241
|
vcfs,
|
|
216
242
|
target_num_partitions=num_partitions,
|
|
@@ -219,21 +245,26 @@ def dexplode_init(
|
|
|
219
245
|
compressor=get_compressor(compressor),
|
|
220
246
|
show_progress=True,
|
|
221
247
|
)
|
|
222
|
-
|
|
248
|
+
show_work_summary(work_summary, json)
|
|
223
249
|
|
|
224
250
|
|
|
225
251
|
@click.command
|
|
226
252
|
@icf_path
|
|
227
253
|
@partition
|
|
228
254
|
@verbose
|
|
229
|
-
|
|
255
|
+
@one_based
|
|
256
|
+
def dexplode_partition(icf_path, partition, verbose, one_based):
|
|
230
257
|
"""
|
|
231
|
-
Convert a VCF partition to intermediate columnar format. Must be called
|
|
232
|
-
the ICF path has been initialised with dexplode_init.
|
|
233
|
-
from 0
|
|
258
|
+
Convert a VCF partition to intermediate columnar format. Must be called
|
|
259
|
+
after the ICF path has been initialised with dexplode_init. By default,
|
|
260
|
+
partition indexes are from 0 to the number of partitions N (returned by
|
|
261
|
+
dexplode_init), exclusive. If the --one-based option is specified,
|
|
262
|
+
partition indexes are in the range 1 to N, inclusive.
|
|
234
263
|
"""
|
|
235
264
|
setup_logging(verbose)
|
|
236
|
-
|
|
265
|
+
if one_based:
|
|
266
|
+
partition -= 1
|
|
267
|
+
vcf2zarr.explode_partition(icf_path, partition)
|
|
237
268
|
|
|
238
269
|
|
|
239
270
|
@click.command
|
|
@@ -244,7 +275,7 @@ def dexplode_finalise(icf_path, verbose):
|
|
|
244
275
|
Final step for distributed conversion of VCF(s) to intermediate columnar format.
|
|
245
276
|
"""
|
|
246
277
|
setup_logging(verbose)
|
|
247
|
-
|
|
278
|
+
vcf2zarr.explode_finalise(icf_path)
|
|
248
279
|
|
|
249
280
|
|
|
250
281
|
@click.command
|
|
@@ -255,7 +286,7 @@ def inspect(path, verbose):
|
|
|
255
286
|
Inspect an intermediate columnar format or Zarr path.
|
|
256
287
|
"""
|
|
257
288
|
setup_logging(verbose)
|
|
258
|
-
data =
|
|
289
|
+
data = vcf2zarr.inspect(path)
|
|
259
290
|
click.echo(tabulate.tabulate(data, headers="keys"))
|
|
260
291
|
|
|
261
292
|
|
|
@@ -266,7 +297,7 @@ def mkschema(icf_path):
|
|
|
266
297
|
Generate a schema for zarr encoding
|
|
267
298
|
"""
|
|
268
299
|
stream = click.get_text_stream("stdout")
|
|
269
|
-
|
|
300
|
+
vcf2zarr.mkschema(icf_path, stream)
|
|
270
301
|
|
|
271
302
|
|
|
272
303
|
@click.command
|
|
@@ -297,7 +328,7 @@ def encode(
|
|
|
297
328
|
"""
|
|
298
329
|
setup_logging(verbose)
|
|
299
330
|
check_overwrite_dir(zarr_path, force)
|
|
300
|
-
|
|
331
|
+
vcf2zarr.encode(
|
|
301
332
|
icf_path,
|
|
302
333
|
zarr_path,
|
|
303
334
|
schema_path=schema,
|
|
@@ -319,6 +350,7 @@ def encode(
|
|
|
319
350
|
@variants_chunk_size
|
|
320
351
|
@samples_chunk_size
|
|
321
352
|
@max_variant_chunks
|
|
353
|
+
@json
|
|
322
354
|
@verbose
|
|
323
355
|
def dencode_init(
|
|
324
356
|
icf_path,
|
|
@@ -329,6 +361,7 @@ def dencode_init(
|
|
|
329
361
|
variants_chunk_size,
|
|
330
362
|
samples_chunk_size,
|
|
331
363
|
max_variant_chunks,
|
|
364
|
+
json,
|
|
332
365
|
verbose,
|
|
333
366
|
):
|
|
334
367
|
"""
|
|
@@ -346,7 +379,7 @@ def dencode_init(
|
|
|
346
379
|
"""
|
|
347
380
|
setup_logging(verbose)
|
|
348
381
|
check_overwrite_dir(zarr_path, force)
|
|
349
|
-
|
|
382
|
+
work_summary = vcf2zarr.encode_init(
|
|
350
383
|
icf_path,
|
|
351
384
|
zarr_path,
|
|
352
385
|
target_num_partitions=num_partitions,
|
|
@@ -356,30 +389,25 @@ def dencode_init(
|
|
|
356
389
|
max_variant_chunks=max_variant_chunks,
|
|
357
390
|
show_progress=True,
|
|
358
391
|
)
|
|
359
|
-
|
|
360
|
-
# NOTE adding the size to the stdout here so that users can parse it
|
|
361
|
-
# and use in their submission scripts. This is a first pass, and
|
|
362
|
-
# will most likely change as we see what works and doesn't.
|
|
363
|
-
# NOTE we probably want to format this as a table, which lists
|
|
364
|
-
# some other properties, line by line
|
|
365
|
-
# NOTE This size number is also not quite enough, you need a bit of
|
|
366
|
-
# headroom with it (probably 10% or so). We should include this.
|
|
367
|
-
click.echo(f"{num_partitions}\t{formatted_size}")
|
|
392
|
+
show_work_summary(work_summary, json)
|
|
368
393
|
|
|
369
394
|
|
|
370
395
|
@click.command
|
|
371
396
|
@zarr_path
|
|
372
397
|
@partition
|
|
373
398
|
@verbose
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
Convert a partition from intermediate columnar format to VCF Zarr.
|
|
377
|
-
Must be called *after* the Zarr path has been initialised with dencode_init.
|
|
378
|
-
Partition indexes must be from 0 (inclusive) to the number of paritions
|
|
379
|
-
returned by dencode_init (exclusive).
|
|
399
|
+
@one_based
|
|
400
|
+
def dencode_partition(zarr_path, partition, verbose, one_based):
|
|
380
401
|
"""
|
|
402
|
+
Convert a partition from intermediate columnar format to VCF Zarr. Must be
|
|
403
|
+
called after the Zarr path has been initialised with dencode_init. By
|
|
404
|
+
default, partition indexes are from 0 to the number of partitions N
|
|
405
|
+
(returned by dencode_init), exclusive. If the --one-based option is
|
|
406
|
+
specified, partition indexes are in the range 1 to N, inclusive."""
|
|
381
407
|
setup_logging(verbose)
|
|
382
|
-
|
|
408
|
+
if one_based:
|
|
409
|
+
partition -= 1
|
|
410
|
+
vcf2zarr.encode_partition(zarr_path, partition)
|
|
383
411
|
|
|
384
412
|
|
|
385
413
|
@click.command
|
|
@@ -390,24 +418,32 @@ def dencode_finalise(zarr_path, verbose):
|
|
|
390
418
|
Final step for distributed conversion of ICF to VCF Zarr.
|
|
391
419
|
"""
|
|
392
420
|
setup_logging(verbose)
|
|
393
|
-
|
|
421
|
+
vcf2zarr.encode_finalise(zarr_path, show_progress=True)
|
|
394
422
|
|
|
395
423
|
|
|
396
424
|
@click.command(name="convert")
|
|
397
425
|
@vcfs
|
|
398
426
|
@new_zarr_path
|
|
427
|
+
@force
|
|
399
428
|
@variants_chunk_size
|
|
400
429
|
@samples_chunk_size
|
|
401
430
|
@verbose
|
|
402
431
|
@worker_processes
|
|
403
432
|
def convert_vcf(
|
|
404
|
-
vcfs,
|
|
433
|
+
vcfs,
|
|
434
|
+
zarr_path,
|
|
435
|
+
force,
|
|
436
|
+
variants_chunk_size,
|
|
437
|
+
samples_chunk_size,
|
|
438
|
+
verbose,
|
|
439
|
+
worker_processes,
|
|
405
440
|
):
|
|
406
441
|
"""
|
|
407
442
|
Convert input VCF(s) directly to vcfzarr (not recommended for large files).
|
|
408
443
|
"""
|
|
409
444
|
setup_logging(verbose)
|
|
410
|
-
|
|
445
|
+
check_overwrite_dir(zarr_path, force)
|
|
446
|
+
vcf2zarr.convert(
|
|
411
447
|
vcfs,
|
|
412
448
|
zarr_path,
|
|
413
449
|
variants_chunk_size=variants_chunk_size,
|
|
@@ -418,71 +454,27 @@ def convert_vcf(
|
|
|
418
454
|
|
|
419
455
|
|
|
420
456
|
@version
|
|
421
|
-
@click.group(cls=NaturalOrderGroup)
|
|
422
|
-
def
|
|
457
|
+
@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
|
|
458
|
+
def vcf2zarr_main():
|
|
423
459
|
"""
|
|
424
460
|
Convert VCF file(s) to the vcfzarr format.
|
|
425
461
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
$ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
|
|
429
|
-
|
|
430
|
-
This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
|
|
431
|
-
step. As this writes the intermediate columnar format to a temporary directory,
|
|
432
|
-
we only recommend this approach for small files (< 1GB, say).
|
|
433
|
-
|
|
434
|
-
The recommended approach is to run the conversion in two passes, and
|
|
435
|
-
to keep the intermediate columnar format ("exploded") around to facilitate
|
|
436
|
-
experimentation with chunk sizes and compression settings:
|
|
437
|
-
|
|
438
|
-
\b
|
|
439
|
-
$ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
|
|
440
|
-
$ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
|
|
441
|
-
|
|
442
|
-
The inspect command provides a way to view contents of an exploded ICF
|
|
443
|
-
or Zarr:
|
|
444
|
-
|
|
445
|
-
$ vcf2zarr inspect [PATH]
|
|
446
|
-
|
|
447
|
-
This is useful when tweaking chunk sizes and compression settings to suit
|
|
448
|
-
your dataset, using the mkschema command and --schema option to encode:
|
|
449
|
-
|
|
450
|
-
\b
|
|
451
|
-
$ vcf2zarr mkschema [ICF_PATH] > schema.json
|
|
452
|
-
$ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
|
|
453
|
-
|
|
454
|
-
By editing the schema.json file you can drop columns that are not of interest
|
|
455
|
-
and edit column specific compression settings. The --max-variant-chunks option
|
|
456
|
-
to encode allows you to try out these options on small subsets, hopefully
|
|
457
|
-
arriving at settings with the desired balance of compression and query
|
|
458
|
-
performance.
|
|
459
|
-
|
|
460
|
-
ADVANCED USAGE
|
|
461
|
-
|
|
462
|
-
For very large datasets (terabyte scale) it may be necessary to distribute the
|
|
463
|
-
explode and encode steps across a cluster:
|
|
464
|
-
|
|
465
|
-
\b
|
|
466
|
-
$ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
|
|
467
|
-
$ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
|
|
468
|
-
$ vcf2zarr dexplode-finalise [ICF_PATH]
|
|
469
|
-
|
|
470
|
-
See the online documentation at [FIXME] for more details on distributed explode.
|
|
462
|
+
See the online documentation at https://sgkit-dev.github.io/bio2zarr/
|
|
463
|
+
for more information.
|
|
471
464
|
"""
|
|
472
465
|
|
|
473
466
|
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
vcf2zarr.add_command(dencode_finalise)
|
|
467
|
+
vcf2zarr_main.add_command(convert_vcf)
|
|
468
|
+
vcf2zarr_main.add_command(inspect)
|
|
469
|
+
vcf2zarr_main.add_command(explode)
|
|
470
|
+
vcf2zarr_main.add_command(mkschema)
|
|
471
|
+
vcf2zarr_main.add_command(encode)
|
|
472
|
+
vcf2zarr_main.add_command(dexplode_init)
|
|
473
|
+
vcf2zarr_main.add_command(dexplode_partition)
|
|
474
|
+
vcf2zarr_main.add_command(dexplode_finalise)
|
|
475
|
+
vcf2zarr_main.add_command(dencode_init)
|
|
476
|
+
vcf2zarr_main.add_command(dencode_partition)
|
|
477
|
+
vcf2zarr_main.add_command(dencode_finalise)
|
|
486
478
|
|
|
487
479
|
|
|
488
480
|
@click.command(name="convert")
|
|
@@ -529,7 +521,7 @@ plink2zarr.add_command(convert_plink)
|
|
|
529
521
|
@click.option("-i", "--index", type=click.Path(), default=None)
|
|
530
522
|
@click.option("-n", "--num-parts", type=int, default=None)
|
|
531
523
|
# @click.option("-s", "--part-size", type=int, default=None)
|
|
532
|
-
def
|
|
524
|
+
def vcfpartition(vcf_path, index, num_parts):
|
|
533
525
|
indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
|
|
534
526
|
regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
|
|
535
527
|
click.echo("\n".join(map(str, regions)))
|
bio2zarr/constants.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
INT_MISSING = -1
|
|
4
|
+
INT_FILL = -2
|
|
5
|
+
STR_MISSING = "."
|
|
6
|
+
STR_FILL = ""
|
|
7
|
+
|
|
8
|
+
FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
|
|
9
|
+
np.float32
|
|
10
|
+
)
|
|
11
|
+
FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
|
|
12
|
+
[0x7F800001, 0x7F800002], dtype=np.int32
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
MIN_INT_VALUE = np.iinfo(np.int32).min + 2
|
|
17
|
+
VCF_INT_MISSING = np.iinfo(np.int32).min
|
|
18
|
+
VCF_INT_FILL = np.iinfo(np.int32).min + 1
|
bio2zarr/core.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
import concurrent.futures as cf
|
|
2
2
|
import contextlib
|
|
3
3
|
import dataclasses
|
|
4
|
+
import json
|
|
4
5
|
import logging
|
|
6
|
+
import math
|
|
5
7
|
import multiprocessing
|
|
6
8
|
import os
|
|
7
9
|
import os.path
|
|
8
10
|
import threading
|
|
9
11
|
import time
|
|
10
12
|
|
|
13
|
+
import humanfriendly
|
|
11
14
|
import numcodecs
|
|
12
15
|
import numpy as np
|
|
13
16
|
import tqdm
|
|
@@ -18,6 +21,17 @@ logger = logging.getLogger(__name__)
|
|
|
18
21
|
numcodecs.blosc.use_threads = False
|
|
19
22
|
|
|
20
23
|
|
|
24
|
+
def display_number(x):
|
|
25
|
+
ret = "n/a"
|
|
26
|
+
if math.isfinite(x):
|
|
27
|
+
ret = f"{x: 0.2g}"
|
|
28
|
+
return ret
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def display_size(n):
|
|
32
|
+
return humanfriendly.format_size(n, binary=True)
|
|
33
|
+
|
|
34
|
+
|
|
21
35
|
def min_int_dtype(min_value, max_value):
|
|
22
36
|
if min_value > max_value:
|
|
23
37
|
raise ValueError("min_value must be <= max_value")
|
|
@@ -128,7 +142,6 @@ class BufferedArray:
|
|
|
128
142
|
sync_flush_2d_array(
|
|
129
143
|
self.buff[: self.buffer_row], self.array, self.array_offset
|
|
130
144
|
)
|
|
131
|
-
# FIXME the array.name doesn't seem to be working here for some reason
|
|
132
145
|
logger.debug(
|
|
133
146
|
f"Flushed <{self.array.name} {self.array.shape} "
|
|
134
147
|
f"{self.array.dtype}> "
|
|
@@ -174,7 +187,7 @@ class ProgressConfig:
|
|
|
174
187
|
# progressable thing happening per source process. This is
|
|
175
188
|
# probably fine in practise, but there could be corner cases
|
|
176
189
|
# where it's not. Something to watch out for.
|
|
177
|
-
_progress_counter =
|
|
190
|
+
_progress_counter = None
|
|
178
191
|
|
|
179
192
|
|
|
180
193
|
def update_progress(inc):
|
|
@@ -188,23 +201,30 @@ def get_progress():
|
|
|
188
201
|
return val
|
|
189
202
|
|
|
190
203
|
|
|
191
|
-
def
|
|
192
|
-
|
|
193
|
-
|
|
204
|
+
def setup_progress_counter(counter):
|
|
205
|
+
global _progress_counter
|
|
206
|
+
_progress_counter = counter
|
|
194
207
|
|
|
195
208
|
|
|
196
209
|
class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
197
210
|
def __init__(self, worker_processes=1, progress_config=None):
|
|
211
|
+
# Need to specify this explicitly to support Macs and
|
|
212
|
+
# for future proofing.
|
|
213
|
+
ctx = multiprocessing.get_context("spawn")
|
|
214
|
+
global _progress_counter
|
|
215
|
+
_progress_counter = ctx.Value("Q", 0)
|
|
198
216
|
if worker_processes <= 0:
|
|
199
217
|
# NOTE: this is only for testing, not for production use!
|
|
200
218
|
self.executor = SynchronousExecutor()
|
|
201
219
|
else:
|
|
202
220
|
self.executor = cf.ProcessPoolExecutor(
|
|
203
221
|
max_workers=worker_processes,
|
|
222
|
+
mp_context=ctx,
|
|
223
|
+
initializer=setup_progress_counter,
|
|
224
|
+
initargs=(_progress_counter,),
|
|
204
225
|
)
|
|
205
226
|
self.futures = set()
|
|
206
227
|
|
|
207
|
-
set_progress(0)
|
|
208
228
|
if progress_config is None:
|
|
209
229
|
progress_config = ProgressConfig()
|
|
210
230
|
self.progress_config = progress_config
|
|
@@ -245,16 +265,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
245
265
|
self.futures.add(future)
|
|
246
266
|
return future
|
|
247
267
|
|
|
248
|
-
def wait_for_completed(self, timeout=None):
|
|
249
|
-
done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
|
|
250
|
-
for future in done:
|
|
251
|
-
exception = future.exception()
|
|
252
|
-
# TODO do the check for BrokenProcessPool here
|
|
253
|
-
if exception is not None:
|
|
254
|
-
raise exception
|
|
255
|
-
self.futures = not_done
|
|
256
|
-
return done
|
|
257
|
-
|
|
258
268
|
def results_as_completed(self):
|
|
259
269
|
for future in cf.as_completed(self.futures):
|
|
260
270
|
yield future.result()
|
|
@@ -278,3 +288,11 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
278
288
|
self._update_progress()
|
|
279
289
|
self.progress_bar.close()
|
|
280
290
|
return False
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
class JsonDataclass:
|
|
294
|
+
def asdict(self):
|
|
295
|
+
return dataclasses.asdict(self)
|
|
296
|
+
|
|
297
|
+
def asjson(self):
|
|
298
|
+
return json.dumps(self.asdict(), indent=4)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from .icf import (
|
|
2
|
+
IntermediateColumnarFormat,
|
|
3
|
+
explode,
|
|
4
|
+
explode_finalise,
|
|
5
|
+
explode_init,
|
|
6
|
+
explode_partition,
|
|
7
|
+
)
|
|
8
|
+
from .vcz import (
|
|
9
|
+
VcfZarrSchema,
|
|
10
|
+
convert,
|
|
11
|
+
encode,
|
|
12
|
+
encode_finalise,
|
|
13
|
+
encode_init,
|
|
14
|
+
encode_partition,
|
|
15
|
+
inspect,
|
|
16
|
+
mkschema,
|
|
17
|
+
)
|
|
18
|
+
from .verification import verify
|
|
19
|
+
|
|
20
|
+
# NOTE some of these aren't intended to be part of the external
|
|
21
|
+
# interface (like IntermediateColumnarFormat), but putting
|
|
22
|
+
# them into the list to keep the lint nagging under control
|
|
23
|
+
__all__ = [
|
|
24
|
+
"IntermediateColumnarFormat",
|
|
25
|
+
"explode",
|
|
26
|
+
"explode_finalise",
|
|
27
|
+
"explode_init",
|
|
28
|
+
"explode_partition",
|
|
29
|
+
"VcfZarrSchema",
|
|
30
|
+
"convert",
|
|
31
|
+
"encode",
|
|
32
|
+
"encode_finalise",
|
|
33
|
+
"encode_init",
|
|
34
|
+
"encode_partition",
|
|
35
|
+
"inspect",
|
|
36
|
+
"mkschema",
|
|
37
|
+
"verify",
|
|
38
|
+
]
|